author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-03-25 14:39:37 -0400
committer	Ingo Molnar <mingo@elte.hu>			2009-06-18 08:46:11 -0400
commit		43a21ea81a2400992561146327c4785ce7f7be38 (patch)
tree		d4974c0ff9d7f40291515c5c0cf7e0d51abccb66
parent		d3a9262e59f7fb83c6d44df3b2b1460ed57d3ea1 (diff)
perf_counter: Add event overflow handling
Add an alternative method of mmap() data output handling that provides
better overflow management and a more reliable data stream.

Unlike the previous method, which had no user->kernel feedback and
relied on userspace keeping up, this method relies on userspace writing
its last read position into the control page.

The kernel then ensures that new output does not overwrite events that
have not yet been read; new events for which no space is left are
dropped and the lost counter is incremented, providing an exact count
of lost events.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
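For illustration, here is a minimal sketch of the userspace side of the
data_head/data_tail protocol described above. It is not part of the patch:
the function and struct names are made up, the control-page words are passed
as plain pointers rather than via struct perf_counter_mmap_page, and records
are assumed never to wrap around the end of the buffer, which a real consumer
must handle.

/*
 * Illustrative consumer loop for the data_head/data_tail protocol.
 * Sketch only: names are hypothetical, wrap-around of records across
 * the buffer end is ignored.
 */
#include <stdint.h>

struct event_header {			/* mirrors struct perf_event_header */
	uint32_t type;
	uint16_t misc;
	uint16_t size;
};

/*
 * head/tail point at the data_head/data_tail words in the mapped
 * control page; data is the start of the data pages; mask is the
 * data area size minus one (the size is a power of two).
 */
static void drain_events(volatile uint64_t *head, volatile uint64_t *tail,
			 char *data, uint64_t mask)
{
	uint64_t h = *head;
	__sync_synchronize();		/* rmb(): read head before the data */

	uint64_t t = *tail;		/* only userspace writes this word */

	while (t < h) {
		struct event_header *hdr =
			(struct event_header *)(data + (t & mask));

		if (!hdr->size)		/* malformed record: stop parsing */
			break;

		/* ... decode PERF_EVENT_* records here ... */

		t += hdr->size;
	}

	__sync_synchronize();		/* mb(): finish reads before moving tail */
	*tail = t;			/* kernel may now reuse the consumed space */
}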
-rw-r--r--	include/linux/perf_counter.h	 40
-rw-r--r--	kernel/perf_counter.c		185
2 files changed, 158 insertions, 67 deletions
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a7d3a61a59b7..0765e8e69843 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -236,10 +236,16 @@ struct perf_counter_mmap_page {
 	/*
 	 * Control data for the mmap() data buffer.
 	 *
-	 * User-space reading this value should issue an rmb(), on SMP capable
-	 * platforms, after reading this value -- see perf_counter_wakeup().
+	 * User-space reading the @data_head value should issue an rmb(), on
+	 * SMP capable platforms, after reading this value -- see
+	 * perf_counter_wakeup().
+	 *
+	 * When the mapping is PROT_WRITE the @data_tail value should be
+	 * written by userspace to reflect the last read data. In this case
+	 * the kernel will not over-write unread data.
 	 */
 	__u64	data_head;		/* head in the data section */
+	__u64	data_tail;		/* user-space written tail */
 };
 
 #define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
@@ -275,6 +281,15 @@ enum perf_event_type {
 
 	/*
 	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				id;
+	 *	u64				lost;
+	 * };
+	 */
+	PERF_EVENT_LOST		= 2,
+
+	/*
+	 * struct {
 	 *	struct perf_event_header	header;
 	 *
 	 *	u32				pid, tid;
@@ -313,26 +328,26 @@ enum perf_event_type {
 
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
-	 * will be PERF_RECORD_*
+	 * will be PERF_SAMPLE_*
 	 *
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *
-	 *	{ u64		ip;	  } && PERF_RECORD_IP
-	 *	{ u32		pid, tid; } && PERF_RECORD_TID
-	 *	{ u64		time;     } && PERF_RECORD_TIME
-	 *	{ u64		addr;     } && PERF_RECORD_ADDR
-	 *	{ u64		config;   } && PERF_RECORD_CONFIG
-	 *	{ u32		cpu, res; } && PERF_RECORD_CPU
+	 *	{ u64		ip;	  } && PERF_SAMPLE_IP
+	 *	{ u32		pid, tid; } && PERF_SAMPLE_TID
+	 *	{ u64		time;     } && PERF_SAMPLE_TIME
+	 *	{ u64		addr;     } && PERF_SAMPLE_ADDR
+	 *	{ u64		config;   } && PERF_SAMPLE_CONFIG
+	 *	{ u32		cpu, res; } && PERF_SAMPLE_CPU
 	 *
 	 *	{ u64		nr;
-	 *	  { u64 id, val; } cnt[nr];  } && PERF_RECORD_GROUP
+	 *	  { u64 id, val; } cnt[nr];  } && PERF_SAMPLE_GROUP
 	 *
 	 *	{ u16		nr,
 	 *			hv,
 	 *			kernel,
 	 *			user;
-	 *	  u64		ips[nr];  } && PERF_RECORD_CALLCHAIN
+	 *	  u64		ips[nr];  } && PERF_SAMPLE_CALLCHAIN
 	 * };
 	 */
 };
@@ -424,6 +439,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head		rcu_head;
 	int			nr_pages;	/* nr of data pages  */
+	int			writable;	/* are we writable   */
 	int			nr_locked;	/* nr pages mlocked  */
 
 	atomic_t		poll;		/* POLL_ for wakeups */
@@ -433,8 +449,8 @@ struct perf_mmap_data {
 	atomic_long_t		done_head;	/* completed head    */
 
 	atomic_t		lock;		/* concurrent writes */
-
 	atomic_t		wakeup;		/* needs a wakeup    */
+	atomic_t		lost;		/* nr records lost   */
 
 	struct perf_counter_mmap_page	*user_page;
 	void			*data_pages[0];
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 109a95723859..7e9108efd305 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct perf_mmap_data *data;
 	int ret = VM_FAULT_SIGBUS;
 
+	if (vmf->flags & FAULT_FLAG_MKWRITE) {
+		if (vmf->pgoff == 0)
+			ret = 0;
+		return ret;
+	}
+
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (!data)
@@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		if ((unsigned)nr > data->nr_pages)
 			goto unlock;
 
+		if (vmf->flags & FAULT_FLAG_WRITE)
+			goto unlock;
+
 		vmf->page = virt_to_page(data->data_pages[nr]);
 	}
+
 	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index   = vmf->pgoff;
+
 	ret = 0;
 unlock:
 	rcu_read_unlock();
@@ -1862,6 +1875,14 @@ fail:
 	return -ENOMEM;
 }
 
+static void perf_mmap_free_page(unsigned long addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 {
 	struct perf_mmap_data *data;
@@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 
 	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
 
-	free_page((unsigned long)data->user_page);
+	perf_mmap_free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
-		free_page((unsigned long)data->data_pages[i]);
+		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
 	kfree(data);
 }
 
@@ -1908,9 +1930,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
 	.close		= perf_mmap_close,
 	.fault		= perf_mmap_fault,
+	.page_mkwrite	= perf_mmap_fault,
 };
 
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
-	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
@@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
 	counter->data->nr_locked = extra;
+	if (vma->vm_flags & VM_WRITE)
+		counter->data->writable = 1;
+
 unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
-	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -2163,11 +2188,38 @@ struct perf_output_handle {
 	unsigned long		head;
 	unsigned long		offset;
 	int			nmi;
-	int			overflow;
+	int			sample;
 	int			locked;
 	unsigned long		flags;
 };
 
+static bool perf_output_space(struct perf_mmap_data *data,
+			      unsigned int offset, unsigned int head)
+{
+	unsigned long tail;
+	unsigned long mask;
+
+	if (!data->writable)
+		return true;
+
+	mask = (data->nr_pages << PAGE_SHIFT) - 1;
+	/*
+	 * Userspace could choose to issue a mb() before updating the tail
+	 * pointer. So that all reads will be completed before the write is
+	 * issued.
+	 */
+	tail = ACCESS_ONCE(data->user_page->data_tail);
+	smp_rmb();
+
+	offset = (offset - tail) & mask;
+	head   = (head   - tail) & mask;
+
+	if ((int)(head - offset) < 0)
+		return false;
+
+	return true;
+}
+
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->data->poll, POLL_IN);
@@ -2258,12 +2310,57 @@ out:
 	local_irq_restore(handle->flags);
 }
 
+static void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len)
+{
+	unsigned int pages_mask;
+	unsigned int offset;
+	unsigned int size;
+	void **pages;
+
+	offset		= handle->offset;
+	pages_mask	= handle->data->nr_pages - 1;
+	pages		= handle->data->data_pages;
+
+	do {
+		unsigned int page_offset;
+		int nr;
+
+		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
+		page_offset = offset & (PAGE_SIZE - 1);
+		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+		memcpy(pages[nr] + page_offset, buf, size);
+
+		len	-= size;
+		buf	+= size;
+		offset	+= size;
+	} while (len);
+
+	handle->offset = offset;
+
+	/*
+	 * Check we didn't copy past our reservation window, taking the
+	 * possible unsigned int wrap into account.
+	 */
+	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int overflow)
+			     int nmi, int sample)
 {
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
+	int have_lost;
+	struct {
+		struct perf_event_header header;
+		u64			 id;
+		u64			 lost;
+	} lost_event;
 
 	/*
 	 * For inherited counters we send all the output towards the parent.
@@ -2276,19 +2373,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
 	handle->data	= data;
 	handle->counter	= counter;
 	handle->nmi	= nmi;
-	handle->overflow = overflow;
+	handle->sample	= sample;
 
 	if (!data->nr_pages)
 		goto fail;
 
+	have_lost = atomic_read(&data->lost);
+	if (have_lost)
+		size += sizeof(lost_event);
+
 	perf_output_lock(handle);
 
 	do {
 		offset = head = atomic_long_read(&data->head);
 		head += size;
+		if (unlikely(!perf_output_space(data, offset, head)))
+			goto fail;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset	= offset;
@@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
 		atomic_set(&data->wakeup, 1);
 
+	if (have_lost) {
+		lost_event.header.type = PERF_EVENT_LOST;
+		lost_event.header.misc = 0;
+		lost_event.header.size = sizeof(lost_event);
+		lost_event.id          = counter->id;
+		lost_event.lost        = atomic_xchg(&data->lost, 0);
+
+		perf_output_put(handle, lost_event);
+	}
+
 	return 0;
 
 fail:
-	perf_output_wakeup(handle);
+	atomic_inc(&data->lost);
+	perf_output_unlock(handle);
 out:
 	rcu_read_unlock();
 
 	return -ENOSPC;
 }
 
-static void perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len)
-{
-	unsigned int pages_mask;
-	unsigned int offset;
-	unsigned int size;
-	void **pages;
-
-	offset		= handle->offset;
-	pages_mask	= handle->data->nr_pages - 1;
-	pages		= handle->data->data_pages;
-
-	do {
-		unsigned int page_offset;
-		int nr;
-
-		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
-		page_offset = offset & (PAGE_SIZE - 1);
-		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
-
-		memcpy(pages[nr] + page_offset, buf, size);
-
-		len	-= size;
-		buf	+= size;
-		offset	+= size;
-	} while (len);
-
-	handle->offset = offset;
-
-	/*
-	 * Check we didn't copy past our reservation window, taking the
-	 * possible unsigned int wrap into account.
-	 */
-	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
-}
-
-#define perf_output_put(handle, x) \
-	perf_output_copy((handle), &(x), sizeof(x))
-
 static void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
@@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 
 	int wakeup_events = counter->attr.wakeup_events;
 
-	if (handle->overflow && wakeup_events) {
+	if (handle->sample && wakeup_events) {
 		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &data->events);
@@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 }
 
 /*
- * Generic counter overflow handling.
+ * Generic counter overflow handling, sampling.
 */
 
 int perf_counter_overflow(struct perf_counter *counter, int nmi,
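As a usage note (again a sketch, not from the patch): with a PROT_WRITE
mapping, a consumer that falls behind will start seeing the new
PERF_EVENT_LOST records in the stream and can account for dropped events
from the id/lost pair they carry. The decode function and struct names below
are hypothetical; only PERF_EVENT_LOST and the record layout come from the
header comment added above.

/*
 * Hypothetical decode step for the PERF_EVENT_LOST record added by this
 * patch. The layout mirrors the comment in perf_counter.h: a
 * perf_event_header followed by the counter id and the number of lost
 * records.
 */
#include <stdint.h>
#include <stdio.h>

#define PERF_EVENT_LOST	2		/* from enum perf_event_type */

struct event_header {			/* mirrors struct perf_event_header */
	uint32_t type;
	uint16_t misc;
	uint16_t size;
};

struct lost_record {
	struct event_header header;
	uint64_t id;			/* which counter dropped events  */
	uint64_t lost;			/* how many records were dropped */
};

static void handle_record(const struct event_header *hdr)
{
	if (hdr->type == PERF_EVENT_LOST) {
		const struct lost_record *rec = (const struct lost_record *)hdr;

		fprintf(stderr, "counter %llu lost %llu records\n",
			(unsigned long long)rec->id,
			(unsigned long long)rec->lost);
		return;
	}
	/* other PERF_EVENT_* record types elided */
}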