author    Peter Zijlstra <a.p.zijlstra@chello.nl>  2009-03-25 14:39:37 -0400
committer Ingo Molnar <mingo@elte.hu>              2009-06-18 08:46:11 -0400
commit    43a21ea81a2400992561146327c4785ce7f7be38 (patch)
tree      d4974c0ff9d7f40291515c5c0cf7e0d51abccb66
parent    d3a9262e59f7fb83c6d44df3b2b1460ed57d3ea1 (diff)
perf_counter: Add event overflow handling
Alternative method of mmap() data output handling that provides better overflow management and a more reliable data stream.

Unlike the previous method, which didn't have any user->kernel feedback and relied on userspace keeping up, this method relies on userspace writing its last read position into the control page.

It will ensure new output doesn't overwrite not-yet-read events; new events for which there is no space left are lost, and the overflow counter is incremented, providing exact event-loss numbers.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
 include/linux/perf_counter.h |  40
 kernel/perf_counter.c        | 185
 2 files changed, 158 insertions(+), 67 deletions(-)
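The new user->kernel feedback works as follows: the kernel publishes its write position in data_head, userspace publishes its read position in data_tail (only meaningful when the buffer is mapped writable), and the kernel refuses to overwrite anything past the published tail, bumping a lost counter instead. Below is a minimal sketch of the consumer side; it is not part of the patch -- drain_ring(), handle_event() and the barrier macros are illustrative assumptions, only the data_head/data_tail roles and the rmb()/mb() ordering follow the control-page comment added in the diff.

/*
 * Illustrative userspace consumer of the data_head/data_tail protocol.
 * Assumes the counter fd was mmap()ed PROT_READ|PROT_WRITE, MAP_SHARED,
 * with one control page followed by 2^n data pages.
 */
#include <stddef.h>
#include <stdint.h>

struct perf_event_header {		/* mirrors include/linux/perf_counter.h */
	uint32_t type;
	uint16_t misc;
	uint16_t size;
};

/* conservative stand-ins for rmb()/mb(); a real consumer may use lighter barriers */
#define rmb()	__sync_synchronize()
#define mb()	__sync_synchronize()

void drain_ring(volatile uint64_t *data_head, volatile uint64_t *data_tail,
		const unsigned char *data, uint64_t data_size,
		void (*handle_event)(const struct perf_event_header *ev))
{
	uint64_t head = *data_head;
	rmb();				/* read data_head before reading any event data */

	uint64_t tail = *data_tail;	/* we are the only writer of data_tail */

	while (tail != head) {
		struct perf_event_header ev;
		size_t i;

		/* copy the header out; a record may wrap around the buffer end */
		for (i = 0; i < sizeof(ev); i++)
			((unsigned char *)&ev)[i] = data[(tail + i) % data_size];

		handle_event(&ev);	/* payload parsing omitted for brevity */
		tail += ev.size;
	}

	mb();				/* finish all reads before releasing the space */
	*data_tail = tail;		/* tell the kernel how far we have read */
}

With this in place, a slow consumer no longer gets silently overwritten data; instead the kernel emits a PERF_EVENT_LOST record carrying the exact number of lost records, as added in perf_output_begin() below.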
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a7d3a61a59b7..0765e8e69843 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -236,10 +236,16 @@ struct perf_counter_mmap_page {
 	/*
 	 * Control data for the mmap() data buffer.
 	 *
-	 * User-space reading this value should issue an rmb(), on SMP capable
-	 * platforms, after reading this value -- see perf_counter_wakeup().
+	 * User-space reading the @data_head value should issue an rmb(), on
+	 * SMP capable platforms, after reading this value -- see
+	 * perf_counter_wakeup().
+	 *
+	 * When the mapping is PROT_WRITE the @data_tail value should be
+	 * written by userspace to reflect the last read data. In this case
+	 * the kernel will not over-write unread data.
 	 */
 	__u64 data_head;	/* head in the data section */
+	__u64 data_tail;	/* user-space written tail */
 };
 
 #define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0)
@@ -275,6 +281,15 @@ enum perf_event_type {
 
 	/*
 	 * struct {
+	 * struct perf_event_header header;
+	 * u64 id;
+	 * u64 lost;
+	 * };
+	 */
+	PERF_EVENT_LOST = 2,
+
+	/*
+	 * struct {
 	 * struct perf_event_header header;
 	 *
 	 * u32 pid, tid;
@@ -313,26 +328,26 @@ enum perf_event_type {
 
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
-	 * will be PERF_RECORD_*
+	 * will be PERF_SAMPLE_*
 	 *
 	 * struct {
 	 * struct perf_event_header header;
 	 *
-	 * { u64 ip; } && PERF_RECORD_IP
-	 * { u32 pid, tid; } && PERF_RECORD_TID
-	 * { u64 time; } && PERF_RECORD_TIME
-	 * { u64 addr; } && PERF_RECORD_ADDR
-	 * { u64 config; } && PERF_RECORD_CONFIG
-	 * { u32 cpu, res; } && PERF_RECORD_CPU
+	 * { u64 ip; } && PERF_SAMPLE_IP
+	 * { u32 pid, tid; } && PERF_SAMPLE_TID
+	 * { u64 time; } && PERF_SAMPLE_TIME
+	 * { u64 addr; } && PERF_SAMPLE_ADDR
+	 * { u64 config; } && PERF_SAMPLE_CONFIG
+	 * { u32 cpu, res; } && PERF_SAMPLE_CPU
 	 *
 	 * { u64 nr;
-	 * { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP
+	 * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP
 	 *
 	 * { u16 nr,
 	 * hv,
 	 * kernel,
 	 * user;
-	 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
+	 * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
 	 * };
 	 */
 };
@@ -424,6 +439,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head rcu_head;
 	int nr_pages;	/* nr of data pages */
+	int writable;	/* are we writable */
 	int nr_locked;	/* nr pages mlocked */
 
 	atomic_t poll;	/* POLL_ for wakeups */
@@ -433,8 +449,8 @@ struct perf_mmap_data {
 	atomic_long_t done_head;	/* completed head */
 
 	atomic_t lock;	/* concurrent writes */
-
 	atomic_t wakeup;	/* needs a wakeup */
+	atomic_t lost;	/* nr records lost */
 
 	struct perf_counter_mmap_page *user_page;
 	void *data_pages[0];
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 109a95723859..7e9108efd305 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct perf_mmap_data *data;
 	int ret = VM_FAULT_SIGBUS;
 
+	if (vmf->flags & FAULT_FLAG_MKWRITE) {
+		if (vmf->pgoff == 0)
+			ret = 0;
+		return ret;
+	}
+
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (!data)
@@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		if ((unsigned)nr > data->nr_pages)
 			goto unlock;
 
+		if (vmf->flags & FAULT_FLAG_WRITE)
+			goto unlock;
+
 		vmf->page = virt_to_page(data->data_pages[nr]);
 	}
+
 	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index = vmf->pgoff;
+
 	ret = 0;
 unlock:
 	rcu_read_unlock();
@@ -1862,6 +1875,14 @@ fail:
 	return -ENOMEM;
 }
 
+static void perf_mmap_free_page(unsigned long addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 {
 	struct perf_mmap_data *data;
@@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 
 	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
 
-	free_page((unsigned long)data->user_page);
+	perf_mmap_free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
-		free_page((unsigned long)data->data_pages[i]);
+		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
 	kfree(data);
 }
 
@@ -1908,9 +1930,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
 	.open = perf_mmap_open,
 	.close = perf_mmap_close,
 	.fault = perf_mmap_fault,
+	.page_mkwrite = perf_mmap_fault,
 };
 
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
-	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
@@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
 	counter->data->nr_locked = extra;
+	if (vma->vm_flags & VM_WRITE)
+		counter->data->writable = 1;
+
 unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
-	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -2163,11 +2188,38 @@ struct perf_output_handle {
 	unsigned long head;
 	unsigned long offset;
 	int nmi;
-	int overflow;
+	int sample;
 	int locked;
 	unsigned long flags;
 };
 
+static bool perf_output_space(struct perf_mmap_data *data,
+			      unsigned int offset, unsigned int head)
+{
+	unsigned long tail;
+	unsigned long mask;
+
+	if (!data->writable)
+		return true;
+
+	mask = (data->nr_pages << PAGE_SHIFT) - 1;
+	/*
+	 * Userspace could choose to issue a mb() before updating the tail
+	 * pointer. So that all reads will be completed before the write is
+	 * issued.
+	 */
+	tail = ACCESS_ONCE(data->user_page->data_tail);
+	smp_rmb();
+
+	offset = (offset - tail) & mask;
+	head = (head - tail) & mask;
+
+	if ((int)(head - offset) < 0)
+		return false;
+
+	return true;
+}
+
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->data->poll, POLL_IN);
@@ -2258,12 +2310,57 @@ out:
 	local_irq_restore(handle->flags);
 }
 
+static void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len)
+{
+	unsigned int pages_mask;
+	unsigned int offset;
+	unsigned int size;
+	void **pages;
+
+	offset = handle->offset;
+	pages_mask = handle->data->nr_pages - 1;
+	pages = handle->data->data_pages;
+
+	do {
+		unsigned int page_offset;
+		int nr;
+
+		nr = (offset >> PAGE_SHIFT) & pages_mask;
+		page_offset = offset & (PAGE_SIZE - 1);
+		size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+		memcpy(pages[nr] + page_offset, buf, size);
+
+		len -= size;
+		buf += size;
+		offset += size;
+	} while (len);
+
+	handle->offset = offset;
+
+	/*
+	 * Check we didn't copy past our reservation window, taking the
+	 * possible unsigned int wrap into account.
+	 */
+	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int overflow)
+			     int nmi, int sample)
 {
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
+	int have_lost;
+	struct {
+		struct perf_event_header header;
+		u64 id;
+		u64 lost;
+	} lost_event;
 
 	/*
 	 * For inherited counters we send all the output towards the parent.
@@ -2276,19 +2373,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
 	handle->data = data;
 	handle->counter = counter;
 	handle->nmi = nmi;
-	handle->overflow = overflow;
+	handle->sample = sample;
 
 	if (!data->nr_pages)
 		goto fail;
 
+	have_lost = atomic_read(&data->lost);
+	if (have_lost)
+		size += sizeof(lost_event);
+
 	perf_output_lock(handle);
 
 	do {
 		offset = head = atomic_long_read(&data->head);
 		head += size;
+		if (unlikely(!perf_output_space(data, offset, head)))
+			goto fail;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset = offset;
@@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
 		atomic_set(&data->wakeup, 1);
 
+	if (have_lost) {
+		lost_event.header.type = PERF_EVENT_LOST;
+		lost_event.header.misc = 0;
+		lost_event.header.size = sizeof(lost_event);
+		lost_event.id = counter->id;
+		lost_event.lost = atomic_xchg(&data->lost, 0);
+
+		perf_output_put(handle, lost_event);
+	}
+
 	return 0;
 
 fail:
-	perf_output_wakeup(handle);
+	atomic_inc(&data->lost);
+	perf_output_unlock(handle);
 out:
 	rcu_read_unlock();
 
 	return -ENOSPC;
 }
 
-static void perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len)
-{
-	unsigned int pages_mask;
-	unsigned int offset;
-	unsigned int size;
-	void **pages;
-
-	offset = handle->offset;
-	pages_mask = handle->data->nr_pages - 1;
-	pages = handle->data->data_pages;
-
-	do {
-		unsigned int page_offset;
-		int nr;
-
-		nr = (offset >> PAGE_SHIFT) & pages_mask;
-		page_offset = offset & (PAGE_SIZE - 1);
-		size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
-
-		memcpy(pages[nr] + page_offset, buf, size);
-
-		len -= size;
-		buf += size;
-		offset += size;
-	} while (len);
-
-	handle->offset = offset;
-
-	/*
-	 * Check we didn't copy past our reservation window, taking the
-	 * possible unsigned int wrap into account.
-	 */
-	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
-}
-
-#define perf_output_put(handle, x) \
-	perf_output_copy((handle), &(x), sizeof(x))
-
 static void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
@@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 
 	int wakeup_events = counter->attr.wakeup_events;
 
-	if (handle->overflow && wakeup_events) {
+	if (handle->sample && wakeup_events) {
 		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &data->events);
@@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 }
 
 /*
- * Generic counter overflow handling.
+ * Generic counter overflow handling, sampling.
  */
 
 int perf_counter_overflow(struct perf_counter *counter, int nmi,
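The heart of the overflow protection is the modular arithmetic in perf_output_space() above: data->head and the user's data_tail are free-running byte counters, so the current write offset and the proposed new head are both reduced to distances from the tail within the power-of-two buffer before being compared. Below is a standalone restatement of that check, with the struct fields replaced by plain parameters; it is an illustration only, not kernel code.

#include <stdbool.h>
#include <stdint.h>

/* buf_size corresponds to nr_pages << PAGE_SHIFT and must be a power of two */
bool output_space(uint64_t tail, uint64_t offset, uint64_t head,
		  uint64_t buf_size)
{
	uint64_t mask = buf_size - 1;

	/* distances from the reader's published tail, modulo the buffer size */
	offset = (offset - tail) & mask;
	head = (head - tail) & mask;

	/*
	 * If the proposed head ends up closer to the tail than the current
	 * offset, the reservation would overwrite unread data: refuse it.
	 */
	return (int64_t)(head - offset) >= 0;
}

When this check fails, perf_output_begin() jumps to its fail: label, increments data->lost and drops the record instead of corrupting the stream; the count is later reported via a PERF_EVENT_LOST record.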