diff options
| author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2009-03-25 14:39:37 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2009-06-18 08:46:11 -0400 |
| commit | 43a21ea81a2400992561146327c4785ce7f7be38 (patch) | |
| tree | d4974c0ff9d7f40291515c5c0cf7e0d51abccb66 /kernel | |
| parent | d3a9262e59f7fb83c6d44df3b2b1460ed57d3ea1 (diff) | |
perf_counter: Add event overflow handling
Alternative method of mmap() data output handling that provides
better overflow management and a more reliable data stream.
Unlike the previous method, which had no user->kernel feedback
and relied on userspace keeping up, this method relies on
userspace writing its last read position into the control page.
It will ensure new output doesn't overwrite not-yet-read events;
new events for which there is no space left are lost and the
overflow counter is incremented, providing exact event loss
numbers.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/perf_counter.c | 185 |
1 files changed, 130 insertions, 55 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 109a95723859..7e9108efd305 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c | |||
| @@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1794 | struct perf_mmap_data *data; | 1794 | struct perf_mmap_data *data; |
| 1795 | int ret = VM_FAULT_SIGBUS; | 1795 | int ret = VM_FAULT_SIGBUS; |
| 1796 | 1796 | ||
| 1797 | if (vmf->flags & FAULT_FLAG_MKWRITE) { | ||
| 1798 | if (vmf->pgoff == 0) | ||
| 1799 | ret = 0; | ||
| 1800 | return ret; | ||
| 1801 | } | ||
| 1802 | |||
| 1797 | rcu_read_lock(); | 1803 | rcu_read_lock(); |
| 1798 | data = rcu_dereference(counter->data); | 1804 | data = rcu_dereference(counter->data); |
| 1799 | if (!data) | 1805 | if (!data) |
| @@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1807 | if ((unsigned)nr > data->nr_pages) | 1813 | if ((unsigned)nr > data->nr_pages) |
| 1808 | goto unlock; | 1814 | goto unlock; |
| 1809 | 1815 | ||
| 1816 | if (vmf->flags & FAULT_FLAG_WRITE) | ||
| 1817 | goto unlock; | ||
| 1818 | |||
| 1810 | vmf->page = virt_to_page(data->data_pages[nr]); | 1819 | vmf->page = virt_to_page(data->data_pages[nr]); |
| 1811 | } | 1820 | } |
| 1821 | |||
| 1812 | get_page(vmf->page); | 1822 | get_page(vmf->page); |
| 1823 | vmf->page->mapping = vma->vm_file->f_mapping; | ||
| 1824 | vmf->page->index = vmf->pgoff; | ||
| 1825 | |||
| 1813 | ret = 0; | 1826 | ret = 0; |
| 1814 | unlock: | 1827 | unlock: |
| 1815 | rcu_read_unlock(); | 1828 | rcu_read_unlock(); |
| @@ -1862,6 +1875,14 @@ fail: | |||
| 1862 | return -ENOMEM; | 1875 | return -ENOMEM; |
| 1863 | } | 1876 | } |
| 1864 | 1877 | ||
| 1878 | static void perf_mmap_free_page(unsigned long addr) | ||
| 1879 | { | ||
| 1880 | struct page *page = virt_to_page(addr); | ||
| 1881 | |||
| 1882 | page->mapping = NULL; | ||
| 1883 | __free_page(page); | ||
| 1884 | } | ||
| 1885 | |||
| 1865 | static void __perf_mmap_data_free(struct rcu_head *rcu_head) | 1886 | static void __perf_mmap_data_free(struct rcu_head *rcu_head) |
| 1866 | { | 1887 | { |
| 1867 | struct perf_mmap_data *data; | 1888 | struct perf_mmap_data *data; |
| @@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head) | |||
| 1869 | 1890 | ||
| 1870 | data = container_of(rcu_head, struct perf_mmap_data, rcu_head); | 1891 | data = container_of(rcu_head, struct perf_mmap_data, rcu_head); |
| 1871 | 1892 | ||
| 1872 | free_page((unsigned long)data->user_page); | 1893 | perf_mmap_free_page((unsigned long)data->user_page); |
| 1873 | for (i = 0; i < data->nr_pages; i++) | 1894 | for (i = 0; i < data->nr_pages; i++) |
| 1874 | free_page((unsigned long)data->data_pages[i]); | 1895 | perf_mmap_free_page((unsigned long)data->data_pages[i]); |
| 1896 | |||
| 1875 | kfree(data); | 1897 | kfree(data); |
| 1876 | } | 1898 | } |
| 1877 | 1899 | ||
| @@ -1908,9 +1930,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 1908 | } | 1930 | } |
| 1909 | 1931 | ||
| 1910 | static struct vm_operations_struct perf_mmap_vmops = { | 1932 | static struct vm_operations_struct perf_mmap_vmops = { |
| 1911 | .open = perf_mmap_open, | 1933 | .open = perf_mmap_open, |
| 1912 | .close = perf_mmap_close, | 1934 | .close = perf_mmap_close, |
| 1913 | .fault = perf_mmap_fault, | 1935 | .fault = perf_mmap_fault, |
| 1936 | .page_mkwrite = perf_mmap_fault, | ||
| 1914 | }; | 1937 | }; |
| 1915 | 1938 | ||
| 1916 | static int perf_mmap(struct file *file, struct vm_area_struct *vma) | 1939 | static int perf_mmap(struct file *file, struct vm_area_struct *vma) |
| @@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 1924 | long user_extra, extra; | 1947 | long user_extra, extra; |
| 1925 | int ret = 0; | 1948 | int ret = 0; |
| 1926 | 1949 | ||
| 1927 | if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) | 1950 | if (!(vma->vm_flags & VM_SHARED)) |
| 1928 | return -EINVAL; | 1951 | return -EINVAL; |
| 1929 | 1952 | ||
| 1930 | vma_size = vma->vm_end - vma->vm_start; | 1953 | vma_size = vma->vm_end - vma->vm_start; |
| @@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 1983 | atomic_long_add(user_extra, &user->locked_vm); | 2006 | atomic_long_add(user_extra, &user->locked_vm); |
| 1984 | vma->vm_mm->locked_vm += extra; | 2007 | vma->vm_mm->locked_vm += extra; |
| 1985 | counter->data->nr_locked = extra; | 2008 | counter->data->nr_locked = extra; |
| 2009 | if (vma->vm_flags & VM_WRITE) | ||
| 2010 | counter->data->writable = 1; | ||
| 2011 | |||
| 1986 | unlock: | 2012 | unlock: |
| 1987 | mutex_unlock(&counter->mmap_mutex); | 2013 | mutex_unlock(&counter->mmap_mutex); |
| 1988 | 2014 | ||
| 1989 | vma->vm_flags &= ~VM_MAYWRITE; | ||
| 1990 | vma->vm_flags |= VM_RESERVED; | 2015 | vma->vm_flags |= VM_RESERVED; |
| 1991 | vma->vm_ops = &perf_mmap_vmops; | 2016 | vma->vm_ops = &perf_mmap_vmops; |
| 1992 | 2017 | ||
| @@ -2163,11 +2188,38 @@ struct perf_output_handle { | |||
| 2163 | unsigned long head; | 2188 | unsigned long head; |
| 2164 | unsigned long offset; | 2189 | unsigned long offset; |
| 2165 | int nmi; | 2190 | int nmi; |
| 2166 | int overflow; | 2191 | int sample; |
| 2167 | int locked; | 2192 | int locked; |
| 2168 | unsigned long flags; | 2193 | unsigned long flags; |
| 2169 | }; | 2194 | }; |
| 2170 | 2195 | ||
| 2196 | static bool perf_output_space(struct perf_mmap_data *data, | ||
| 2197 | unsigned int offset, unsigned int head) | ||
| 2198 | { | ||
| 2199 | unsigned long tail; | ||
| 2200 | unsigned long mask; | ||
| 2201 | |||
| 2202 | if (!data->writable) | ||
| 2203 | return true; | ||
| 2204 | |||
| 2205 | mask = (data->nr_pages << PAGE_SHIFT) - 1; | ||
| 2206 | /* | ||
| 2207 | * Userspace could choose to issue a mb() before updating the tail | ||
| 2208 | * pointer. So that all reads will be completed before the write is | ||
| 2209 | * issued. | ||
| 2210 | */ | ||
| 2211 | tail = ACCESS_ONCE(data->user_page->data_tail); | ||
| 2212 | smp_rmb(); | ||
| 2213 | |||
| 2214 | offset = (offset - tail) & mask; | ||
| 2215 | head = (head - tail) & mask; | ||
| 2216 | |||
| 2217 | if ((int)(head - offset) < 0) | ||
| 2218 | return false; | ||
| 2219 | |||
| 2220 | return true; | ||
| 2221 | } | ||
| 2222 | |||
| 2171 | static void perf_output_wakeup(struct perf_output_handle *handle) | 2223 | static void perf_output_wakeup(struct perf_output_handle *handle) |
| 2172 | { | 2224 | { |
| 2173 | atomic_set(&handle->data->poll, POLL_IN); | 2225 | atomic_set(&handle->data->poll, POLL_IN); |
| @@ -2258,12 +2310,57 @@ out: | |||
| 2258 | local_irq_restore(handle->flags); | 2310 | local_irq_restore(handle->flags); |
| 2259 | } | 2311 | } |
| 2260 | 2312 | ||
| 2313 | static void perf_output_copy(struct perf_output_handle *handle, | ||
| 2314 | const void *buf, unsigned int len) | ||
| 2315 | { | ||
| 2316 | unsigned int pages_mask; | ||
| 2317 | unsigned int offset; | ||
| 2318 | unsigned int size; | ||
| 2319 | void **pages; | ||
| 2320 | |||
| 2321 | offset = handle->offset; | ||
| 2322 | pages_mask = handle->data->nr_pages - 1; | ||
| 2323 | pages = handle->data->data_pages; | ||
| 2324 | |||
| 2325 | do { | ||
| 2326 | unsigned int page_offset; | ||
| 2327 | int nr; | ||
| 2328 | |||
| 2329 | nr = (offset >> PAGE_SHIFT) & pages_mask; | ||
| 2330 | page_offset = offset & (PAGE_SIZE - 1); | ||
| 2331 | size = min_t(unsigned int, PAGE_SIZE - page_offset, len); | ||
| 2332 | |||
| 2333 | memcpy(pages[nr] + page_offset, buf, size); | ||
| 2334 | |||
| 2335 | len -= size; | ||
| 2336 | buf += size; | ||
| 2337 | offset += size; | ||
| 2338 | } while (len); | ||
| 2339 | |||
| 2340 | handle->offset = offset; | ||
| 2341 | |||
| 2342 | /* | ||
| 2343 | * Check we didn't copy past our reservation window, taking the | ||
| 2344 | * possible unsigned int wrap into account. | ||
| 2345 | */ | ||
| 2346 | WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); | ||
| 2347 | } | ||
| 2348 | |||
| 2349 | #define perf_output_put(handle, x) \ | ||
| 2350 | perf_output_copy((handle), &(x), sizeof(x)) | ||
| 2351 | |||
| 2261 | static int perf_output_begin(struct perf_output_handle *handle, | 2352 | static int perf_output_begin(struct perf_output_handle *handle, |
| 2262 | struct perf_counter *counter, unsigned int size, | 2353 | struct perf_counter *counter, unsigned int size, |
| 2263 | int nmi, int overflow) | 2354 | int nmi, int sample) |
| 2264 | { | 2355 | { |
| 2265 | struct perf_mmap_data *data; | 2356 | struct perf_mmap_data *data; |
| 2266 | unsigned int offset, head; | 2357 | unsigned int offset, head; |
| 2358 | int have_lost; | ||
| 2359 | struct { | ||
| 2360 | struct perf_event_header header; | ||
| 2361 | u64 id; | ||
| 2362 | u64 lost; | ||
| 2363 | } lost_event; | ||
| 2267 | 2364 | ||
| 2268 | /* | 2365 | /* |
| 2269 | * For inherited counters we send all the output towards the parent. | 2366 | * For inherited counters we send all the output towards the parent. |
| @@ -2276,19 +2373,25 @@ static int perf_output_begin(struct perf_output_handle *handle, | |||
| 2276 | if (!data) | 2373 | if (!data) |
| 2277 | goto out; | 2374 | goto out; |
| 2278 | 2375 | ||
| 2279 | handle->data = data; | 2376 | handle->data = data; |
| 2280 | handle->counter = counter; | 2377 | handle->counter = counter; |
| 2281 | handle->nmi = nmi; | 2378 | handle->nmi = nmi; |
| 2282 | handle->overflow = overflow; | 2379 | handle->sample = sample; |
| 2283 | 2380 | ||
| 2284 | if (!data->nr_pages) | 2381 | if (!data->nr_pages) |
| 2285 | goto fail; | 2382 | goto fail; |
| 2286 | 2383 | ||
| 2384 | have_lost = atomic_read(&data->lost); | ||
| 2385 | if (have_lost) | ||
| 2386 | size += sizeof(lost_event); | ||
| 2387 | |||
| 2287 | perf_output_lock(handle); | 2388 | perf_output_lock(handle); |
| 2288 | 2389 | ||
| 2289 | do { | 2390 | do { |
| 2290 | offset = head = atomic_long_read(&data->head); | 2391 | offset = head = atomic_long_read(&data->head); |
| 2291 | head += size; | 2392 | head += size; |
| 2393 | if (unlikely(!perf_output_space(data, offset, head))) | ||
| 2394 | goto fail; | ||
| 2292 | } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); | 2395 | } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); |
| 2293 | 2396 | ||
| 2294 | handle->offset = offset; | 2397 | handle->offset = offset; |
| @@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle, | |||
| 2297 | if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) | 2400 | if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) |
| 2298 | atomic_set(&data->wakeup, 1); | 2401 | atomic_set(&data->wakeup, 1); |
| 2299 | 2402 | ||
| 2403 | if (have_lost) { | ||
| 2404 | lost_event.header.type = PERF_EVENT_LOST; | ||
| 2405 | lost_event.header.misc = 0; | ||
| 2406 | lost_event.header.size = sizeof(lost_event); | ||
| 2407 | lost_event.id = counter->id; | ||
| 2408 | lost_event.lost = atomic_xchg(&data->lost, 0); | ||
| 2409 | |||
| 2410 | perf_output_put(handle, lost_event); | ||
| 2411 | } | ||
| 2412 | |||
| 2300 | return 0; | 2413 | return 0; |
| 2301 | 2414 | ||
| 2302 | fail: | 2415 | fail: |
| 2303 | perf_output_wakeup(handle); | 2416 | atomic_inc(&data->lost); |
| 2417 | perf_output_unlock(handle); | ||
| 2304 | out: | 2418 | out: |
| 2305 | rcu_read_unlock(); | 2419 | rcu_read_unlock(); |
| 2306 | 2420 | ||
| 2307 | return -ENOSPC; | 2421 | return -ENOSPC; |
| 2308 | } | 2422 | } |
| 2309 | 2423 | ||
| 2310 | static void perf_output_copy(struct perf_output_handle *handle, | ||
| 2311 | const void *buf, unsigned int len) | ||
| 2312 | { | ||
| 2313 | unsigned int pages_mask; | ||
| 2314 | unsigned int offset; | ||
| 2315 | unsigned int size; | ||
| 2316 | void **pages; | ||
| 2317 | |||
| 2318 | offset = handle->offset; | ||
| 2319 | pages_mask = handle->data->nr_pages - 1; | ||
| 2320 | pages = handle->data->data_pages; | ||
| 2321 | |||
| 2322 | do { | ||
| 2323 | unsigned int page_offset; | ||
| 2324 | int nr; | ||
| 2325 | |||
| 2326 | nr = (offset >> PAGE_SHIFT) & pages_mask; | ||
| 2327 | page_offset = offset & (PAGE_SIZE - 1); | ||
| 2328 | size = min_t(unsigned int, PAGE_SIZE - page_offset, len); | ||
| 2329 | |||
| 2330 | memcpy(pages[nr] + page_offset, buf, size); | ||
| 2331 | |||
| 2332 | len -= size; | ||
| 2333 | buf += size; | ||
| 2334 | offset += size; | ||
| 2335 | } while (len); | ||
| 2336 | |||
| 2337 | handle->offset = offset; | ||
| 2338 | |||
| 2339 | /* | ||
| 2340 | * Check we didn't copy past our reservation window, taking the | ||
| 2341 | * possible unsigned int wrap into account. | ||
| 2342 | */ | ||
| 2343 | WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); | ||
| 2344 | } | ||
| 2345 | |||
| 2346 | #define perf_output_put(handle, x) \ | ||
| 2347 | perf_output_copy((handle), &(x), sizeof(x)) | ||
| 2348 | |||
| 2349 | static void perf_output_end(struct perf_output_handle *handle) | 2424 | static void perf_output_end(struct perf_output_handle *handle) |
| 2350 | { | 2425 | { |
| 2351 | struct perf_counter *counter = handle->counter; | 2426 | struct perf_counter *counter = handle->counter; |
| @@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle) | |||
| 2353 | 2428 | ||
| 2354 | int wakeup_events = counter->attr.wakeup_events; | 2429 | int wakeup_events = counter->attr.wakeup_events; |
| 2355 | 2430 | ||
| 2356 | if (handle->overflow && wakeup_events) { | 2431 | if (handle->sample && wakeup_events) { |
| 2357 | int events = atomic_inc_return(&data->events); | 2432 | int events = atomic_inc_return(&data->events); |
| 2358 | if (events >= wakeup_events) { | 2433 | if (events >= wakeup_events) { |
| 2359 | atomic_sub(wakeup_events, &data->events); | 2434 | atomic_sub(wakeup_events, &data->events); |
| @@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) | |||
| 2958 | } | 3033 | } |
| 2959 | 3034 | ||
| 2960 | /* | 3035 | /* |
| 2961 | * Generic counter overflow handling. | 3036 | * Generic counter overflow handling, sampling. |
| 2962 | */ | 3037 | */ |
| 2963 | 3038 | ||
| 2964 | int perf_counter_overflow(struct perf_counter *counter, int nmi, | 3039 | int perf_counter_overflow(struct perf_counter *counter, int nmi, |
