aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2009-09-21 10:08:49 -0400
committerIngo Molnar <mingo@elte.hu>2009-10-06 08:21:50 -0400
commit906010b2134e14a2e377decbadd357b3d0ab9c6a (patch)
tree598b30d08f5ca8df1e00abc295b120fa1bd2c2e2
parente13dbd7d75d1ecc315c6e3071b3c4e8fba4f6bec (diff)
perf_event: Provide vmalloc() based mmap() backing
Some architectures such as Sparc, ARM and MIPS (basically everything with flush_dcache_page()) need to deal with dcache aliases by carefully placing pages in both kernel and user maps. These architectures typically have to use vmalloc_user() for this. However, on other architectures, vmalloc() is not needed and has the downsides of being more restricted and slower than regular allocations. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: David Miller <davem@davemloft.net> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Jens Axboe <jens.axboe@oracle.com> Cc: Paul Mackerras <paulus@samba.org> LKML-Reference: <1254830228.21044.272.camel@laptop> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--arch/sparc/Kconfig2
-rw-r--r--include/linux/perf_event.h5
-rw-r--r--init/Kconfig18
-rw-r--r--kernel/perf_event.c248
-rw-r--r--tools/perf/design.txt3
5 files changed, 214 insertions, 62 deletions
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 97fca4695e0b..9b70a2f28dc7 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,6 +26,7 @@ config SPARC
26 select RTC_CLASS 26 select RTC_CLASS
27 select RTC_DRV_M48T59 27 select RTC_DRV_M48T59
28 select HAVE_PERF_EVENTS 28 select HAVE_PERF_EVENTS
29 select PERF_USE_VMALLOC
29 select HAVE_DMA_ATTRS 30 select HAVE_DMA_ATTRS
30 select HAVE_DMA_API_DEBUG 31 select HAVE_DMA_API_DEBUG
31 32
@@ -48,6 +49,7 @@ config SPARC64
48 select RTC_DRV_SUN4V 49 select RTC_DRV_SUN4V
49 select RTC_DRV_STARFIRE 50 select RTC_DRV_STARFIRE
50 select HAVE_PERF_EVENTS 51 select HAVE_PERF_EVENTS
52 select PERF_USE_VMALLOC
51 53
52config ARCH_DEFCONFIG 54config ARCH_DEFCONFIG
53 string 55 string
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3a9d36d1e92a..2e6d95f97419 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -442,6 +442,7 @@ enum perf_callchain_context {
442#include <linux/hrtimer.h> 442#include <linux/hrtimer.h>
443#include <linux/fs.h> 443#include <linux/fs.h>
444#include <linux/pid_namespace.h> 444#include <linux/pid_namespace.h>
445#include <linux/workqueue.h>
445#include <asm/atomic.h> 446#include <asm/atomic.h>
446 447
447#define PERF_MAX_STACK_DEPTH 255 448#define PERF_MAX_STACK_DEPTH 255
@@ -513,6 +514,10 @@ struct file;
513 514
514struct perf_mmap_data { 515struct perf_mmap_data {
515 struct rcu_head rcu_head; 516 struct rcu_head rcu_head;
517#ifdef CONFIG_PERF_USE_VMALLOC
518 struct work_struct work;
519#endif
520 int data_order;
516 int nr_pages; /* nr of data pages */ 521 int nr_pages; /* nr of data pages */
517 int writable; /* are we writable */ 522 int writable; /* are we writable */
518 int nr_locked; /* nr pages mlocked */ 523 int nr_locked; /* nr pages mlocked */
diff --git a/init/Kconfig b/init/Kconfig
index c7bac39d6c61..09c5c6431f42 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -921,6 +921,11 @@ config HAVE_PERF_EVENTS
921 help 921 help
922 See tools/perf/design.txt for details. 922 See tools/perf/design.txt for details.
923 923
924config PERF_USE_VMALLOC
925 bool
926 help
927 See tools/perf/design.txt for details
928
924menu "Kernel Performance Events And Counters" 929menu "Kernel Performance Events And Counters"
925 930
926config PERF_EVENTS 931config PERF_EVENTS
@@ -976,6 +981,19 @@ config PERF_COUNTERS
976 981
977 Say N if unsure. 982 Say N if unsure.
978 983
984config DEBUG_PERF_USE_VMALLOC
985 default n
986 bool "Debug: use vmalloc to back perf mmap() buffers"
987 depends on PERF_EVENTS && DEBUG_KERNEL
988 select PERF_USE_VMALLOC
989 help
990 Use vmalloc memory to back perf mmap() buffers.
991
992 Mostly useful for debugging the vmalloc code on platforms
993 that don't require it.
994
995 Say N if unsure.
996
979endmenu 997endmenu
980 998
981config VM_EVENT_COUNTERS 999config VM_EVENT_COUNTERS
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e491fb087939..9d0b5c665883 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -20,6 +20,7 @@
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/vmalloc.h>
23#include <linux/hardirq.h> 24#include <linux/hardirq.h>
24#include <linux/rculist.h> 25#include <linux/rculist.h>
25#include <linux/uaccess.h> 26#include <linux/uaccess.h>
@@ -2091,49 +2092,31 @@ unlock:
2091 rcu_read_unlock(); 2092 rcu_read_unlock();
2092} 2093}
2093 2094
2094static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2095static unsigned long perf_data_size(struct perf_mmap_data *data)
2095{ 2096{
2096 struct perf_event *event = vma->vm_file->private_data; 2097 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2097 struct perf_mmap_data *data; 2098}
2098 int ret = VM_FAULT_SIGBUS;
2099
2100 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2101 if (vmf->pgoff == 0)
2102 ret = 0;
2103 return ret;
2104 }
2105
2106 rcu_read_lock();
2107 data = rcu_dereference(event->data);
2108 if (!data)
2109 goto unlock;
2110
2111 if (vmf->pgoff == 0) {
2112 vmf->page = virt_to_page(data->user_page);
2113 } else {
2114 int nr = vmf->pgoff - 1;
2115
2116 if ((unsigned)nr > data->nr_pages)
2117 goto unlock;
2118 2099
2119 if (vmf->flags & FAULT_FLAG_WRITE) 2100#ifndef CONFIG_PERF_USE_VMALLOC
2120 goto unlock;
2121 2101
2122 vmf->page = virt_to_page(data->data_pages[nr]); 2102/*
2123 } 2103 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2104 */
2124 2105
2125 get_page(vmf->page); 2106static struct page *
2126 vmf->page->mapping = vma->vm_file->f_mapping; 2107perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2127 vmf->page->index = vmf->pgoff; 2108{
2109 if (pgoff > data->nr_pages)
2110 return NULL;
2128 2111
2129 ret = 0; 2112 if (pgoff == 0)
2130unlock: 2113 return virt_to_page(data->user_page);
2131 rcu_read_unlock();
2132 2114
2133 return ret; 2115 return virt_to_page(data->data_pages[pgoff - 1]);
2134} 2116}
2135 2117
2136static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2118static struct perf_mmap_data *
2119perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2137{ 2120{
2138 struct perf_mmap_data *data; 2121 struct perf_mmap_data *data;
2139 unsigned long size; 2122 unsigned long size;
@@ -2158,19 +2141,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2158 goto fail_data_pages; 2141 goto fail_data_pages;
2159 } 2142 }
2160 2143
2144 data->data_order = 0;
2161 data->nr_pages = nr_pages; 2145 data->nr_pages = nr_pages;
2162 atomic_set(&data->lock, -1);
2163
2164 if (event->attr.watermark) {
2165 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2166 event->attr.wakeup_watermark);
2167 }
2168 if (!data->watermark)
2169 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2170 2146
2171 rcu_assign_pointer(event->data, data); 2147 return data;
2172
2173 return 0;
2174 2148
2175fail_data_pages: 2149fail_data_pages:
2176 for (i--; i >= 0; i--) 2150 for (i--; i >= 0; i--)
@@ -2182,7 +2156,7 @@ fail_user_page:
2182 kfree(data); 2156 kfree(data);
2183 2157
2184fail: 2158fail:
2185 return -ENOMEM; 2159 return NULL;
2186} 2160}
2187 2161
2188static void perf_mmap_free_page(unsigned long addr) 2162static void perf_mmap_free_page(unsigned long addr)
@@ -2193,28 +2167,169 @@ static void perf_mmap_free_page(unsigned long addr)
2193 __free_page(page); 2167 __free_page(page);
2194} 2168}
2195 2169
2196static void __perf_mmap_data_free(struct rcu_head *rcu_head) 2170static void perf_mmap_data_free(struct perf_mmap_data *data)
2197{ 2171{
2198 struct perf_mmap_data *data;
2199 int i; 2172 int i;
2200 2173
2201 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2202
2203 perf_mmap_free_page((unsigned long)data->user_page); 2174 perf_mmap_free_page((unsigned long)data->user_page);
2204 for (i = 0; i < data->nr_pages; i++) 2175 for (i = 0; i < data->nr_pages; i++)
2205 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2176 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2177}
2178
2179#else
2180
2181/*
2182 * Back perf_mmap() with vmalloc memory.
2183 *
2184 * Required for architectures that have d-cache aliasing issues.
2185 */
2186
2187static struct page *
2188perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2189{
2190 if (pgoff > (1UL << data->data_order))
2191 return NULL;
2192
2193 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2194}
2195
2196static void perf_mmap_unmark_page(void *addr)
2197{
2198 struct page *page = vmalloc_to_page(addr);
2199
2200 page->mapping = NULL;
2201}
2202
2203static void perf_mmap_data_free_work(struct work_struct *work)
2204{
2205 struct perf_mmap_data *data;
2206 void *base;
2207 int i, nr;
2208
2209 data = container_of(work, struct perf_mmap_data, work);
2210 nr = 1 << data->data_order;
2211
2212 base = data->user_page;
2213 for (i = 0; i < nr + 1; i++)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215
2216 vfree(base);
2217}
2218
2219static void perf_mmap_data_free(struct perf_mmap_data *data)
2220{
2221 schedule_work(&data->work);
2222}
2223
2224static struct perf_mmap_data *
2225perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2226{
2227 struct perf_mmap_data *data;
2228 unsigned long size;
2229 void *all_buf;
2206 2230
2231 WARN_ON(atomic_read(&event->mmap_count));
2232
2233 size = sizeof(struct perf_mmap_data);
2234 size += sizeof(void *);
2235
2236 data = kzalloc(size, GFP_KERNEL);
2237 if (!data)
2238 goto fail;
2239
2240 INIT_WORK(&data->work, perf_mmap_data_free_work);
2241
2242 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2243 if (!all_buf)
2244 goto fail_all_buf;
2245
2246 data->user_page = all_buf;
2247 data->data_pages[0] = all_buf + PAGE_SIZE;
2248 data->data_order = ilog2(nr_pages);
2249 data->nr_pages = 1;
2250
2251 return data;
2252
2253fail_all_buf:
2254 kfree(data);
2255
2256fail:
2257 return NULL;
2258}
2259
2260#endif
2261
2262static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2263{
2264 struct perf_event *event = vma->vm_file->private_data;
2265 struct perf_mmap_data *data;
2266 int ret = VM_FAULT_SIGBUS;
2267
2268 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2269 if (vmf->pgoff == 0)
2270 ret = 0;
2271 return ret;
2272 }
2273
2274 rcu_read_lock();
2275 data = rcu_dereference(event->data);
2276 if (!data)
2277 goto unlock;
2278
2279 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2280 goto unlock;
2281
2282 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2283 if (!vmf->page)
2284 goto unlock;
2285
2286 get_page(vmf->page);
2287 vmf->page->mapping = vma->vm_file->f_mapping;
2288 vmf->page->index = vmf->pgoff;
2289
2290 ret = 0;
2291unlock:
2292 rcu_read_unlock();
2293
2294 return ret;
2295}
2296
2297static void
2298perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2299{
2300 long max_size = perf_data_size(data);
2301
2302 atomic_set(&data->lock, -1);
2303
2304 if (event->attr.watermark) {
2305 data->watermark = min_t(long, max_size,
2306 event->attr.wakeup_watermark);
2307 }
2308
2309 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
2311
2312
2313 rcu_assign_pointer(event->data, data);
2314}
2315
2316static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2317{
2318 struct perf_mmap_data *data;
2319
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data);
2207 kfree(data); 2322 kfree(data);
2208} 2323}
2209 2324
2210static void perf_mmap_data_free(struct perf_event *event) 2325static void perf_mmap_data_release(struct perf_event *event)
2211{ 2326{
2212 struct perf_mmap_data *data = event->data; 2327 struct perf_mmap_data *data = event->data;
2213 2328
2214 WARN_ON(atomic_read(&event->mmap_count)); 2329 WARN_ON(atomic_read(&event->mmap_count));
2215 2330
2216 rcu_assign_pointer(event->data, NULL); 2331 rcu_assign_pointer(event->data, NULL);
2217 call_rcu(&data->rcu_head, __perf_mmap_data_free); 2332 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2218} 2333}
2219 2334
2220static void perf_mmap_open(struct vm_area_struct *vma) 2335static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2230,11 +2345,12 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2230 2345
2231 WARN_ON_ONCE(event->ctx->parent_ctx); 2346 WARN_ON_ONCE(event->ctx->parent_ctx);
2232 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2347 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2348 unsigned long size = perf_data_size(event->data);
2233 struct user_struct *user = current_user(); 2349 struct user_struct *user = current_user();
2234 2350
2235 atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); 2351 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2236 vma->vm_mm->locked_vm -= event->data->nr_locked; 2352 vma->vm_mm->locked_vm -= event->data->nr_locked;
2237 perf_mmap_data_free(event); 2353 perf_mmap_data_release(event);
2238 mutex_unlock(&event->mmap_mutex); 2354 mutex_unlock(&event->mmap_mutex);
2239 } 2355 }
2240} 2356}
@@ -2252,6 +2368,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2252 unsigned long user_locked, user_lock_limit; 2368 unsigned long user_locked, user_lock_limit;
2253 struct user_struct *user = current_user(); 2369 struct user_struct *user = current_user();
2254 unsigned long locked, lock_limit; 2370 unsigned long locked, lock_limit;
2371 struct perf_mmap_data *data;
2255 unsigned long vma_size; 2372 unsigned long vma_size;
2256 unsigned long nr_pages; 2373 unsigned long nr_pages;
2257 long user_extra, extra; 2374 long user_extra, extra;
@@ -2314,10 +2431,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2314 } 2431 }
2315 2432
2316 WARN_ON(event->data); 2433 WARN_ON(event->data);
2317 ret = perf_mmap_data_alloc(event, nr_pages); 2434
2318 if (ret) 2435 data = perf_mmap_data_alloc(event, nr_pages);
2436 ret = -ENOMEM;
2437 if (!data)
2319 goto unlock; 2438 goto unlock;
2320 2439
2440 ret = 0;
2441 perf_mmap_data_init(event, data);
2442
2321 atomic_set(&event->mmap_count, 1); 2443 atomic_set(&event->mmap_count, 1);
2322 atomic_long_add(user_extra, &user->locked_vm); 2444 atomic_long_add(user_extra, &user->locked_vm);
2323 vma->vm_mm->locked_vm += extra; 2445 vma->vm_mm->locked_vm += extra;
@@ -2505,7 +2627,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2505 if (!data->writable) 2627 if (!data->writable)
2506 return true; 2628 return true;
2507 2629
2508 mask = (data->nr_pages << PAGE_SHIFT) - 1; 2630 mask = perf_data_size(data) - 1;
2509 2631
2510 offset = (offset - tail) & mask; 2632 offset = (offset - tail) & mask;
2511 head = (head - tail) & mask; 2633 head = (head - tail) & mask;
@@ -2610,7 +2732,7 @@ void perf_output_copy(struct perf_output_handle *handle,
2610 const void *buf, unsigned int len) 2732 const void *buf, unsigned int len)
2611{ 2733{
2612 unsigned int pages_mask; 2734 unsigned int pages_mask;
2613 unsigned int offset; 2735 unsigned long offset;
2614 unsigned int size; 2736 unsigned int size;
2615 void **pages; 2737 void **pages;
2616 2738
@@ -2619,12 +2741,14 @@ void perf_output_copy(struct perf_output_handle *handle,
2619 pages = handle->data->data_pages; 2741 pages = handle->data->data_pages;
2620 2742
2621 do { 2743 do {
2622 unsigned int page_offset; 2744 unsigned long page_offset;
2745 unsigned long page_size;
2623 int nr; 2746 int nr;
2624 2747
2625 nr = (offset >> PAGE_SHIFT) & pages_mask; 2748 nr = (offset >> PAGE_SHIFT) & pages_mask;
2626 page_offset = offset & (PAGE_SIZE - 1); 2749 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2627 size = min_t(unsigned int, PAGE_SIZE - page_offset, len); 2750 page_offset = offset & (page_size - 1);
2751 size = min_t(unsigned int, page_size - page_offset, len);
2628 2752
2629 memcpy(pages[nr] + page_offset, buf, size); 2753 memcpy(pages[nr] + page_offset, buf, size);
2630 2754
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index f1946d107b10..fdd42a824c98 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -455,3 +455,6 @@ will need at least this:
455 455
456If your architecture does have hardware capabilities, you can override the 456If your architecture does have hardware capabilities, you can override the
457weak stub hw_perf_event_init() to register hardware counters. 457weak stub hw_perf_event_init() to register hardware counters.
458
459Architectures that have d-cache aliassing issues, such as Sparc and ARM,
460should select PERF_USE_VMALLOC in order to avoid these for perf mmap().