Diffstat (limited to 'kernel/perf_event.c')
 kernel/perf_event.c | 389
 1 file changed, 217 insertions(+), 172 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..e099650cd249 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2297,11 +2297,6 @@ unlock:
 	rcu_read_unlock();
 }
 
-static unsigned long perf_data_size(struct perf_mmap_data *data)
-{
-	return data->nr_pages << (PAGE_SHIFT + data->data_order);
-}
-
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
@@ -2320,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 	return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
+static void *perf_mmap_alloc_page(int cpu)
+{
+	struct page *page;
+	int node;
+
+	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 static struct perf_mmap_data *
 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
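A side note on the allocation change above: alloc_pages_node() with node -1 means "no placement preference", so unbound (cpu == -1) events keep the old behaviour while per-CPU events get their buffer pages on that CPU's home node. Below is a rough user-space analogue of the idea, sketched with libnuma; alloc_page_for_cpu() is a made-up helper name, not a kernel or perf API.

/*
 * Hedged user-space sketch of the idea behind perf_mmap_alloc_page():
 * place a buffer page on the NUMA node that owns the target CPU, and
 * fall back to "any node" when no CPU is given (cpu == -1).
 * Build with -lnuma.
 */
#include <numa.h>
#include <stdlib.h>
#include <unistd.h>

static void *alloc_page_for_cpu(int cpu)
{
	long page = sysconf(_SC_PAGESIZE);

	if (cpu == -1 || numa_available() < 0)
		return calloc(1, page);		/* no placement preference */

	/* numa_alloc_onnode() returns zero-filled, node-bound memory;
	 * it must be released with numa_free(), unlike the calloc path. */
	return numa_alloc_onnode(page, numa_node_of_cpu(cpu));
}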
@@ -2336,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 	if (!data)
 		goto fail;
 
-	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	data->user_page = perf_mmap_alloc_page(event->cpu);
 	if (!data->user_page)
 		goto fail_user_page;
 
 	for (i = 0; i < nr_pages; i++) {
-		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
 		if (!data->data_pages[i])
 			goto fail_data_pages;
 	}
 
-	data->data_order = 0;
 	data->nr_pages = nr_pages;
 
 	return data;
@@ -2382,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
 	kfree(data);
 }
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+	return 0;
+}
+
 #else
 
 /*
@@ -2390,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
  * Required for architectures that have d-cache aliasing issues.
  */
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+	return data->page_order;
+}
+
 static struct page *
 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 {
-	if (pgoff > (1UL << data->data_order))
+	if (pgoff > (1UL << page_order(data)))
 		return NULL;
 
 	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2413,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
 	int i, nr;
 
 	data = container_of(work, struct perf_mmap_data, work);
-	nr = 1 << data->data_order;
+	nr = 1 << page_order(data);
 
 	base = data->user_page;
 	for (i = 0; i < nr + 1; i++)
@@ -2452,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 
 	data->user_page = all_buf;
 	data->data_pages[0] = all_buf + PAGE_SIZE;
-	data->data_order = ilog2(nr_pages);
+	data->page_order = ilog2(nr_pages);
 	data->nr_pages = 1;
 
 	return data;
@@ -2466,6 +2483,11 @@ fail:
 
 #endif
 
+static unsigned long perf_data_size(struct perf_mmap_data *data)
+{
+	return data->nr_pages << (PAGE_SHIFT + page_order(data));
+}
+
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_event *event = vma->vm_file->private_data;
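For reference, the relocated perf_data_size() now covers both backends: with page-backed buffers page_order() is 0 and the size is simply nr_pages * PAGE_SIZE, while the vmalloc backend stores one chunk of 2^page_order pages. A tiny stand-alone sketch of the arithmetic, with illustrative constants only:

#include <stdio.h>

#define PAGE_SHIFT 12			/* 4 KiB pages, as on x86 */

int main(void)
{
	/* vmalloc case: nr_pages is 1 and page_order covers the whole buffer */
	unsigned long nr_pages = 1, page_order = 3;

	unsigned long bytes = nr_pages << (PAGE_SHIFT + page_order);
	printf("%lu bytes\n", bytes);	/* 1 << 15 = 32768 */
	return 0;
}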
@@ -2506,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
 	long max_size = perf_data_size(data);
 
-	atomic_set(&data->lock, -1);
-
 	if (event->attr.watermark) {
 		data->watermark = min_t(long, max_size,
 					event->attr.wakeup_watermark);
@@ -2580,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
+	/*
+	 * Don't allow mmap() of inherited per-task counters. This would
+	 * create a performance issue due to all children writing to the
+	 * same buffer.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
@@ -2885,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cur, cpu = get_cpu();
-
-	handle->locked = 0;
-
-	for (;;) {
-		cur = atomic_cmpxchg(&data->lock, -1, cpu);
-		if (cur == -1) {
-			handle->locked = 1;
-			break;
-		}
-		if (cur == cpu)
-			break;
 
-		cpu_relax();
-	}
+	preempt_disable();
+	local_inc(&data->nest);
+	handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
 	unsigned long head;
-	int cpu;
-
-	data->done_head = data->head;
-
-	if (!handle->locked)
-		goto out;
 
 again:
-	/*
-	 * The xchg implies a full barrier that ensures all writes are done
-	 * before we publish the new head, matched by a rmb() in userspace when
-	 * reading this position.
-	 */
-	while ((head = atomic_long_xchg(&data->done_head, 0)))
-		data->user_page->data_head = head;
+	head = local_read(&data->head);
 
 	/*
-	 * NMI can happen here, which means we can miss a done_head update.
+	 * IRQ/NMI can happen here, which means we can miss a head update.
 	 */
 
-	cpu = atomic_xchg(&data->lock, -1);
-	WARN_ON_ONCE(cpu != smp_processor_id());
+	if (!local_dec_and_test(&data->nest))
+		goto out;
 
 	/*
-	 * Therefore we have to validate we did not indeed do so.
+	 * Publish the known good head. Rely on the full barrier implied
+	 * by atomic_dec_and_test() order the data->head read and this
+	 * write.
 	 */
-	if (unlikely(atomic_long_read(&data->done_head))) {
-		/*
-		 * Since we had it locked, we can lock it again.
-		 */
-		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-			cpu_relax();
+	data->user_page->data_head = head;
 
+	/*
+	 * Now check if we missed an update, rely on the (compiler)
+	 * barrier in atomic_dec_and_test() to re-read data->head.
+	 */
+	if (unlikely(head != local_read(&data->head))) {
+		local_inc(&data->nest);
 		goto again;
 	}
 
-	if (atomic_xchg(&data->wakeup, 0))
+	if (handle->wakeup != local_read(&data->wakeup))
 		perf_output_wakeup(handle);
-out:
-	put_cpu();
+
+ out:
+	preempt_enable();
 }
 
-void perf_output_copy(struct perf_output_handle *handle,
+__always_inline void perf_output_copy(struct perf_output_handle *handle,
 		      const void *buf, unsigned int len)
 {
-	unsigned int pages_mask;
-	unsigned long offset;
-	unsigned int size;
-	void **pages;
-
-	offset = handle->offset;
-	pages_mask = handle->data->nr_pages - 1;
-	pages = handle->data->data_pages;
-
 	do {
-		unsigned long page_offset;
-		unsigned long page_size;
-		int nr;
+		unsigned long size = min_t(unsigned long, handle->size, len);
 
-		nr = (offset >> PAGE_SHIFT) & pages_mask;
-		page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
-		page_offset = offset & (page_size - 1);
-		size = min_t(unsigned int, page_size - page_offset, len);
+		memcpy(handle->addr, buf, size);
 
-		memcpy(pages[nr] + page_offset, buf, size);
+		len -= size;
+		handle->addr += size;
+		handle->size -= size;
+		if (!handle->size) {
+			struct perf_mmap_data *data = handle->data;
 
-		len -= size;
-		buf += size;
-		offset += size;
+			handle->page++;
+			handle->page &= data->nr_pages - 1;
+			handle->addr = data->data_pages[handle->page];
+			handle->size = PAGE_SIZE << page_order(data);
+		}
 	} while (len);
-
-	handle->offset = offset;
-
-	/*
-	 * Check we didn't copy past our reservation window, taking the
-	 * possible unsigned int wrap into account.
-	 */
-	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
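The old cross-CPU spinlock is gone: writers now only nest on the local CPU (IRQs/NMIs on top of a preempt-disabled section), so a plain nesting counter is enough and only the outermost writer publishes data_head. Below is a hedged, simplified user-space sketch of that publication scheme using C11 atomics; the ring/nest/user_head names are illustrative and this is not the kernel's code.

/* Minimal sketch of the "nest counter" publication idea behind
 * perf_output_get_handle()/perf_output_put_handle(). */
#include <stdatomic.h>

struct ring {
	atomic_ulong head;	/* write position, advanced by writers   */
	atomic_ulong nest;	/* nesting depth of in-flight writers    */
	atomic_ulong user_head;	/* position published to the reader      */
};

static void ring_get_handle(struct ring *r)
{
	atomic_fetch_add(&r->nest, 1);		/* enter: one more writer level */
}

static void ring_put_handle(struct ring *r)
{
	unsigned long head;
again:
	head = atomic_load(&r->head);

	/* A nested writer (think IRQ/NMI) may advance head right here. */

	if (atomic_fetch_sub(&r->nest, 1) != 1)
		return;				/* not outermost: don't publish */

	atomic_store(&r->user_head, head);	/* publish what we saw          */

	/* If head moved while we were publishing, become a writer again
	 * and repeat, so the reader never misses the final position. */
	if (head != atomic_load(&r->head)) {
		atomic_fetch_add(&r->nest, 1);
		goto again;
	}
}

int main(void)
{
	static struct ring r;			/* zero-initialized             */

	ring_get_handle(&r);
	atomic_fetch_add(&r.head, 64);		/* pretend we wrote 64 bytes    */
	ring_put_handle(&r);

	return (int)atomic_load(&r.user_head);	/* 64: head was published       */
}

The re-check after the decrement is what keeps the reader from missing a head advance made by a writer that nested in between the load and the publish.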
@@ -3036,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle,
 	handle->sample = sample;
 
 	if (!data->nr_pages)
-		goto fail;
+		goto out;
 
-	have_lost = atomic_read(&data->lost);
+	have_lost = local_read(&data->lost);
 	if (have_lost)
 		size += sizeof(lost_event);
 
-	perf_output_lock(handle);
+	perf_output_get_handle(handle);
 
 	do {
 		/*
@@ -3052,24 +3040,28 @@ int perf_output_begin(struct perf_output_handle *handle,
 		 */
 		tail = ACCESS_ONCE(data->user_page->data_tail);
 		smp_rmb();
-		offset = head = atomic_long_read(&data->head);
+		offset = head = local_read(&data->head);
 		head += size;
 		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+	} while (local_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->offset = offset;
-	handle->head = head;
+	if (head - local_read(&data->wakeup) > data->watermark)
+		local_add(data->watermark, &data->wakeup);
 
-	if (head - tail > data->watermark)
-		atomic_set(&data->wakeup, 1);
+	handle->page = offset >> (PAGE_SHIFT + page_order(data));
+	handle->page &= data->nr_pages - 1;
+	handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
+	handle->addr = data->data_pages[handle->page];
+	handle->addr += handle->size;
+	handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
 
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.id = event->id;
-		lost_event.lost = atomic_xchg(&data->lost, 0);
+		lost_event.lost = local_xchg(&data->lost, 0);
 
 		perf_output_put(handle, lost_event);
 	}
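With handle->offset/head gone, perf_output_begin() now resolves the reserved offset straight into a page index, a starting address, and the room left in that page, which is what lets perf_output_copy() above become a plain memcpy loop. A small stand-alone sketch of the same arithmetic; all values below are illustrative:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long nr_pages   = 8;		/* must be a power of two       */
	unsigned long page_order = 0;		/* 0 unless vmalloc-backed      */
	unsigned long offset     = 13000;	/* where the reservation starts */

	unsigned long chunk = PAGE_SIZE << page_order;	/* bytes per data page  */
	unsigned long page  = (offset >> (PAGE_SHIFT + page_order)) & (nr_pages - 1);
	unsigned long off   = offset & (chunk - 1);	/* offset inside page   */
	unsigned long room  = chunk - off;		/* space left in page   */

	printf("page=%lu in-page offset=%lu room=%lu\n", page, off, room);
	return 0;
}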
@@ -3077,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	atomic_inc(&data->lost);
-	perf_output_unlock(handle);
+	local_inc(&data->lost);
+	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
 
@@ -3093,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle)
 	int wakeup_events = event->attr.wakeup_events;
 
 	if (handle->sample && wakeup_events) {
-		int events = atomic_inc_return(&data->events);
+		int events = local_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &data->events);
-			atomic_set(&data->wakeup, 1);
+			local_sub(wakeup_events, &data->events);
+			local_inc(&data->wakeup);
 		}
 	}
 
-	perf_output_unlock(handle);
+	perf_output_put_handle(handle);
 	rcu_read_unlock();
 }
 
@@ -3436,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
 	struct perf_output_handle handle;
 	struct task_struct *task = task_event->task;
-	unsigned long flags;
 	int size, ret;
 
-	/*
-	 * If this CPU attempts to acquire an rq lock held by a CPU spinning
-	 * in perf_output_lock() from interrupt context, it's game over.
-	 */
-	local_irq_save(flags);
-
 	size = task_event->event_id.header.size;
 	ret = perf_output_begin(&handle, event, size, 0, 0);
 
-	if (ret) {
-		local_irq_restore(flags);
+	if (ret)
 		return;
-	}
 
 	task_event->event_id.pid = perf_event_pid(event, task);
 	task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_output_end(&handle);
-	local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -4020,9 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 		perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_tp_event_match(struct perf_event *event,
-			       struct perf_sample_data *data);
-
 static int perf_exclude_event(struct perf_event *event,
 			      struct pt_regs *regs)
 {
@@ -4052,10 +4031,6 @@ static int perf_swevent_match(struct perf_event *event,
 	if (perf_exclude_event(event, regs))
 		return 0;
 
-	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
-	    !perf_tp_event_match(event, data))
-		return 0;
-
 	return 1;
 }
 
@@ -4066,19 +4041,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
 	return hash_64(val, SWEVENT_HLIST_BITS);
 }
 
-static struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 {
-	u64 hash;
-	struct swevent_hlist *hlist;
+	u64 hash = swevent_hash(type, event_id);
+
+	return &hlist->heads[hash];
+}
 
-	hash = swevent_hash(type, event_id);
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+	struct swevent_hlist *hlist;
 
 	hlist = rcu_dereference(ctx->swevent_hlist);
 	if (!hlist)
 		return NULL;
 
-	return &hlist->heads[hash];
+	return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+{
+	struct swevent_hlist *hlist;
+	u32 event_id = event->attr.config;
+	u64 type = event->attr.type;
+
+	/*
+	 * Event scheduling is always serialized against hlist allocation
+	 * and release. Which makes the protected version suitable here.
+	 * The context lock guarantees that.
+	 */
+	hlist = rcu_dereference_protected(ctx->swevent_hlist,
+					  lockdep_is_held(&event->ctx->lock));
+	if (!hlist)
+		return NULL;
+
+	return __find_swevent_head(hlist, type, event_id);
 }
 
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
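The helpers above only split the existing lookup into a hashing step plus two callers with different RCU requirements. For illustration, roughly what hashing (type, event_id) into a bucket looks like, written as user-space C; the multiplier is a generic Fibonacci-hash constant rather than the kernel's hash_64() one, and SWEVENT_HLIST_BITS is an assumed value:

#include <stdint.h>
#include <stdio.h>

#define SWEVENT_HLIST_BITS 8	/* illustrative: 256 list heads */

static unsigned int swevent_bucket(uint64_t type, uint32_t event_id)
{
	uint64_t val = (uint64_t)event_id | (type << 32);	/* fold type and id */

	/* Multiply and keep the top bits; constant is 2^64/phi, hedged. */
	return (unsigned int)((val * 0x9E3779B97F4A7C15ULL) >>
			      (64 - SWEVENT_HLIST_BITS));
}

int main(void)
{
	printf("bucket=%u\n", swevent_bucket(1, 3));
	return 0;
}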
@@ -4095,7 +4097,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 
 	rcu_read_lock();
 
-	head = find_swevent_head(cpuctx, type, event_id);
+	head = find_swevent_head_rcu(cpuctx, type, event_id);
 
 	if (!head)
 		goto end;
@@ -4110,7 +4112,7 @@ end:
 
 int perf_swevent_get_recursion_context(void)
 {
-	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	int rctx;
 
 	if (in_nmi())
@@ -4122,10 +4124,8 @@ int perf_swevent_get_recursion_context(void)
 	else
 		rctx = 0;
 
-	if (cpuctx->recursion[rctx]) {
-		put_cpu_var(perf_cpu_context);
+	if (cpuctx->recursion[rctx])
 		return -1;
-	}
 
 	cpuctx->recursion[rctx]++;
 	barrier();
@@ -4139,7 +4139,6 @@ void perf_swevent_put_recursion_context(int rctx)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	barrier();
 	cpuctx->recursion[rctx]--;
-	put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
@@ -4150,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 	struct perf_sample_data data;
 	int rctx;
 
+	preempt_disable_notrace();
 	rctx = perf_swevent_get_recursion_context();
 	if (rctx < 0)
 		return;
@@ -4159,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 
 	perf_swevent_put_recursion_context(rctx);
+	preempt_enable_notrace();
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4178,7 +4179,7 @@ static int perf_swevent_enable(struct perf_event *event)
 		perf_swevent_set_period(event);
 	}
 
-	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	head = find_swevent_head(cpuctx, event);
 	if (WARN_ON_ONCE(!head))
 		return -EINVAL;
 
@@ -4366,6 +4367,14 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+{
+	return rcu_dereference_protected(cpuctx->swevent_hlist,
+					 lockdep_is_held(&cpuctx->hlist_mutex));
+}
+
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 {
 	struct swevent_hlist *hlist;
@@ -4376,12 +4385,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 
 static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
 {
-	struct swevent_hlist *hlist;
+	struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
 
-	if (!cpuctx->swevent_hlist)
+	if (!hlist)
 		return;
 
-	hlist = cpuctx->swevent_hlist;
 	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
 	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
@@ -4418,7 +4426,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 
 	mutex_lock(&cpuctx->hlist_mutex);
 
-	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+	if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4467,10 +4475,46 @@ static int swevent_hlist_get(struct perf_event *event)
 
 #ifdef CONFIG_EVENT_TRACING
 
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-		   int entry_size, struct pt_regs *regs)
+static const struct pmu perf_ops_tracepoint = {
+	.enable		= perf_trace_enable,
+	.disable	= perf_trace_disable,
+	.read		= perf_swevent_read,
+	.unthrottle	= perf_swevent_unthrottle,
+};
+
+static int perf_tp_filter_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+			       struct perf_sample_data *data,
+			       struct pt_regs *regs)
+{
+	/*
+	 * All tracepoints are from kernel-space.
+	 */
+	if (event->attr.exclude_kernel)
+		return 0;
+
+	if (!perf_tp_filter_match(event, data))
+		return 0;
+
+	return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+		   struct pt_regs *regs, struct hlist_head *head)
 {
 	struct perf_sample_data data;
+	struct perf_event *event;
+	struct hlist_node *node;
+
 	struct perf_raw_record raw = {
 		.size = entry_size,
 		.data = record,
@@ -4479,26 +4523,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	/* Trace events already protected against recursion */
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-			&data, regs);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+		if (perf_tp_event_match(event, &data, regs))
+			perf_swevent_add(event, count, 1, &data, regs);
+	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-static int perf_tp_event_match(struct perf_event *event,
-			       struct perf_sample_data *data)
-{
-	void *record = data->raw->data;
-
-	if (likely(!event->filter) || filter_match_preds(event->filter, record))
-		return 1;
-	return 0;
-}
-
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	perf_trace_disable(event->attr.config);
-	swevent_hlist_put(event);
+	perf_trace_destroy(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
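Tracepoint events now hang off a per-tracepoint list, and perf_tp_event() walks that list directly instead of funnelling through the generic software-event hash with an extra match callback. A hedged sketch of that dispatch shape, with RCU and the real perf structures stripped out; all names below are made up for illustration:

#include <stdio.h>

struct tp_event {
	int              exclude_kernel;	/* stand-in for event->attr      */
	const char      *name;
	struct tp_event *next;			/* hlist_entry in the kernel     */
};

static void tp_dispatch(struct tp_event *head, const void *record)
{
	for (struct tp_event *e = head; e; e = e->next) {
		if (e->exclude_kernel)		/* tracepoints are kernel-only,  */
			continue;		/* so such events never match    */
		printf("deliver %p to %s\n", (void *)record, e->name);
	}
}

int main(void)
{
	struct tp_event b = { 0, "event-b", NULL };
	struct tp_event a = { 1, "event-a", &b };

	tp_dispatch(&a, "raw record");		/* only event-b gets the record  */
	return 0;
}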
@@ -4514,17 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (perf_trace_enable(event->attr.config))
+	err = perf_trace_init(event);
+	if (err)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
-	err = swevent_hlist_get(event);
-	if (err) {
-		perf_trace_disable(event->attr.config);
-		return ERR_PTR(err);
-	}
 
-	return &perf_ops_generic;
+	return &perf_ops_tracepoint;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4552,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static int perf_tp_event_match(struct perf_event *event,
-			       struct perf_sample_data *data)
-{
-	return 1;
-}
-
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	return NULL;
@@ -4894,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
 	int fput_needed = 0;
 	int ret = -EINVAL;
 
+	/*
+	 * Don't allow output of inherited per-task events. This would
+	 * create performance issues due to cross cpu access.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!output_fd)
 		goto set;
 
@@ -4914,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
 	if (event->data)
 		goto out;
 
+	/*
+	 * Don't allow cross-cpu buffers
+	 */
+	if (output_event->cpu != event->cpu)
+		goto out;
+
+	/*
+	 * If its not a per-cpu buffer, it must be the same task.
+	 */
+	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+		goto out;
+
 	atomic_long_inc(&output_file->f_count);
 
 set: