Diffstat (limited to 'kernel'):

 kernel/perf_event.c             | 153
 kernel/trace/trace_event_perf.c |  11
 kernel/trace/trace_kprobe.c     |   4
 kernel/trace/trace_syscalls.c   |   6
 4 files changed, 84 insertions(+), 90 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 511677bc1c6a..2a060be3b07f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2320,6 +2320,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 	return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
+static void *perf_mmap_alloc_page(int cpu)
+{
+	struct page *page;
+	int node;
+
+	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 static struct perf_mmap_data *
 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
@@ -2336,12 +2349,12 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 	if (!data)
 		goto fail;
 
-	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	data->user_page = perf_mmap_alloc_page(event->cpu);
 	if (!data->user_page)
 		goto fail_user_page;
 
 	for (i = 0; i < nr_pages; i++) {
-		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
 		if (!data->data_pages[i])
 			goto fail_data_pages;
 	}
@@ -2506,8 +2519,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
 	long max_size = perf_data_size(data);
 
-	atomic_set(&data->lock, -1);
-
 	if (event->attr.watermark) {
 		data->watermark = min_t(long, max_size,
 					event->attr.wakeup_watermark);
@@ -2580,6 +2591,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
+	/*
+	 * Don't allow mmap() of inherited per-task counters. This would
+	 * create a performance issue due to all children writing to the
+	 * same buffer.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
@@ -2885,82 +2904,57 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cur, cpu = get_cpu();
-
-	handle->locked = 0;
-
-	for (;;) {
-		cur = atomic_cmpxchg(&data->lock, -1, cpu);
-		if (cur == -1) {
-			handle->locked = 1;
-			break;
-		}
-		if (cur == cpu)
-			break;
 
-		cpu_relax();
-	}
+	preempt_disable();
+	local_inc(&data->nest);
+	handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
 	unsigned long head;
-	int cpu;
-
-	data->done_head = data->head;
-
-	if (!handle->locked)
-		goto out;
 
 again:
-	/*
-	 * The xchg implies a full barrier that ensures all writes are done
-	 * before we publish the new head, matched by a rmb() in userspace when
-	 * reading this position.
-	 */
-	while ((head = atomic_long_xchg(&data->done_head, 0)))
-		data->user_page->data_head = head;
+	head = local_read(&data->head);
 
 	/*
-	 * NMI can happen here, which means we can miss a done_head update.
+	 * IRQ/NMI can happen here, which means we can miss a head update.
 	 */
 
-	cpu = atomic_xchg(&data->lock, -1);
-	WARN_ON_ONCE(cpu != smp_processor_id());
+	if (!local_dec_and_test(&data->nest))
+		return;
 
 	/*
-	 * Therefore we have to validate we did not indeed do so.
+	 * Publish the known good head. Rely on the full barrier implied
+	 * by local_dec_and_test() to order the data->head read and this
+	 * write.
 	 */
-	if (unlikely(atomic_long_read(&data->done_head))) {
-		/*
-		 * Since we had it locked, we can lock it again.
-		 */
-		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-			cpu_relax();
+	data->user_page->data_head = head;
 
+	/*
+	 * Now check if we missed an update, rely on the (compiler)
+	 * barrier in local_dec_and_test() to re-read data->head.
+	 */
+	if (unlikely(head != local_read(&data->head))) {
+		local_inc(&data->nest);
 		goto again;
 	}
 
-	if (atomic_xchg(&data->wakeup, 0))
+	if (handle->wakeup != local_read(&data->wakeup))
 		perf_output_wakeup(handle);
-out:
-	put_cpu();
+
+	preempt_enable();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
@@ -3036,13 +3030,13 @@ int perf_output_begin(struct perf_output_handle *handle,
 	handle->sample = sample;
 
 	if (!data->nr_pages)
-		goto fail;
+		goto out;
 
-	have_lost = atomic_read(&data->lost);
+	have_lost = local_read(&data->lost);
 	if (have_lost)
 		size += sizeof(lost_event);
 
-	perf_output_lock(handle);
+	perf_output_get_handle(handle);
 
 	do {
 		/*
@@ -3052,24 +3046,24 @@ int perf_output_begin(struct perf_output_handle *handle,
 		 */
 		tail = ACCESS_ONCE(data->user_page->data_tail);
 		smp_rmb();
-		offset = head = atomic_long_read(&data->head);
+		offset = head = local_read(&data->head);
 		head += size;
 		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+	} while (local_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset = offset;
 	handle->head = head;
 
 	if (head - tail > data->watermark)
-		atomic_set(&data->wakeup, 1);
+		local_inc(&data->wakeup);
 
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.id = event->id;
-		lost_event.lost = atomic_xchg(&data->lost, 0);
+		lost_event.lost = local_xchg(&data->lost, 0);
 
 		perf_output_put(handle, lost_event);
 	}
@@ -3077,8 +3071,8 @@ int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	atomic_inc(&data->lost);
-	perf_output_unlock(handle);
+	local_inc(&data->lost);
+	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
 
@@ -3093,14 +3087,14 @@ void perf_output_end(struct perf_output_handle *handle)
 	int wakeup_events = event->attr.wakeup_events;
 
 	if (handle->sample && wakeup_events) {
-		int events = atomic_inc_return(&data->events);
+		int events = local_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &data->events);
-			atomic_set(&data->wakeup, 1);
+			local_sub(wakeup_events, &data->events);
+			local_inc(&data->wakeup);
 		}
 	}
 
-	perf_output_unlock(handle);
+	perf_output_put_handle(handle);
 	rcu_read_unlock();
 }
 
@@ -3436,22 +3430,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
 	struct perf_output_handle handle;
 	struct task_struct *task = task_event->task;
-	unsigned long flags;
 	int size, ret;
 
-	/*
-	 * If this CPU attempts to acquire an rq lock held by a CPU spinning
-	 * in perf_output_lock() from interrupt context, it's game over.
-	 */
-	local_irq_save(flags);
-
 	size = task_event->event_id.header.size;
 	ret = perf_output_begin(&handle, event, size, 0, 0);
 
-	if (ret) {
-		local_irq_restore(flags);
+	if (ret)
 		return;
-	}
 
 	task_event->event_id.pid = perf_event_pid(event, task);
 	task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3447,6 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_output_end(&handle);
-	local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -4502,8 +4486,9 @@ static int swevent_hlist_get(struct perf_event *event)
 #ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-		   int entry_size, struct pt_regs *regs)
+		   int entry_size, struct pt_regs *regs, void *event)
 {
+	const int type = PERF_TYPE_TRACEPOINT;
 	struct perf_sample_data data;
 	struct perf_raw_record raw = {
 		.size = entry_size,
@@ -4513,9 +4498,13 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	/* Trace events already protected against recursion */
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-			 &data, regs);
+	if (!event) {
+		do_perf_sw_event(type, event_id, count, 1, &data, regs);
+		return;
+	}
+
+	if (perf_swevent_match(event, type, event_id, &data, regs))
+		perf_swevent_add(event, count, 1, &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
@@ -4548,7 +4537,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (perf_trace_enable(event->attr.config))
+	if (perf_trace_enable(event->attr.config, event))
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
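
The core of the kernel/perf_event.c changes is the replacement of the cross-CPU spinlock in perf_output_lock()/perf_output_unlock() with a per-CPU nesting counter: writers bump data->nest, only the outer-most writer publishes data_head to user space, and it re-checks the head afterwards in case a nested IRQ/NMI writer advanced it in the meantime. A minimal user-space model of that publish protocol is sketched below; the struct and function names are hypothetical, and C11 atomics stand in for the kernel's local_t operations.

#include <stdatomic.h>

struct ring {
	atomic_ulong head;		/* write cursor advanced by writers   */
	atomic_ulong nest;		/* writer nesting depth on this CPU   */
	unsigned long user_head;	/* head value published to the reader */
};

static void writer_enter(struct ring *r)
{
	/* The kernel also disables preemption here so "this CPU" is stable. */
	atomic_fetch_add(&r->nest, 1);
}

static void writer_exit(struct ring *r)
{
	unsigned long head;

again:
	head = atomic_load(&r->head);

	/* Nested (IRQ/NMI) writers just drop their nesting level. */
	if (atomic_fetch_sub(&r->nest, 1) != 1)
		return;

	/* Outer-most writer: publish the head we observed... */
	r->user_head = head;

	/* ...then re-check, since a nested writer may have advanced the head
	 * between our read above and the nest decrement; if so, become the
	 * outer-most writer again and republish. */
	if (head != atomic_load(&r->head)) {
		atomic_fetch_add(&r->nest, 1);
		goto again;
	}
}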
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..89b780a7c522 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -27,13 +27,15 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
 /* Count the events in use (per event id, not per instance) */
 static int total_ref_count;
 
-static int perf_trace_event_enable(struct ftrace_event_call *event)
+static int perf_trace_event_enable(struct ftrace_event_call *event, void *data)
 {
 	char *buf;
 	int ret = -ENOMEM;
 
-	if (event->perf_refcount++ > 0)
+	if (event->perf_refcount++ > 0) {
+		event->perf_data = NULL;
 		return 0;
+	}
 
 	if (!total_ref_count) {
 		buf = (char *)alloc_percpu(perf_trace_t);
@@ -51,6 +53,7 @@ static int perf_trace_event_enable(struct ftrace_event_call *event)
 
 	ret = event->perf_event_enable(event);
 	if (!ret) {
+		event->perf_data = data;
 		total_ref_count++;
 		return 0;
 	}
@@ -68,7 +71,7 @@ fail_buf:
 	return ret;
 }
 
-int perf_trace_enable(int event_id)
+int perf_trace_enable(int event_id, void *data)
 {
 	struct ftrace_event_call *event;
 	int ret = -EINVAL;
@@ -77,7 +80,7 @@ int perf_trace_enable(int event_id)
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id && event->perf_event_enable &&
 		    try_module_get(event->mod)) {
-			ret = perf_trace_event_enable(event);
+			ret = perf_trace_event_enable(event, data);
 			break;
 		}
 	}
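
The trace_event_perf.c side sets up the convention the perf_tp_event() change relies on: the first perf consumer of a trace event id stashes its event pointer in event->perf_data so later hits can be delivered to it directly, and a second consumer clears that pointer again, forcing the generic lookup path. A rough model of that convention, with hypothetical names, looks like this:

/* Names and layout are hypothetical; this only models the refcount rule. */
struct trace_event_model {
	int refcount;		/* perf consumers of this event id        */
	void *direct_target;	/* the lone consumer, or NULL when shared */
};

static int model_enable(struct trace_event_model *ev, void *consumer)
{
	if (ev->refcount++ > 0) {
		/* A second consumer makes direct delivery ambiguous, so
		 * every hit falls back to the generic per-type lookup. */
		ev->direct_target = NULL;
		return 0;
	}

	/* First consumer owns the fast path (the real code only stores the
	 * pointer once perf_event_enable() has succeeded). */
	ev->direct_target = consumer;
	return 0;
}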
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a7514326052b..2d7bf4146be8 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1362,7 +1362,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 	for (i = 0; i < tp->nr_args; i++)
 		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
 
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
+	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs, call->perf_data);
 }
 
 /* Kretprobe profile handler */
@@ -1395,7 +1395,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
 
 	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
-			      irq_flags, regs);
+			      irq_flags, regs, call->perf_data);
 }
 
 static int probe_perf_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..9eff1a4b49b9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -468,7 +468,8 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			      (unsigned long *)&rec->args);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs,
+			      sys_data->enter_event->perf_data);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -543,7 +544,8 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs,
+			      sys_data->exit_event->perf_data);
 }
 
 int perf_sysexit_enable(struct ftrace_event_call *call)
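
Taken together, the kprobe and syscall handlers now pass the per-call perf_data pointer down through perf_trace_buf_submit() into perf_tp_event(), which either delivers straight to that one event or falls back to do_perf_sw_event(). The self-contained toy program below models only that dispatch decision; every name in it is hypothetical.

#include <stdio.h>

struct consumer { int event_id; };

static void deliver_generic(int event_id, const char *record)
{
	printf("generic path: event %d, record \"%s\"\n", event_id, record);
}

static void deliver_direct(struct consumer *c, const char *record)
{
	printf("direct path: event %d, record \"%s\"\n", c->event_id, record);
}

static void tp_event_model(int event_id, const char *record,
			   struct consumer *target)
{
	if (!target) {
		deliver_generic(event_id, record);	/* shared consumers */
		return;
	}

	if (target->event_id == event_id)		/* same match check */
		deliver_direct(target, record);
}

int main(void)
{
	struct consumer only = { .event_id = 42 };

	tp_event_model(42, "sys_enter", &only);	/* single consumer: direct */
	tp_event_model(42, "sys_enter", NULL);	/* shared: generic fallback */
	return 0;
}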