Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c | 153
1 file changed, 71 insertions(+), 82 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 511677bc1c6a..2a060be3b07f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2320,6 +2320,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 	return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
+static void *perf_mmap_alloc_page(int cpu)
+{
+	struct page *page;
+	int node;
+
+	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 static struct perf_mmap_data *
 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
@@ -2336,12 +2349,12 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 	if (!data)
 		goto fail;
 
-	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	data->user_page = perf_mmap_alloc_page(event->cpu);
 	if (!data->user_page)
 		goto fail_user_page;
 
 	for (i = 0; i < nr_pages; i++) {
-		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
 		if (!data->data_pages[i])
 			goto fail_data_pages;
 	}
@@ -2506,8 +2519,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
 	long max_size = perf_data_size(data);
 
-	atomic_set(&data->lock, -1);
-
 	if (event->attr.watermark) {
 		data->watermark = min_t(long, max_size,
 					event->attr.wakeup_watermark);
@@ -2580,6 +2591,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
+	/*
+	 * Don't allow mmap() of inherited per-task counters. This would
+	 * create a performance issue due to all children writing to the
+	 * same buffer.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
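Note: the new check above turns mmap() of an inherited per-task event (cpu == -1 with attr.inherit set) into a hard -EINVAL. A minimal user-space sketch of the resulting behaviour, not part of the patch; the 1+8 page buffer size is just one valid choice, and error handling is abbreviated:

/*
 * Sketch: on a kernel with the hunk above applied, this mmap() is
 * expected to fail with EINVAL because the event is inherited and
 * not bound to a CPU.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long page = sysconf(_SC_PAGESIZE);
	void *buf;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.inherit = 1;			/* inherited ... */

	/* pid = 0, cpu = -1: per-task counter, not bound to a CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	buf = mmap(NULL, (1 + 8) * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED)
		printf("mmap failed as expected: %s\n", strerror(errno));	/* EINVAL */

	close(fd);
	return 0;
}
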
@@ -2885,82 +2904,57 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cur, cpu = get_cpu();
-
-	handle->locked = 0;
-
-	for (;;) {
-		cur = atomic_cmpxchg(&data->lock, -1, cpu);
-		if (cur == -1) {
-			handle->locked = 1;
-			break;
-		}
-		if (cur == cpu)
-			break;
 
-		cpu_relax();
-	}
+	preempt_disable();
+	local_inc(&data->nest);
+	handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
 	unsigned long head;
-	int cpu;
-
-	data->done_head = data->head;
-
-	if (!handle->locked)
-		goto out;
 
 again:
-	/*
-	 * The xchg implies a full barrier that ensures all writes are done
-	 * before we publish the new head, matched by a rmb() in userspace when
-	 * reading this position.
-	 */
-	while ((head = atomic_long_xchg(&data->done_head, 0)))
-		data->user_page->data_head = head;
+	head = local_read(&data->head);
 
 	/*
-	 * NMI can happen here, which means we can miss a done_head update.
+	 * IRQ/NMI can happen here, which means we can miss a head update.
 	 */
 
-	cpu = atomic_xchg(&data->lock, -1);
-	WARN_ON_ONCE(cpu != smp_processor_id());
+	if (!local_dec_and_test(&data->nest))
+		return;
 
 	/*
-	 * Therefore we have to validate we did not indeed do so.
+	 * Publish the known good head. Rely on the full barrier implied
+	 * by atomic_dec_and_test() order the data->head read and this
+	 * write.
 	 */
-	if (unlikely(atomic_long_read(&data->done_head))) {
-		/*
-		 * Since we had it locked, we can lock it again.
-		 */
-		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-			cpu_relax();
+	data->user_page->data_head = head;
 
+	/*
+	 * Now check if we missed an update, rely on the (compiler)
+	 * barrier in atomic_dec_and_test() to re-read data->head.
+	 */
+	if (unlikely(head != local_read(&data->head))) {
+		local_inc(&data->nest);
 		goto again;
 	}
 
-	if (atomic_xchg(&data->wakeup, 0))
+	if (handle->wakeup != local_read(&data->wakeup))
 		perf_output_wakeup(handle);
-out:
-	put_cpu();
+
+	preempt_enable();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
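Note: the rewritten pair above replaces the cross-CPU cmpxchg lock with a per-buffer nesting counter. With preemption disabled across the output section, the only remaining concurrency on a given buffer is IRQ/NMI nesting on the local CPU, so only the outer-most writer publishes data_head and then re-checks it in case a nested writer moved it in the meantime. A single-threaded user-space model of that control flow, a sketch only: the plain counters stand in for local_t, and the structure and function names are invented for illustration, not taken from the patch.

#include <stdio.h>

struct model_buffer {
	unsigned long head;		/* kernel-side write position */
	unsigned long user_data_head;	/* what user space would see */
	long nest;			/* nesting depth of writers */
};

static void output_get(struct model_buffer *b)
{
	/* preempt_disable() in the kernel; nothing to do in the model */
	b->nest++;
}

static void output_put(struct model_buffer *b)
{
	unsigned long head;
again:
	head = b->head;

	if (--b->nest)			/* not the outer-most writer: no publish */
		return;

	b->user_data_head = head;	/* publish the known good head */

	if (head != b->head) {		/* a nested writer moved head meanwhile */
		b->nest++;
		goto again;
	}
	/* preempt_enable() in the kernel */
}

int main(void)
{
	struct model_buffer b = { 0, 0, 0 };

	output_get(&b); b.head += 64;	/* outer writer reserves 64 bytes */
	output_get(&b); b.head += 32;	/* "nested" writer reserves 32 bytes */
	output_put(&b);			/* inner end: nest != 0, nothing published */
	printf("published %lu (expect 0)\n", b.user_data_head);
	output_put(&b);			/* outer end: publishes 96 */
	printf("published %lu (expect 96)\n", b.user_data_head);
	return 0;
}
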
@@ -3036,13 +3030,13 @@ int perf_output_begin(struct perf_output_handle *handle,
 	handle->sample = sample;
 
 	if (!data->nr_pages)
-		goto fail;
+		goto out;
 
-	have_lost = atomic_read(&data->lost);
+	have_lost = local_read(&data->lost);
 	if (have_lost)
 		size += sizeof(lost_event);
 
-	perf_output_lock(handle);
+	perf_output_get_handle(handle);
 
 	do {
 		/*
@@ -3052,24 +3046,24 @@ int perf_output_begin(struct perf_output_handle *handle,
 		 */
 		tail = ACCESS_ONCE(data->user_page->data_tail);
 		smp_rmb();
-		offset = head = atomic_long_read(&data->head);
+		offset = head = local_read(&data->head);
 		head += size;
 		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+	} while (local_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset = offset;
 	handle->head = head;
 
 	if (head - tail > data->watermark)
-		atomic_set(&data->wakeup, 1);
+		local_inc(&data->wakeup);
 
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.id = event->id;
-		lost_event.lost = atomic_xchg(&data->lost, 0);
+		lost_event.lost = local_xchg(&data->lost, 0);
 
 		perf_output_put(handle, lost_event);
 	}
@@ -3077,8 +3071,8 @@ int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	atomic_inc(&data->lost);
-	perf_output_unlock(handle);
+	local_inc(&data->lost);
+	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
 
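Note: perf_output_begin() still samples user_page->data_tail with ACCESS_ONCE() and an smp_rmb() before reserving space, so the consumer side of the protocol is unchanged: read data_head, consume records, then store data_tail behind a full barrier. A hedged reader sketch; the x86 barrier macros and the process_record() callback are assumptions, and record wrap-around at the buffer edge is ignored for brevity.

#include <stdint.h>
#include <linux/perf_event.h>

#define rmb()	__asm__ __volatile__("lfence" ::: "memory")
#define mb()	__asm__ __volatile__("mfence" ::: "memory")

static void drain(struct perf_event_mmap_page *pg, char *data, uint64_t size,
		  void (*process_record)(char *rec))
{
	uint64_t head, tail;

	head = pg->data_head;		/* written by the kernel */
	rmb();				/* pairs with the kernel's head publish */

	tail = pg->data_tail;
	while (tail < head) {
		struct perf_event_header *hdr =
			(struct perf_event_header *)(data + (tail & (size - 1)));
		process_record((char *)hdr);
		tail += hdr->size;
	}

	mb();				/* finish reading before releasing the space */
	pg->data_tail = tail;		/* read back by perf_output_begin() */
}
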
@@ -3093,14 +3087,14 @@ void perf_output_end(struct perf_output_handle *handle)
 	int wakeup_events = event->attr.wakeup_events;
 
 	if (handle->sample && wakeup_events) {
-		int events = atomic_inc_return(&data->events);
+		int events = local_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &data->events);
-			atomic_set(&data->wakeup, 1);
+			local_sub(wakeup_events, &data->events);
+			local_inc(&data->wakeup);
 		}
 	}
 
-	perf_output_unlock(handle);
+	perf_output_put_handle(handle);
 	rcu_read_unlock();
 }
 
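Note: the wakeup_events accounting above bumps data->wakeup every wakeup_events samples, and the outer-most perf_output_put_handle() then calls perf_output_wakeup(), which is what unblocks a poller on the event fd. A user-space sketch of how that is consumed, illustration only with error handling omitted; the period, sample type and buffer size are arbitrary choices.

#include <string.h>
#include <poll.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int wait_for_samples(void)
{
	struct perf_event_attr attr;
	struct pollfd pfd;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP;
	attr.wakeup_events = 16;	/* perf_output_end() wakes us every 16 samples */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

	/* 1 + 2^n pages; this mapping is what perf_output_wakeup() signals */
	mmap(NULL, (1 + 8) * sysconf(_SC_PAGESIZE),
	     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	pfd.fd = fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);		/* returns once the wakeup fires */

	return fd;
}
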
@@ -3436,22 +3430,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
 	struct perf_output_handle handle;
 	struct task_struct *task = task_event->task;
-	unsigned long flags;
 	int size, ret;
 
-	/*
-	 * If this CPU attempts to acquire an rq lock held by a CPU spinning
-	 * in perf_output_lock() from interrupt context, it's game over.
-	 */
-	local_irq_save(flags);
-
 	size = task_event->event_id.header.size;
 	ret = perf_output_begin(&handle, event, size, 0, 0);
 
-	if (ret) {
-		local_irq_restore(flags);
+	if (ret)
 		return;
-	}
 
 	task_event->event_id.pid = perf_event_pid(event, task);
 	task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3447,6 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_output_end(&handle);
-	local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -4502,8 +4486,9 @@ static int swevent_hlist_get(struct perf_event *event)
 #ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-		   int entry_size, struct pt_regs *regs)
+		   int entry_size, struct pt_regs *regs, void *event)
 {
+	const int type = PERF_TYPE_TRACEPOINT;
 	struct perf_sample_data data;
 	struct perf_raw_record raw = {
 		.size = entry_size,
@@ -4513,9 +4498,13 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	/* Trace events already protected against recursion */
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-			 &data, regs);
+	if (!event) {
+		do_perf_sw_event(type, event_id, count, 1, &data, regs);
+		return;
+	}
+
+	if (perf_swevent_match(event, type, event_id, &data, regs))
+		perf_swevent_add(event, count, 1, &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
@@ -4548,7 +4537,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 			!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (perf_trace_enable(event->attr.config))
+	if (perf_trace_enable(event->attr.config, event))
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;