Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c  389
1 files changed, 217 insertions, 172 deletions

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..e099650cd249 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2297,11 +2297,6 @@ unlock:
 	rcu_read_unlock();
 }
 
-static unsigned long perf_data_size(struct perf_mmap_data *data)
-{
-	return data->nr_pages << (PAGE_SHIFT + data->data_order);
-}
-
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
@@ -2320,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 	return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
+static void *perf_mmap_alloc_page(int cpu)
+{
+	struct page *page;
+	int node;
+
+	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 static struct perf_mmap_data *
 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
@@ -2336,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 	if (!data)
 		goto fail;
 
-	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	data->user_page = perf_mmap_alloc_page(event->cpu);
 	if (!data->user_page)
 		goto fail_user_page;
 
 	for (i = 0; i < nr_pages; i++) {
-		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
 		if (!data->data_pages[i])
 			goto fail_data_pages;
 	}
 
-	data->data_order = 0;
 	data->nr_pages = nr_pages;
 
 	return data;
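
Note on the hunks above: the new perf_mmap_alloc_page() helper replaces get_zeroed_page() so that buffer pages follow event->cpu onto the matching NUMA node, while __GFP_ZERO keeps the pages zero-filled as before (both the user page and the data pages get mapped into userspace). A minimal standalone sketch of the same node-local allocation pattern (illustrative only, not part of the patch):

    /* Illustrative: one zeroed page local to "cpu"; -1 means "no preference". */
    static void *alloc_local_zeroed_page(int cpu)
    {
            int node = (cpu == -1) ? -1 : cpu_to_node(cpu);
            struct page *page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);

            return page ? page_address(page) : NULL;
    }
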
@@ -2382,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
 	kfree(data);
 }
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+	return 0;
+}
+
 #else
 
 /*
@@ -2390,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
  * Required for architectures that have d-cache aliasing issues.
  */
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+	return data->page_order;
+}
+
 static struct page *
 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 {
-	if (pgoff > (1UL << data->data_order))
+	if (pgoff > (1UL << page_order(data)))
 		return NULL;
 
 	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2413,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
 	int i, nr;
 
 	data = container_of(work, struct perf_mmap_data, work);
-	nr = 1 << data->data_order;
+	nr = 1 << page_order(data);
 
 	base = data->user_page;
 	for (i = 0; i < nr + 1; i++)
@@ -2452,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 
 	data->user_page = all_buf;
 	data->data_pages[0] = all_buf + PAGE_SIZE;
-	data->data_order = ilog2(nr_pages);
+	data->page_order = ilog2(nr_pages);
 	data->nr_pages = 1;
 
 	return data;
@@ -2466,6 +2483,11 @@ fail:
 
 #endif
 
+static unsigned long perf_data_size(struct perf_mmap_data *data)
+{
+	return data->nr_pages << (PAGE_SHIFT + page_order(data));
+}
+
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_event *event = vma->vm_file->private_data;
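
A quick worked example of the relocated perf_data_size(), which now sits below both #ifdef branches and uses the new page_order() helper (illustrative, assuming PAGE_SHIFT == 12, i.e. 4 KiB pages):

    /* page-backed build:   nr_pages = 8, page_order() = 0      -> 8 << (12 + 0) = 32768 bytes */
    /* vmalloc-based build: the same 32 KiB buffer is one area,
       nr_pages = 1, page_order = ilog2(8) = 3                  -> 1 << (12 + 3) = 32768 bytes */
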
@@ -2506,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
 	long max_size = perf_data_size(data);
 
-	atomic_set(&data->lock, -1);
-
 	if (event->attr.watermark) {
 		data->watermark = min_t(long, max_size,
 					event->attr.wakeup_watermark);
@@ -2580,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
+	/*
+	 * Don't allow mmap() of inherited per-task counters. This would
+	 * create a performance issue due to all children writing to the
+	 * same buffer.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
@@ -2885,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cur, cpu = get_cpu();
-
-	handle->locked = 0;
-
-	for (;;) {
-		cur = atomic_cmpxchg(&data->lock, -1, cpu);
-		if (cur == -1) {
-			handle->locked = 1;
-			break;
-		}
-		if (cur == cpu)
-			break;
 
-		cpu_relax();
-	}
+	preempt_disable();
+	local_inc(&data->nest);
+	handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
 	unsigned long head;
-	int cpu;
-
-	data->done_head = data->head;
-
-	if (!handle->locked)
-		goto out;
 
 again:
-	/*
-	 * The xchg implies a full barrier that ensures all writes are done
-	 * before we publish the new head, matched by a rmb() in userspace when
-	 * reading this position.
-	 */
-	while ((head = atomic_long_xchg(&data->done_head, 0)))
-		data->user_page->data_head = head;
+	head = local_read(&data->head);
 
 	/*
-	 * NMI can happen here, which means we can miss a done_head update.
+	 * IRQ/NMI can happen here, which means we can miss a head update.
	 */
 
-	cpu = atomic_xchg(&data->lock, -1);
-	WARN_ON_ONCE(cpu != smp_processor_id());
+	if (!local_dec_and_test(&data->nest))
+		goto out;
 
 	/*
-	 * Therefore we have to validate we did not indeed do so.
+	 * Publish the known good head. Rely on the full barrier implied
+	 * by atomic_dec_and_test() order the data->head read and this
+	 * write.
 	 */
-	if (unlikely(atomic_long_read(&data->done_head))) {
-		/*
-		 * Since we had it locked, we can lock it again.
-		 */
-		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-			cpu_relax();
+	data->user_page->data_head = head;
 
+	/*
+	 * Now check if we missed an update, rely on the (compiler)
+	 * barrier in atomic_dec_and_test() to re-read data->head.
+	 */
+	if (unlikely(head != local_read(&data->head))) {
+		local_inc(&data->nest);
 		goto again;
 	}
 
-	if (atomic_xchg(&data->wakeup, 0))
+	if (handle->wakeup != local_read(&data->wakeup))
 		perf_output_wakeup(handle);
-out:
-	put_cpu();
+
+out:
+	preempt_enable();
 }
 
-void perf_output_copy(struct perf_output_handle *handle,
+__always_inline void perf_output_copy(struct perf_output_handle *handle,
 		      const void *buf, unsigned int len)
 {
-	unsigned int pages_mask;
-	unsigned long offset;
-	unsigned int size;
-	void **pages;
-
-	offset = handle->offset;
-	pages_mask = handle->data->nr_pages - 1;
-	pages = handle->data->data_pages;
-
 	do {
-		unsigned long page_offset;
-		unsigned long page_size;
-		int nr;
+		unsigned long size = min_t(unsigned long, handle->size, len);
 
-		nr = (offset >> PAGE_SHIFT) & pages_mask;
-		page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
-		page_offset = offset & (page_size - 1);
-		size = min_t(unsigned int, page_size - page_offset, len);
+		memcpy(handle->addr, buf, size);
 
-		memcpy(pages[nr] + page_offset, buf, size);
+		len -= size;
+		handle->addr += size;
+		handle->size -= size;
+		if (!handle->size) {
+			struct perf_mmap_data *data = handle->data;
 
-		len -= size;
-		buf += size;
-		offset += size;
+			handle->page++;
+			handle->page &= data->nr_pages - 1;
+			handle->addr = data->data_pages[handle->page];
+			handle->size = PAGE_SIZE << page_order(data);
+		}
 	} while (len);
-
-	handle->offset = offset;
-
-	/*
-	 * Check we didn't copy past our reservation window, taking the
-	 * possible unsigned int wrap into account.
-	 */
-	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
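
The hunk above replaces the old cross-CPU perf_output_lock()/perf_output_unlock() spin on data->lock with a per-buffer nesting counter: writers run with preemption disabled, nested IRQ/NMI writers only bump data->nest, and the outermost writer alone publishes data_head, re-checking afterwards in case a nested writer advanced the head in between. A simplified single-structure model of that publication scheme (illustrative only; plain integers stand in for local_t and no real concurrency is shown):

    /* Illustrative model, not kernel code: nested writers on one CPU. */
    struct ring {
            unsigned long head;      /* next free byte, advanced by writers */
            unsigned long user_head; /* what userspace may read (data_head) */
            int nest;                /* writer nesting depth on this CPU    */
    };

    static void put_handle(struct ring *r)
    {
            unsigned long head;
    again:
            head = r->head;
            if (--r->nest)           /* nested writer: leave publishing to  */
                    return;          /* the outermost writer                */
            r->user_head = head;     /* outermost writer publishes          */
            if (head != r->head) {   /* a nested writer slipped in after we */
                    r->nest++;       /* read head: publish the newer value  */
                    goto again;
            }
    }
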
@@ -3036,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle,
 	handle->sample = sample;
 
 	if (!data->nr_pages)
-		goto fail;
+		goto out;
 
-	have_lost = atomic_read(&data->lost);
+	have_lost = local_read(&data->lost);
 	if (have_lost)
 		size += sizeof(lost_event);
 
-	perf_output_lock(handle);
+	perf_output_get_handle(handle);
 
 	do {
 		/*
@@ -3052,24 +3040,28 @@
 		 */
 		tail = ACCESS_ONCE(data->user_page->data_tail);
 		smp_rmb();
-		offset = head = atomic_long_read(&data->head);
+		offset = head = local_read(&data->head);
 		head += size;
 		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+	} while (local_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->offset = offset;
-	handle->head = head;
+	if (head - local_read(&data->wakeup) > data->watermark)
+		local_add(data->watermark, &data->wakeup);
 
-	if (head - tail > data->watermark)
-		atomic_set(&data->wakeup, 1);
+	handle->page = offset >> (PAGE_SHIFT + page_order(data));
+	handle->page &= data->nr_pages - 1;
+	handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
+	handle->addr = data->data_pages[handle->page];
+	handle->addr += handle->size;
+	handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
 
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.id = event->id;
-		lost_event.lost = atomic_xchg(&data->lost, 0);
+		lost_event.lost = local_xchg(&data->lost, 0);
 
 		perf_output_put(handle, lost_event);
 	}
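
A worked example of the new handle setup above (illustrative, assuming 4 KiB pages, page_order() == 0 and nr_pages == 8): if the reserved record starts at offset 0x2150,

    handle->page = 0x2150 >> 12     = 2       (then masked with nr_pages - 1 = 7)
    handle->size = 0x2150 & 0xfff   = 0x150   (offset into that page, temporarily)
    handle->addr = data_pages[2] + 0x150
    handle->size = 0x1000 - 0x150   = 0xeb0   (bytes left in data_pages[2])

so perf_output_copy() writes up to 0xeb0 bytes into data_pages[2] and then wraps to data_pages[3] via its !handle->size branch shown earlier.
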
@@ -3077,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	atomic_inc(&data->lost);
-	perf_output_unlock(handle);
+	local_inc(&data->lost);
+	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
 
@@ -3093,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle)
 	int wakeup_events = event->attr.wakeup_events;
 
 	if (handle->sample && wakeup_events) {
-		int events = atomic_inc_return(&data->events);
+		int events = local_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &data->events);
-			atomic_set(&data->wakeup, 1);
+			local_sub(wakeup_events, &data->events);
+			local_inc(&data->wakeup);
 		}
 	}
 
-	perf_output_unlock(handle);
+	perf_output_put_handle(handle);
 	rcu_read_unlock();
 }
 
@@ -3436,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
 	struct perf_output_handle handle;
 	struct task_struct *task = task_event->task;
-	unsigned long flags;
 	int size, ret;
 
-	/*
-	 * If this CPU attempts to acquire an rq lock held by a CPU spinning
-	 * in perf_output_lock() from interrupt context, it's game over.
-	 */
-	local_irq_save(flags);
-
 	size = task_event->event_id.header.size;
 	ret = perf_output_begin(&handle, event, size, 0, 0);
 
-	if (ret) {
-		local_irq_restore(flags);
+	if (ret)
 		return;
-	}
 
 	task_event->event_id.pid = perf_event_pid(event, task);
 	task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_output_end(&handle);
-	local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -4020,9 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 		perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data);
-
 static int perf_exclude_event(struct perf_event *event,
 			      struct pt_regs *regs)
 {
@@ -4052,10 +4031,6 @@ static int perf_swevent_match(struct perf_event *event,
 	if (perf_exclude_event(event, regs))
 		return 0;
 
-	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
-	    !perf_tp_event_match(event, data))
-		return 0;
-
 	return 1;
 }
 
@@ -4066,19 +4041,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
 	return hash_64(val, SWEVENT_HLIST_BITS);
 }
 
-static struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 {
-	u64 hash;
-	struct swevent_hlist *hlist;
+	u64 hash = swevent_hash(type, event_id);
+
+	return &hlist->heads[hash];
+}
 
-	hash = swevent_hash(type, event_id);
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+	struct swevent_hlist *hlist;
 
 	hlist = rcu_dereference(ctx->swevent_hlist);
 	if (!hlist)
 		return NULL;
 
-	return &hlist->heads[hash];
+	return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+{
+	struct swevent_hlist *hlist;
+	u32 event_id = event->attr.config;
+	u64 type = event->attr.type;
+
+	/*
+	 * Event scheduling is always serialized against hlist allocation
+	 * and release. Which makes the protected version suitable here.
+	 * The context lock guarantees that.
+	 */
+	hlist = rcu_dereference_protected(ctx->swevent_hlist,
+					  lockdep_is_held(&event->ctx->lock));
+	if (!hlist)
+		return NULL;
+
+	return __find_swevent_head(hlist, type, event_id);
 }
 
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
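
The hunk above splits the old find_swevent_head() in two: the event-delivery path keeps using rcu_dereference() under rcu_read_lock(), while the scheduling path, already serialized against hlist allocation/release by the context lock, uses rcu_dereference_protected() and tells lockdep why that is safe. The general shape of that read-side/update-side split (an illustrative sketch with made-up names, not the patch itself):

    /* Illustrative pattern only. */
    struct table;                       /* some RCU-managed structure     */
    static struct table __rcu *tbl;
    static DEFINE_MUTEX(tbl_mutex);

    /* read side: callers hold rcu_read_lock() */
    static struct table *tbl_get_rcu(void)
    {
            return rcu_dereference(tbl);
    }

    /* update side: callers hold tbl_mutex, no RCU read lock required */
    static struct table *tbl_get_locked(void)
    {
            return rcu_dereference_protected(tbl, lockdep_is_held(&tbl_mutex));
    }
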
@@ -4095,7 +4097,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 
 	rcu_read_lock();
 
-	head = find_swevent_head(cpuctx, type, event_id);
+	head = find_swevent_head_rcu(cpuctx, type, event_id);
 
 	if (!head)
 		goto end;
@@ -4110,7 +4112,7 @@ end:
 
 int perf_swevent_get_recursion_context(void)
 {
-	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	int rctx;
 
 	if (in_nmi())
@@ -4122,10 +4124,8 @@ int perf_swevent_get_recursion_context(void)
 	else
 		rctx = 0;
 
-	if (cpuctx->recursion[rctx]) {
-		put_cpu_var(perf_cpu_context);
+	if (cpuctx->recursion[rctx])
 		return -1;
-	}
 
 	cpuctx->recursion[rctx]++;
 	barrier();
@@ -4139,7 +4139,6 @@ void perf_swevent_put_recursion_context(int rctx)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	barrier();
 	cpuctx->recursion[rctx]--;
-	put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
@@ -4150,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 	struct perf_sample_data data;
 	int rctx;
 
+	preempt_disable_notrace();
 	rctx = perf_swevent_get_recursion_context();
 	if (rctx < 0)
 		return;
@@ -4159,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 
 	perf_swevent_put_recursion_context(rctx);
+	preempt_enable_notrace();
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4178,7 +4179,7 @@ static int perf_swevent_enable(struct perf_event *event)
 		perf_swevent_set_period(event);
 	}
 
-	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	head = find_swevent_head(cpuctx, event);
 	if (WARN_ON_ONCE(!head))
 		return -EINVAL;
 
@@ -4366,6 +4367,14 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+{
+	return rcu_dereference_protected(cpuctx->swevent_hlist,
+					 lockdep_is_held(&cpuctx->hlist_mutex));
+}
+
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 {
 	struct swevent_hlist *hlist;
@@ -4376,12 +4385,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 
 static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
 {
-	struct swevent_hlist *hlist;
+	struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
 
-	if (!cpuctx->swevent_hlist)
+	if (!hlist)
 		return;
 
-	hlist = cpuctx->swevent_hlist;
 	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
 	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
@@ -4418,7 +4426,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 
 	mutex_lock(&cpuctx->hlist_mutex);
 
-	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+	if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4467,10 +4475,46 @@ static int swevent_hlist_get(struct perf_event *event)
 
 #ifdef CONFIG_EVENT_TRACING
 
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-		   int entry_size, struct pt_regs *regs)
+static const struct pmu perf_ops_tracepoint = {
+	.enable		= perf_trace_enable,
+	.disable	= perf_trace_disable,
+	.read		= perf_swevent_read,
+	.unthrottle	= perf_swevent_unthrottle,
+};
+
+static int perf_tp_filter_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	/*
+	 * All tracepoints are from kernel-space.
+	 */
+	if (event->attr.exclude_kernel)
+		return 0;
+
+	if (!perf_tp_filter_match(event, data))
+		return 0;
+
+	return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+		   struct pt_regs *regs, struct hlist_head *head)
 {
 	struct perf_sample_data data;
+	struct perf_event *event;
+	struct hlist_node *node;
+
 	struct perf_raw_record raw = {
 		.size = entry_size,
 		.data = record,
@@ -4479,26 +4523,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	/* Trace events already protected against recursion */
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-			&data, regs);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+		if (perf_tp_event_match(event, &data, regs))
+			perf_swevent_add(event, count, 1, &data, regs);
+	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data)
-{
-	void *record = data->raw->data;
-
-	if (likely(!event->filter) || filter_match_preds(event->filter, record))
-		return 1;
-	return 0;
-}
-
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	perf_trace_disable(event->attr.config);
-	swevent_hlist_put(event);
+	perf_trace_destroy(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4514,17 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (perf_trace_enable(event->attr.config))
+	err = perf_trace_init(event);
+	if (err)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
-	err = swevent_hlist_get(event);
-	if (err) {
-		perf_trace_disable(event->attr.config);
-		return ERR_PTR(err);
-	}
 
-	return &perf_ops_generic;
+	return &perf_ops_tracepoint;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4552,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data)
-{
-	return 1;
-}
-
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	return NULL;
@@ -4894,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
 	int fput_needed = 0;
 	int ret = -EINVAL;
 
+	/*
+	 * Don't allow output of inherited per-task events. This would
+	 * create performance issues due to cross cpu access.
+	 */
+	if (event->cpu == -1 && event->attr.inherit)
+		return -EINVAL;
+
 	if (!output_fd)
 		goto set;
 
@@ -4914,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
 	if (event->data)
 		goto out;
 
+	/*
+	 * Don't allow cross-cpu buffers
+	 */
+	if (output_event->cpu != event->cpu)
+		goto out;
+
+	/*
+	 * If its not a per-cpu buffer, it must be the same task.
+	 */
+	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+		goto out;
+
 	atomic_long_inc(&output_file->f_count);
 
 set: