Diffstat (limited to 'kernel/perf_event.c')
| -rw-r--r-- | kernel/perf_event.c | 389 |
1 file changed, 217 insertions, 172 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..e099650cd249 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
| @@ -2297,11 +2297,6 @@ unlock: | |||
| 2297 | rcu_read_unlock(); | 2297 | rcu_read_unlock(); |
| 2298 | } | 2298 | } |
| 2299 | 2299 | ||
| 2300 | static unsigned long perf_data_size(struct perf_mmap_data *data) | ||
| 2301 | { | ||
| 2302 | return data->nr_pages << (PAGE_SHIFT + data->data_order); | ||
| 2303 | } | ||
| 2304 | |||
| 2305 | #ifndef CONFIG_PERF_USE_VMALLOC | 2300 | #ifndef CONFIG_PERF_USE_VMALLOC |
| 2306 | 2301 | ||
| 2307 | /* | 2302 | /* |
| @@ -2320,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | |||
| 2320 | return virt_to_page(data->data_pages[pgoff - 1]); | 2315 | return virt_to_page(data->data_pages[pgoff - 1]); |
| 2321 | } | 2316 | } |
| 2322 | 2317 | ||
| 2318 | static void *perf_mmap_alloc_page(int cpu) | ||
| 2319 | { | ||
| 2320 | struct page *page; | ||
| 2321 | int node; | ||
| 2322 | |||
| 2323 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); | ||
| 2324 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
| 2325 | if (!page) | ||
| 2326 | return NULL; | ||
| 2327 | |||
| 2328 | return page_address(page); | ||
| 2329 | } | ||
| 2330 | |||
| 2323 | static struct perf_mmap_data * | 2331 | static struct perf_mmap_data * |
| 2324 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | 2332 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) |
| 2325 | { | 2333 | { |
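Note: the new helper above allocates the user page and each data page on the NUMA node backing the CPU the event is bound to, falling back to "no node preference" for per-task events (cpu == -1). As a standalone sketch of that pattern (this mirrors perf_mmap_alloc_page rather than adding anything to it; the paired free is shown for context):

    /*
     * Sketch: one zeroed, node-local page for a per-CPU ring buffer.
     * For cpu == -1 the node stays -1, which alloc_pages_node() treats
     * as "no node preference".
     */
    static void *alloc_local_zeroed_page(int cpu)
    {
            int node = (cpu == -1) ? -1 : cpu_to_node(cpu);
            struct page *page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);

            return page ? page_address(page) : NULL;
    }

    /* Paired release for an order-0 page obtained as above. */
    static void free_local_page(void *addr)
    {
            free_page((unsigned long)addr);
    }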
| @@ -2336,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | |||
| 2336 | if (!data) | 2344 | if (!data) |
| 2337 | goto fail; | 2345 | goto fail; |
| 2338 | 2346 | ||
| 2339 | data->user_page = (void *)get_zeroed_page(GFP_KERNEL); | 2347 | data->user_page = perf_mmap_alloc_page(event->cpu); |
| 2340 | if (!data->user_page) | 2348 | if (!data->user_page) |
| 2341 | goto fail_user_page; | 2349 | goto fail_user_page; |
| 2342 | 2350 | ||
| 2343 | for (i = 0; i < nr_pages; i++) { | 2351 | for (i = 0; i < nr_pages; i++) { |
| 2344 | data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); | 2352 | data->data_pages[i] = perf_mmap_alloc_page(event->cpu); |
| 2345 | if (!data->data_pages[i]) | 2353 | if (!data->data_pages[i]) |
| 2346 | goto fail_data_pages; | 2354 | goto fail_data_pages; |
| 2347 | } | 2355 | } |
| 2348 | 2356 | ||
| 2349 | data->data_order = 0; | ||
| 2350 | data->nr_pages = nr_pages; | 2357 | data->nr_pages = nr_pages; |
| 2351 | 2358 | ||
| 2352 | return data; | 2359 | return data; |
| @@ -2382,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) | |||
| 2382 | kfree(data); | 2389 | kfree(data); |
| 2383 | } | 2390 | } |
| 2384 | 2391 | ||
| 2392 | static inline int page_order(struct perf_mmap_data *data) | ||
| 2393 | { | ||
| 2394 | return 0; | ||
| 2395 | } | ||
| 2396 | |||
| 2385 | #else | 2397 | #else |
| 2386 | 2398 | ||
| 2387 | /* | 2399 | /* |
| @@ -2390,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) | |||
| 2390 | * Required for architectures that have d-cache aliasing issues. | 2402 | * Required for architectures that have d-cache aliasing issues. |
| 2391 | */ | 2403 | */ |
| 2392 | 2404 | ||
| 2405 | static inline int page_order(struct perf_mmap_data *data) | ||
| 2406 | { | ||
| 2407 | return data->page_order; | ||
| 2408 | } | ||
| 2409 | |||
| 2393 | static struct page * | 2410 | static struct page * |
| 2394 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | 2411 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) |
| 2395 | { | 2412 | { |
| 2396 | if (pgoff > (1UL << data->data_order)) | 2413 | if (pgoff > (1UL << page_order(data))) |
| 2397 | return NULL; | 2414 | return NULL; |
| 2398 | 2415 | ||
| 2399 | return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); | 2416 | return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); |
| @@ -2413,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work) | |||
| 2413 | int i, nr; | 2430 | int i, nr; |
| 2414 | 2431 | ||
| 2415 | data = container_of(work, struct perf_mmap_data, work); | 2432 | data = container_of(work, struct perf_mmap_data, work); |
| 2416 | nr = 1 << data->data_order; | 2433 | nr = 1 << page_order(data); |
| 2417 | 2434 | ||
| 2418 | base = data->user_page; | 2435 | base = data->user_page; |
| 2419 | for (i = 0; i < nr + 1; i++) | 2436 | for (i = 0; i < nr + 1; i++) |
| @@ -2452,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | |||
| 2452 | 2469 | ||
| 2453 | data->user_page = all_buf; | 2470 | data->user_page = all_buf; |
| 2454 | data->data_pages[0] = all_buf + PAGE_SIZE; | 2471 | data->data_pages[0] = all_buf + PAGE_SIZE; |
| 2455 | data->data_order = ilog2(nr_pages); | 2472 | data->page_order = ilog2(nr_pages); |
| 2456 | data->nr_pages = 1; | 2473 | data->nr_pages = 1; |
| 2457 | 2474 | ||
| 2458 | return data; | 2475 | return data; |
| @@ -2466,6 +2483,11 @@ fail: | |||
| 2466 | 2483 | ||
| 2467 | #endif | 2484 | #endif |
| 2468 | 2485 | ||
| 2486 | static unsigned long perf_data_size(struct perf_mmap_data *data) | ||
| 2487 | { | ||
| 2488 | return data->nr_pages << (PAGE_SHIFT + page_order(data)); | ||
| 2489 | } | ||
| 2490 | |||
| 2469 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 2491 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| 2470 | { | 2492 | { |
| 2471 | struct perf_event *event = vma->vm_file->private_data; | 2493 | struct perf_event *event = vma->vm_file->private_data; |
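Note: with data_order gone from the generic code, buffer geometry is expressed through page_order(): 0 for the default page-array layout, ilog2(nr_pages) for the vmalloc layout where the whole buffer is one contiguous area recorded as a single "page". The shared perf_data_size() above then covers both cases. A rough sketch of the arithmetic, with hypothetical numbers:

    /*
     * Size arithmetic, assuming PAGE_SHIFT == 12 (4 KiB pages).
     *
     * Page-array layout (no d-cache aliasing issues):
     *   nr_pages = 8, page_order() = 0
     *   size = 8 << (12 + 0) = 32 KiB across eight order-0 pages.
     *
     * Vmalloc layout (CONFIG_PERF_USE_VMALLOC):
     *   the same request is one contiguous region, stored as
     *   nr_pages = 1, page_order() = ilog2(8) = 3
     *   size = 1 << (12 + 3) = 32 KiB in a single order-3 "page".
     */
    static unsigned long example_data_size(unsigned long nr_pages, int order)
    {
            return nr_pages << (12 /* PAGE_SHIFT */ + order);
    }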
| @@ -2506,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) | |||
| 2506 | { | 2528 | { |
| 2507 | long max_size = perf_data_size(data); | 2529 | long max_size = perf_data_size(data); |
| 2508 | 2530 | ||
| 2509 | atomic_set(&data->lock, -1); | ||
| 2510 | |||
| 2511 | if (event->attr.watermark) { | 2531 | if (event->attr.watermark) { |
| 2512 | data->watermark = min_t(long, max_size, | 2532 | data->watermark = min_t(long, max_size, |
| 2513 | event->attr.wakeup_watermark); | 2533 | event->attr.wakeup_watermark); |
| @@ -2580,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 2580 | long user_extra, extra; | 2600 | long user_extra, extra; |
| 2581 | int ret = 0; | 2601 | int ret = 0; |
| 2582 | 2602 | ||
| 2603 | /* | ||
| 2604 | * Don't allow mmap() of inherited per-task counters. This would | ||
| 2605 | * create a performance issue due to all children writing to the | ||
| 2606 | * same buffer. | ||
| 2607 | */ | ||
| 2608 | if (event->cpu == -1 && event->attr.inherit) | ||
| 2609 | return -EINVAL; | ||
| 2610 | |||
| 2583 | if (!(vma->vm_flags & VM_SHARED)) | 2611 | if (!(vma->vm_flags & VM_SHARED)) |
| 2584 | return -EINVAL; | 2612 | return -EINVAL; |
| 2585 | 2613 | ||
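Note: from userspace, this check means an inherited per-task event can still be created and read, but mapping its ring buffer now fails with EINVAL. A hedged illustration (error handling trimmed; 4 KiB pages and 8 data pages assumed; the perf_event_open() wrapper is spelled out because glibc provides none):

    #include <linux/perf_event.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
            struct perf_event_attr attr = {
                    .type    = PERF_TYPE_SOFTWARE,
                    .config  = PERF_COUNT_SW_TASK_CLOCK,
                    .size    = sizeof(attr),
                    .inherit = 1,                   /* inherited ... */
            };
            int fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);

            /* ... per-task event: mmap() of its buffer is now refused. */
            void *buf = mmap(NULL, (1 + 8) * 4096, PROT_READ | PROT_WRITE,
                             MAP_SHARED, fd, 0);
            return buf == MAP_FAILED ? 0 : 1;
    }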
| @@ -2885,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
| 2885 | } | 2913 | } |
| 2886 | 2914 | ||
| 2887 | /* | 2915 | /* |
| 2888 | * Curious locking construct. | ||
| 2889 | * | ||
| 2890 | * We need to ensure a later event_id doesn't publish a head when a former | 2916 | * We need to ensure a later event_id doesn't publish a head when a former |
| 2891 | * event_id isn't done writing. However since we need to deal with NMIs we | 2917 | * event isn't done writing. However since we need to deal with NMIs we |
| 2892 | * cannot fully serialize things. | 2918 | * cannot fully serialize things. |
| 2893 | * | 2919 | * |
| 2894 | * What we do is serialize between CPUs so we only have to deal with NMI | ||
| 2895 | * nesting on a single CPU. | ||
| 2896 | * | ||
| 2897 | * We only publish the head (and generate a wakeup) when the outer-most | 2920 | * We only publish the head (and generate a wakeup) when the outer-most |
| 2898 | * event_id completes. | 2921 | * event completes. |
| 2899 | */ | 2922 | */ |
| 2900 | static void perf_output_lock(struct perf_output_handle *handle) | 2923 | static void perf_output_get_handle(struct perf_output_handle *handle) |
| 2901 | { | 2924 | { |
| 2902 | struct perf_mmap_data *data = handle->data; | 2925 | struct perf_mmap_data *data = handle->data; |
| 2903 | int cur, cpu = get_cpu(); | ||
| 2904 | |||
| 2905 | handle->locked = 0; | ||
| 2906 | |||
| 2907 | for (;;) { | ||
| 2908 | cur = atomic_cmpxchg(&data->lock, -1, cpu); | ||
| 2909 | if (cur == -1) { | ||
| 2910 | handle->locked = 1; | ||
| 2911 | break; | ||
| 2912 | } | ||
| 2913 | if (cur == cpu) | ||
| 2914 | break; | ||
| 2915 | 2926 | ||
| 2916 | cpu_relax(); | 2927 | preempt_disable(); |
| 2917 | } | 2928 | local_inc(&data->nest); |
| 2929 | handle->wakeup = local_read(&data->wakeup); | ||
| 2918 | } | 2930 | } |
| 2919 | 2931 | ||
| 2920 | static void perf_output_unlock(struct perf_output_handle *handle) | 2932 | static void perf_output_put_handle(struct perf_output_handle *handle) |
| 2921 | { | 2933 | { |
| 2922 | struct perf_mmap_data *data = handle->data; | 2934 | struct perf_mmap_data *data = handle->data; |
| 2923 | unsigned long head; | 2935 | unsigned long head; |
| 2924 | int cpu; | ||
| 2925 | |||
| 2926 | data->done_head = data->head; | ||
| 2927 | |||
| 2928 | if (!handle->locked) | ||
| 2929 | goto out; | ||
| 2930 | 2936 | ||
| 2931 | again: | 2937 | again: |
| 2932 | /* | 2938 | head = local_read(&data->head); |
| 2933 | * The xchg implies a full barrier that ensures all writes are done | ||
| 2934 | * before we publish the new head, matched by a rmb() in userspace when | ||
| 2935 | * reading this position. | ||
| 2936 | */ | ||
| 2937 | while ((head = atomic_long_xchg(&data->done_head, 0))) | ||
| 2938 | data->user_page->data_head = head; | ||
| 2939 | 2939 | ||
| 2940 | /* | 2940 | /* |
| 2941 | * NMI can happen here, which means we can miss a done_head update. | 2941 | * IRQ/NMI can happen here, which means we can miss a head update. |
| 2942 | */ | 2942 | */ |
| 2943 | 2943 | ||
| 2944 | cpu = atomic_xchg(&data->lock, -1); | 2944 | if (!local_dec_and_test(&data->nest)) |
| 2945 | WARN_ON_ONCE(cpu != smp_processor_id()); | 2945 | goto out; |
| 2946 | 2946 | ||
| 2947 | /* | 2947 | /* |
| 2948 | * Therefore we have to validate we did not indeed do so. | 2948 | * Publish the known good head. Rely on the full barrier implied |
| 2949 | * by atomic_dec_and_test() order the data->head read and this | ||
| 2950 | * write. | ||
| 2949 | */ | 2951 | */ |
| 2950 | if (unlikely(atomic_long_read(&data->done_head))) { | 2952 | data->user_page->data_head = head; |
| 2951 | /* | ||
| 2952 | * Since we had it locked, we can lock it again. | ||
| 2953 | */ | ||
| 2954 | while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) | ||
| 2955 | cpu_relax(); | ||
| 2956 | 2953 | ||
| 2954 | /* | ||
| 2955 | * Now check if we missed an update, rely on the (compiler) | ||
| 2956 | * barrier in atomic_dec_and_test() to re-read data->head. | ||
| 2957 | */ | ||
| 2958 | if (unlikely(head != local_read(&data->head))) { | ||
| 2959 | local_inc(&data->nest); | ||
| 2957 | goto again; | 2960 | goto again; |
| 2958 | } | 2961 | } |
| 2959 | 2962 | ||
| 2960 | if (atomic_xchg(&data->wakeup, 0)) | 2963 | if (handle->wakeup != local_read(&data->wakeup)) |
| 2961 | perf_output_wakeup(handle); | 2964 | perf_output_wakeup(handle); |
| 2962 | out: | 2965 | |
| 2963 | put_cpu(); | 2966 | out: |
| 2967 | preempt_enable(); | ||
| 2964 | } | 2968 | } |
| 2965 | 2969 | ||
| 2966 | void perf_output_copy(struct perf_output_handle *handle, | 2970 | __always_inline void perf_output_copy(struct perf_output_handle *handle, |
| 2967 | const void *buf, unsigned int len) | 2971 | const void *buf, unsigned int len) |
| 2968 | { | 2972 | { |
| 2969 | unsigned int pages_mask; | ||
| 2970 | unsigned long offset; | ||
| 2971 | unsigned int size; | ||
| 2972 | void **pages; | ||
| 2973 | |||
| 2974 | offset = handle->offset; | ||
| 2975 | pages_mask = handle->data->nr_pages - 1; | ||
| 2976 | pages = handle->data->data_pages; | ||
| 2977 | |||
| 2978 | do { | 2973 | do { |
| 2979 | unsigned long page_offset; | 2974 | unsigned long size = min_t(unsigned long, handle->size, len); |
| 2980 | unsigned long page_size; | ||
| 2981 | int nr; | ||
| 2982 | 2975 | ||
| 2983 | nr = (offset >> PAGE_SHIFT) & pages_mask; | 2976 | memcpy(handle->addr, buf, size); |
| 2984 | page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); | ||
| 2985 | page_offset = offset & (page_size - 1); | ||
| 2986 | size = min_t(unsigned int, page_size - page_offset, len); | ||
| 2987 | 2977 | ||
| 2988 | memcpy(pages[nr] + page_offset, buf, size); | 2978 | len -= size; |
| 2979 | handle->addr += size; | ||
| 2980 | handle->size -= size; | ||
| 2981 | if (!handle->size) { | ||
| 2982 | struct perf_mmap_data *data = handle->data; | ||
| 2989 | 2983 | ||
| 2990 | len -= size; | 2984 | handle->page++; |
| 2991 | buf += size; | 2985 | handle->page &= data->nr_pages - 1; |
| 2992 | offset += size; | 2986 | handle->addr = data->data_pages[handle->page]; |
| 2987 | handle->size = PAGE_SIZE << page_order(data); | ||
| 2988 | } | ||
| 2993 | } while (len); | 2989 | } while (len); |
| 2994 | |||
| 2995 | handle->offset = offset; | ||
| 2996 | |||
| 2997 | /* | ||
| 2998 | * Check we didn't copy past our reservation window, taking the | ||
| 2999 | * possible unsigned int wrap into account. | ||
| 3000 | */ | ||
| 3001 | WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); | ||
| 3002 | } | 2990 | } |
| 3003 | 2991 | ||
| 3004 | int perf_output_begin(struct perf_output_handle *handle, | 2992 | int perf_output_begin(struct perf_output_handle *handle, |
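Note: the counterpart of this publication scheme lives in userspace. The kernel only moves data_head forward once the outermost writer is done, and the consumer reports progress back through data_tail, which perf_output_begin() reads under ACCESS_ONCE()/smp_rmb() further down. A simplified sketch of the consumer side of that contract (assumes the mmap'ed control page plus a power-of-two data area, and ignores records that wrap past the end of the buffer):

    /*
     * 'meta' is the first mmap'ed page (struct perf_event_mmap_page),
     * 'data' the pages after it, 'size' their total length (power of two).
     */
    static void drain(struct perf_event_mmap_page *meta, char *data,
                      unsigned long size)
    {
            unsigned long head = meta->data_head;   /* written by the kernel */
            unsigned long tail = meta->data_tail;   /* written by us */

            __sync_synchronize();   /* pairs with the kernel's publish barrier */

            while (tail != head) {
                    struct perf_event_header *hdr =
                            (void *)(data + (tail & (size - 1)));
                    /* ... consume one record ... */
                    tail += hdr->size;
            }

            __sync_synchronize();   /* finish reading before releasing the space */
            meta->data_tail = tail;
    }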
| @@ -3036,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3036 | handle->sample = sample; | 3024 | handle->sample = sample; |
| 3037 | 3025 | ||
| 3038 | if (!data->nr_pages) | 3026 | if (!data->nr_pages) |
| 3039 | goto fail; | 3027 | goto out; |
| 3040 | 3028 | ||
| 3041 | have_lost = atomic_read(&data->lost); | 3029 | have_lost = local_read(&data->lost); |
| 3042 | if (have_lost) | 3030 | if (have_lost) |
| 3043 | size += sizeof(lost_event); | 3031 | size += sizeof(lost_event); |
| 3044 | 3032 | ||
| 3045 | perf_output_lock(handle); | 3033 | perf_output_get_handle(handle); |
| 3046 | 3034 | ||
| 3047 | do { | 3035 | do { |
| 3048 | /* | 3036 | /* |
| @@ -3052,24 +3040,28 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3052 | */ | 3040 | */ |
| 3053 | tail = ACCESS_ONCE(data->user_page->data_tail); | 3041 | tail = ACCESS_ONCE(data->user_page->data_tail); |
| 3054 | smp_rmb(); | 3042 | smp_rmb(); |
| 3055 | offset = head = atomic_long_read(&data->head); | 3043 | offset = head = local_read(&data->head); |
| 3056 | head += size; | 3044 | head += size; |
| 3057 | if (unlikely(!perf_output_space(data, tail, offset, head))) | 3045 | if (unlikely(!perf_output_space(data, tail, offset, head))) |
| 3058 | goto fail; | 3046 | goto fail; |
| 3059 | } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); | 3047 | } while (local_cmpxchg(&data->head, offset, head) != offset); |
| 3060 | 3048 | ||
| 3061 | handle->offset = offset; | 3049 | if (head - local_read(&data->wakeup) > data->watermark) |
| 3062 | handle->head = head; | 3050 | local_add(data->watermark, &data->wakeup); |
| 3063 | 3051 | ||
| 3064 | if (head - tail > data->watermark) | 3052 | handle->page = offset >> (PAGE_SHIFT + page_order(data)); |
| 3065 | atomic_set(&data->wakeup, 1); | 3053 | handle->page &= data->nr_pages - 1; |
| 3054 | handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); | ||
| 3055 | handle->addr = data->data_pages[handle->page]; | ||
| 3056 | handle->addr += handle->size; | ||
| 3057 | handle->size = (PAGE_SIZE << page_order(data)) - handle->size; | ||
| 3066 | 3058 | ||
| 3067 | if (have_lost) { | 3059 | if (have_lost) { |
| 3068 | lost_event.header.type = PERF_RECORD_LOST; | 3060 | lost_event.header.type = PERF_RECORD_LOST; |
| 3069 | lost_event.header.misc = 0; | 3061 | lost_event.header.misc = 0; |
| 3070 | lost_event.header.size = sizeof(lost_event); | 3062 | lost_event.header.size = sizeof(lost_event); |
| 3071 | lost_event.id = event->id; | 3063 | lost_event.id = event->id; |
| 3072 | lost_event.lost = atomic_xchg(&data->lost, 0); | 3064 | lost_event.lost = local_xchg(&data->lost, 0); |
| 3073 | 3065 | ||
| 3074 | perf_output_put(handle, lost_event); | 3066 | perf_output_put(handle, lost_event); |
| 3075 | } | 3067 | } |
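Note: the new handle fields replace the old offset bookkeeping: from the reserved offset, perf_output_begin() derives which sub-page the record starts in, where inside that page, and how many bytes remain before perf_output_copy() has to hop to the next page. Worked through with hypothetical numbers:

    /*
     * Assume PAGE_SIZE 4096, page_order(data) == 0, nr_pages == 8,
     * and a reserved offset of 0x2345:
     *
     *   handle->page = 0x2345 >> 12          = 2  (then & 7, still 2)
     *   start within the page                = 0x2345 & 0xfff = 0x345
     *   handle->addr = data_pages[2] + 0x345
     *   handle->size = 0x1000 - 0x345        = 0xcbb bytes left in this page
     *
     * perf_output_copy() then writes at handle->addr, and when handle->size
     * reaches zero it advances handle->page (mod nr_pages) and refills
     * handle->size with a whole PAGE_SIZE << page_order(data).
     */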
| @@ -3077,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3077 | return 0; | 3069 | return 0; |
| 3078 | 3070 | ||
| 3079 | fail: | 3071 | fail: |
| 3080 | atomic_inc(&data->lost); | 3072 | local_inc(&data->lost); |
| 3081 | perf_output_unlock(handle); | 3073 | perf_output_put_handle(handle); |
| 3082 | out: | 3074 | out: |
| 3083 | rcu_read_unlock(); | 3075 | rcu_read_unlock(); |
| 3084 | 3076 | ||
| @@ -3093,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle) | |||
| 3093 | int wakeup_events = event->attr.wakeup_events; | 3085 | int wakeup_events = event->attr.wakeup_events; |
| 3094 | 3086 | ||
| 3095 | if (handle->sample && wakeup_events) { | 3087 | if (handle->sample && wakeup_events) { |
| 3096 | int events = atomic_inc_return(&data->events); | 3088 | int events = local_inc_return(&data->events); |
| 3097 | if (events >= wakeup_events) { | 3089 | if (events >= wakeup_events) { |
| 3098 | atomic_sub(wakeup_events, &data->events); | 3090 | local_sub(wakeup_events, &data->events); |
| 3099 | atomic_set(&data->wakeup, 1); | 3091 | local_inc(&data->wakeup); |
| 3100 | } | 3092 | } |
| 3101 | } | 3093 | } |
| 3102 | 3094 | ||
| 3103 | perf_output_unlock(handle); | 3095 | perf_output_put_handle(handle); |
| 3104 | rcu_read_unlock(); | 3096 | rcu_read_unlock(); |
| 3105 | } | 3097 | } |
| 3106 | 3098 | ||
| @@ -3436,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 3436 | { | 3428 | { |
| 3437 | struct perf_output_handle handle; | 3429 | struct perf_output_handle handle; |
| 3438 | struct task_struct *task = task_event->task; | 3430 | struct task_struct *task = task_event->task; |
| 3439 | unsigned long flags; | ||
| 3440 | int size, ret; | 3431 | int size, ret; |
| 3441 | 3432 | ||
| 3442 | /* | ||
| 3443 | * If this CPU attempts to acquire an rq lock held by a CPU spinning | ||
| 3444 | * in perf_output_lock() from interrupt context, it's game over. | ||
| 3445 | */ | ||
| 3446 | local_irq_save(flags); | ||
| 3447 | |||
| 3448 | size = task_event->event_id.header.size; | 3433 | size = task_event->event_id.header.size; |
| 3449 | ret = perf_output_begin(&handle, event, size, 0, 0); | 3434 | ret = perf_output_begin(&handle, event, size, 0, 0); |
| 3450 | 3435 | ||
| 3451 | if (ret) { | 3436 | if (ret) |
| 3452 | local_irq_restore(flags); | ||
| 3453 | return; | 3437 | return; |
| 3454 | } | ||
| 3455 | 3438 | ||
| 3456 | task_event->event_id.pid = perf_event_pid(event, task); | 3439 | task_event->event_id.pid = perf_event_pid(event, task); |
| 3457 | task_event->event_id.ppid = perf_event_pid(event, current); | 3440 | task_event->event_id.ppid = perf_event_pid(event, current); |
| @@ -3462,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 3462 | perf_output_put(&handle, task_event->event_id); | 3445 | perf_output_put(&handle, task_event->event_id); |
| 3463 | 3446 | ||
| 3464 | perf_output_end(&handle); | 3447 | perf_output_end(&handle); |
| 3465 | local_irq_restore(flags); | ||
| 3466 | } | 3448 | } |
| 3467 | 3449 | ||
| 3468 | static int perf_event_task_match(struct perf_event *event) | 3450 | static int perf_event_task_match(struct perf_event *event) |
| @@ -4020,9 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
| 4020 | perf_swevent_overflow(event, 0, nmi, data, regs); | 4002 | perf_swevent_overflow(event, 0, nmi, data, regs); |
| 4021 | } | 4003 | } |
| 4022 | 4004 | ||
| 4023 | static int perf_tp_event_match(struct perf_event *event, | ||
| 4024 | struct perf_sample_data *data); | ||
| 4025 | |||
| 4026 | static int perf_exclude_event(struct perf_event *event, | 4005 | static int perf_exclude_event(struct perf_event *event, |
| 4027 | struct pt_regs *regs) | 4006 | struct pt_regs *regs) |
| 4028 | { | 4007 | { |
| @@ -4052,10 +4031,6 @@ static int perf_swevent_match(struct perf_event *event, | |||
| 4052 | if (perf_exclude_event(event, regs)) | 4031 | if (perf_exclude_event(event, regs)) |
| 4053 | return 0; | 4032 | return 0; |
| 4054 | 4033 | ||
| 4055 | if (event->attr.type == PERF_TYPE_TRACEPOINT && | ||
| 4056 | !perf_tp_event_match(event, data)) | ||
| 4057 | return 0; | ||
| 4058 | |||
| 4059 | return 1; | 4034 | return 1; |
| 4060 | } | 4035 | } |
| 4061 | 4036 | ||
| @@ -4066,19 +4041,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id) | |||
| 4066 | return hash_64(val, SWEVENT_HLIST_BITS); | 4041 | return hash_64(val, SWEVENT_HLIST_BITS); |
| 4067 | } | 4042 | } |
| 4068 | 4043 | ||
| 4069 | static struct hlist_head * | 4044 | static inline struct hlist_head * |
| 4070 | find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) | 4045 | __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) |
| 4071 | { | 4046 | { |
| 4072 | u64 hash; | 4047 | u64 hash = swevent_hash(type, event_id); |
| 4073 | struct swevent_hlist *hlist; | 4048 | |
| 4049 | return &hlist->heads[hash]; | ||
| 4050 | } | ||
| 4074 | 4051 | ||
| 4075 | hash = swevent_hash(type, event_id); | 4052 | /* For the read side: events when they trigger */ |
| 4053 | static inline struct hlist_head * | ||
| 4054 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | ||
| 4055 | { | ||
| 4056 | struct swevent_hlist *hlist; | ||
| 4076 | 4057 | ||
| 4077 | hlist = rcu_dereference(ctx->swevent_hlist); | 4058 | hlist = rcu_dereference(ctx->swevent_hlist); |
| 4078 | if (!hlist) | 4059 | if (!hlist) |
| 4079 | return NULL; | 4060 | return NULL; |
| 4080 | 4061 | ||
| 4081 | return &hlist->heads[hash]; | 4062 | return __find_swevent_head(hlist, type, event_id); |
| 4063 | } | ||
| 4064 | |||
| 4065 | /* For the event head insertion and removal in the hlist */ | ||
| 4066 | static inline struct hlist_head * | ||
| 4067 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | ||
| 4068 | { | ||
| 4069 | struct swevent_hlist *hlist; | ||
| 4070 | u32 event_id = event->attr.config; | ||
| 4071 | u64 type = event->attr.type; | ||
| 4072 | |||
| 4073 | /* | ||
| 4074 | * Event scheduling is always serialized against hlist allocation | ||
| 4075 | * and release. Which makes the protected version suitable here. | ||
| 4076 | * The context lock guarantees that. | ||
| 4077 | */ | ||
| 4078 | hlist = rcu_dereference_protected(ctx->swevent_hlist, | ||
| 4079 | lockdep_is_held(&event->ctx->lock)); | ||
| 4080 | if (!hlist) | ||
| 4081 | return NULL; | ||
| 4082 | |||
| 4083 | return __find_swevent_head(hlist, type, event_id); | ||
| 4082 | } | 4084 | } |
| 4083 | 4085 | ||
| 4084 | static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | 4086 | static void do_perf_sw_event(enum perf_type_id type, u32 event_id, |
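Note: the lookup is now split by context. The event-fire path runs under rcu_read_lock() and uses rcu_dereference(); insertion and removal run with the context lock held, where rcu_dereference_protected() with a lockdep expression documents (and checks) that no RCU read lock is needed. The same split in miniature, for an arbitrary RCU-published table (kernel-style fragment, hypothetical names):

    struct table {
            struct hlist_head heads[16];
            struct rcu_head rcu_head;
    };

    static struct table *tbl;
    static DEFINE_MUTEX(tbl_mutex);

    /* Read side: only needs rcu_read_lock(). */
    static struct hlist_head *lookup_rcu(u32 key)
    {
            struct table *t = rcu_dereference(tbl);

            return t ? &t->heads[hash_32(key, 4)] : NULL;
    }

    /* Update side: serialized by tbl_mutex, no RCU read lock required. */
    static struct hlist_head *lookup_locked(u32 key)
    {
            struct table *t = rcu_dereference_protected(tbl,
                                            lockdep_is_held(&tbl_mutex));

            return t ? &t->heads[hash_32(key, 4)] : NULL;
    }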
| @@ -4095,7 +4097,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
| 4095 | 4097 | ||
| 4096 | rcu_read_lock(); | 4098 | rcu_read_lock(); |
| 4097 | 4099 | ||
| 4098 | head = find_swevent_head(cpuctx, type, event_id); | 4100 | head = find_swevent_head_rcu(cpuctx, type, event_id); |
| 4099 | 4101 | ||
| 4100 | if (!head) | 4102 | if (!head) |
| 4101 | goto end; | 4103 | goto end; |
| @@ -4110,7 +4112,7 @@ end: | |||
| 4110 | 4112 | ||
| 4111 | int perf_swevent_get_recursion_context(void) | 4113 | int perf_swevent_get_recursion_context(void) |
| 4112 | { | 4114 | { |
| 4113 | struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); | 4115 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
| 4114 | int rctx; | 4116 | int rctx; |
| 4115 | 4117 | ||
| 4116 | if (in_nmi()) | 4118 | if (in_nmi()) |
| @@ -4122,10 +4124,8 @@ int perf_swevent_get_recursion_context(void) | |||
| 4122 | else | 4124 | else |
| 4123 | rctx = 0; | 4125 | rctx = 0; |
| 4124 | 4126 | ||
| 4125 | if (cpuctx->recursion[rctx]) { | 4127 | if (cpuctx->recursion[rctx]) |
| 4126 | put_cpu_var(perf_cpu_context); | ||
| 4127 | return -1; | 4128 | return -1; |
| 4128 | } | ||
| 4129 | 4129 | ||
| 4130 | cpuctx->recursion[rctx]++; | 4130 | cpuctx->recursion[rctx]++; |
| 4131 | barrier(); | 4131 | barrier(); |
| @@ -4139,7 +4139,6 @@ void perf_swevent_put_recursion_context(int rctx) | |||
| 4139 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4139 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
| 4140 | barrier(); | 4140 | barrier(); |
| 4141 | cpuctx->recursion[rctx]--; | 4141 | cpuctx->recursion[rctx]--; |
| 4142 | put_cpu_var(perf_cpu_context); | ||
| 4143 | } | 4142 | } |
| 4144 | EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); | 4143 | EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); |
| 4145 | 4144 | ||
| @@ -4150,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, | |||
| 4150 | struct perf_sample_data data; | 4149 | struct perf_sample_data data; |
| 4151 | int rctx; | 4150 | int rctx; |
| 4152 | 4151 | ||
| 4152 | preempt_disable_notrace(); | ||
| 4153 | rctx = perf_swevent_get_recursion_context(); | 4153 | rctx = perf_swevent_get_recursion_context(); |
| 4154 | if (rctx < 0) | 4154 | if (rctx < 0) |
| 4155 | return; | 4155 | return; |
| @@ -4159,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, | |||
| 4159 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); | 4159 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); |
| 4160 | 4160 | ||
| 4161 | perf_swevent_put_recursion_context(rctx); | 4161 | perf_swevent_put_recursion_context(rctx); |
| 4162 | preempt_enable_notrace(); | ||
| 4162 | } | 4163 | } |
| 4163 | 4164 | ||
| 4164 | static void perf_swevent_read(struct perf_event *event) | 4165 | static void perf_swevent_read(struct perf_event *event) |
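Note: since the recursion helpers no longer take and release the per-CPU variable themselves, they also no longer disable preemption; a caller has to bracket them the way __perf_sw_event now does. The expected calling pattern, spelled out as a sketch:

    static void emit_sw_event_example(void)
    {
            int rctx;

            preempt_disable_notrace();
            rctx = perf_swevent_get_recursion_context();
            if (rctx >= 0) {
                    /* ... build perf_sample_data, call do_perf_sw_event() ... */
                    perf_swevent_put_recursion_context(rctx);
            }
            preempt_enable_notrace();
    }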
| @@ -4178,7 +4179,7 @@ static int perf_swevent_enable(struct perf_event *event) | |||
| 4178 | perf_swevent_set_period(event); | 4179 | perf_swevent_set_period(event); |
| 4179 | } | 4180 | } |
| 4180 | 4181 | ||
| 4181 | head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); | 4182 | head = find_swevent_head(cpuctx, event); |
| 4182 | if (WARN_ON_ONCE(!head)) | 4183 | if (WARN_ON_ONCE(!head)) |
| 4183 | return -EINVAL; | 4184 | return -EINVAL; |
| 4184 | 4185 | ||
| @@ -4366,6 +4367,14 @@ static const struct pmu perf_ops_task_clock = { | |||
| 4366 | .read = task_clock_perf_event_read, | 4367 | .read = task_clock_perf_event_read, |
| 4367 | }; | 4368 | }; |
| 4368 | 4369 | ||
| 4370 | /* Deref the hlist from the update side */ | ||
| 4371 | static inline struct swevent_hlist * | ||
| 4372 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) | ||
| 4373 | { | ||
| 4374 | return rcu_dereference_protected(cpuctx->swevent_hlist, | ||
| 4375 | lockdep_is_held(&cpuctx->hlist_mutex)); | ||
| 4376 | } | ||
| 4377 | |||
| 4369 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | 4378 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) |
| 4370 | { | 4379 | { |
| 4371 | struct swevent_hlist *hlist; | 4380 | struct swevent_hlist *hlist; |
| @@ -4376,12 +4385,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | |||
| 4376 | 4385 | ||
| 4377 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) | 4386 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) |
| 4378 | { | 4387 | { |
| 4379 | struct swevent_hlist *hlist; | 4388 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); |
| 4380 | 4389 | ||
| 4381 | if (!cpuctx->swevent_hlist) | 4390 | if (!hlist) |
| 4382 | return; | 4391 | return; |
| 4383 | 4392 | ||
| 4384 | hlist = cpuctx->swevent_hlist; | ||
| 4385 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); | 4393 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); |
| 4386 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 4394 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); |
| 4387 | } | 4395 | } |
| @@ -4418,7 +4426,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | |||
| 4418 | 4426 | ||
| 4419 | mutex_lock(&cpuctx->hlist_mutex); | 4427 | mutex_lock(&cpuctx->hlist_mutex); |
| 4420 | 4428 | ||
| 4421 | if (!cpuctx->swevent_hlist && cpu_online(cpu)) { | 4429 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { |
| 4422 | struct swevent_hlist *hlist; | 4430 | struct swevent_hlist *hlist; |
| 4423 | 4431 | ||
| 4424 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 4432 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
| @@ -4467,10 +4475,46 @@ static int swevent_hlist_get(struct perf_event *event) | |||
| 4467 | 4475 | ||
| 4468 | #ifdef CONFIG_EVENT_TRACING | 4476 | #ifdef CONFIG_EVENT_TRACING |
| 4469 | 4477 | ||
| 4470 | void perf_tp_event(int event_id, u64 addr, u64 count, void *record, | 4478 | static const struct pmu perf_ops_tracepoint = { |
| 4471 | int entry_size, struct pt_regs *regs) | 4479 | .enable = perf_trace_enable, |
| 4480 | .disable = perf_trace_disable, | ||
| 4481 | .read = perf_swevent_read, | ||
| 4482 | .unthrottle = perf_swevent_unthrottle, | ||
| 4483 | }; | ||
| 4484 | |||
| 4485 | static int perf_tp_filter_match(struct perf_event *event, | ||
| 4486 | struct perf_sample_data *data) | ||
| 4487 | { | ||
| 4488 | void *record = data->raw->data; | ||
| 4489 | |||
| 4490 | if (likely(!event->filter) || filter_match_preds(event->filter, record)) | ||
| 4491 | return 1; | ||
| 4492 | return 0; | ||
| 4493 | } | ||
| 4494 | |||
| 4495 | static int perf_tp_event_match(struct perf_event *event, | ||
| 4496 | struct perf_sample_data *data, | ||
| 4497 | struct pt_regs *regs) | ||
| 4498 | { | ||
| 4499 | /* | ||
| 4500 | * All tracepoints are from kernel-space. | ||
| 4501 | */ | ||
| 4502 | if (event->attr.exclude_kernel) | ||
| 4503 | return 0; | ||
| 4504 | |||
| 4505 | if (!perf_tp_filter_match(event, data)) | ||
| 4506 | return 0; | ||
| 4507 | |||
| 4508 | return 1; | ||
| 4509 | } | ||
| 4510 | |||
| 4511 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | ||
| 4512 | struct pt_regs *regs, struct hlist_head *head) | ||
| 4472 | { | 4513 | { |
| 4473 | struct perf_sample_data data; | 4514 | struct perf_sample_data data; |
| 4515 | struct perf_event *event; | ||
| 4516 | struct hlist_node *node; | ||
| 4517 | |||
| 4474 | struct perf_raw_record raw = { | 4518 | struct perf_raw_record raw = { |
| 4475 | .size = entry_size, | 4519 | .size = entry_size, |
| 4476 | .data = record, | 4520 | .data = record, |
| @@ -4479,26 +4523,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, | |||
| 4479 | perf_sample_data_init(&data, addr); | 4523 | perf_sample_data_init(&data, addr); |
| 4480 | data.raw = &raw; | 4524 | data.raw = &raw; |
| 4481 | 4525 | ||
| 4482 | /* Trace events already protected against recursion */ | 4526 | rcu_read_lock(); |
| 4483 | do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, | 4527 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
| 4484 | &data, regs); | 4528 | if (perf_tp_event_match(event, &data, regs)) |
| 4529 | perf_swevent_add(event, count, 1, &data, regs); | ||
| 4530 | } | ||
| 4531 | rcu_read_unlock(); | ||
| 4485 | } | 4532 | } |
| 4486 | EXPORT_SYMBOL_GPL(perf_tp_event); | 4533 | EXPORT_SYMBOL_GPL(perf_tp_event); |
| 4487 | 4534 | ||
| 4488 | static int perf_tp_event_match(struct perf_event *event, | ||
| 4489 | struct perf_sample_data *data) | ||
| 4490 | { | ||
| 4491 | void *record = data->raw->data; | ||
| 4492 | |||
| 4493 | if (likely(!event->filter) || filter_match_preds(event->filter, record)) | ||
| 4494 | return 1; | ||
| 4495 | return 0; | ||
| 4496 | } | ||
| 4497 | |||
| 4498 | static void tp_perf_event_destroy(struct perf_event *event) | 4535 | static void tp_perf_event_destroy(struct perf_event *event) |
| 4499 | { | 4536 | { |
| 4500 | perf_trace_disable(event->attr.config); | 4537 | perf_trace_destroy(event); |
| 4501 | swevent_hlist_put(event); | ||
| 4502 | } | 4538 | } |
| 4503 | 4539 | ||
| 4504 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4540 | static const struct pmu *tp_perf_event_init(struct perf_event *event) |
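Note: with the software-event hash table gone from this path, the tracepoint side (the macro-generated handlers in kernel/trace) is now expected to hand perf_tp_event() the hlist of perf events attached to that tracepoint on the current CPU. A heavily simplified, hypothetical caller to show the new contract; the real handler is generated by the TRACE_EVENT machinery and also manages the raw-record buffer and the recursion context:

    /*
     * Hypothetical tracepoint handler. 'head' would be the per-CPU list of
     * perf events attached to this tracepoint, maintained by
     * perf_trace_enable()/perf_trace_disable().
     */
    static void my_tracepoint_perf_handler(void *raw, int size,
                                           struct pt_regs *regs,
                                           struct hlist_head *head)
    {
            if (hlist_empty(head))
                    return;

            /* one count per hit, no address associated with the sample */
            perf_tp_event(0, 1, raw, size, regs, head);
    }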
| @@ -4514,17 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) | |||
| 4514 | !capable(CAP_SYS_ADMIN)) | 4550 | !capable(CAP_SYS_ADMIN)) |
| 4515 | return ERR_PTR(-EPERM); | 4551 | return ERR_PTR(-EPERM); |
| 4516 | 4552 | ||
| 4517 | if (perf_trace_enable(event->attr.config)) | 4553 | err = perf_trace_init(event); |
| 4554 | if (err) | ||
| 4518 | return NULL; | 4555 | return NULL; |
| 4519 | 4556 | ||
| 4520 | event->destroy = tp_perf_event_destroy; | 4557 | event->destroy = tp_perf_event_destroy; |
| 4521 | err = swevent_hlist_get(event); | ||
| 4522 | if (err) { | ||
| 4523 | perf_trace_disable(event->attr.config); | ||
| 4524 | return ERR_PTR(err); | ||
| 4525 | } | ||
| 4526 | 4558 | ||
| 4527 | return &perf_ops_generic; | 4559 | return &perf_ops_tracepoint; |
| 4528 | } | 4560 | } |
| 4529 | 4561 | ||
| 4530 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4562 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
| @@ -4552,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 4552 | 4584 | ||
| 4553 | #else | 4585 | #else |
| 4554 | 4586 | ||
| 4555 | static int perf_tp_event_match(struct perf_event *event, | ||
| 4556 | struct perf_sample_data *data) | ||
| 4557 | { | ||
| 4558 | return 1; | ||
| 4559 | } | ||
| 4560 | |||
| 4561 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4587 | static const struct pmu *tp_perf_event_init(struct perf_event *event) |
| 4562 | { | 4588 | { |
| 4563 | return NULL; | 4589 | return NULL; |
| @@ -4894,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd) | |||
| 4894 | int fput_needed = 0; | 4920 | int fput_needed = 0; |
| 4895 | int ret = -EINVAL; | 4921 | int ret = -EINVAL; |
| 4896 | 4922 | ||
| 4923 | /* | ||
| 4924 | * Don't allow output of inherited per-task events. This would | ||
| 4925 | * create performance issues due to cross cpu access. | ||
| 4926 | */ | ||
| 4927 | if (event->cpu == -1 && event->attr.inherit) | ||
| 4928 | return -EINVAL; | ||
| 4929 | |||
| 4897 | if (!output_fd) | 4930 | if (!output_fd) |
| 4898 | goto set; | 4931 | goto set; |
| 4899 | 4932 | ||
| @@ -4914,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd) | |||
| 4914 | if (event->data) | 4947 | if (event->data) |
| 4915 | goto out; | 4948 | goto out; |
| 4916 | 4949 | ||
| 4950 | /* | ||
| 4951 | * Don't allow cross-cpu buffers | ||
| 4952 | */ | ||
| 4953 | if (output_event->cpu != event->cpu) | ||
| 4954 | goto out; | ||
| 4955 | |||
| 4956 | /* | ||
| 4957 | * If its not a per-cpu buffer, it must be the same task. | ||
| 4958 | */ | ||
| 4959 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | ||
| 4960 | goto out; | ||
| 4961 | |||
| 4917 | atomic_long_inc(&output_file->f_count); | 4962 | atomic_long_inc(&output_file->f_count); |
| 4918 | 4963 | ||
| 4919 | set: | 4964 | set: |
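Note: together with the mmap() check earlier in the patch, these constraints shape how PERF_EVENT_IOC_SET_OUTPUT can be used: the redirecting event must live on the same CPU as the buffer owner, and a per-task (cpu == -1) buffer owner must share the same task context. A hedged userspace fragment of the accepted case, reusing the perf_event_open() wrapper from the earlier sketch:

    /* Two per-CPU events on CPU 0; the second reuses the first one's buffer. */
    struct perf_event_attr attr = {
            .type          = PERF_TYPE_SOFTWARE,
            .config        = PERF_COUNT_SW_CONTEXT_SWITCHES,
            .size          = sizeof(attr),
            .sample_period = 1,
    };
    int fd_a = perf_event_open(&attr, -1 /* all tasks */, 0 /* cpu 0 */, -1, 0);
    int fd_b = perf_event_open(&attr, -1, 0, -1, 0);

    void *buf = mmap(NULL, (1 + 8) * 4096, PROT_READ | PROT_WRITE,
                     MAP_SHARED, fd_a, 0);

    /* Allowed: same CPU. Would now fail if fd_b were bound to another CPU. */
    ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a);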
