diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2010-05-27 06:54:41 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-05-31 02:46:08 -0400 |
commit | ac9721f3f54b27a16c7e1afb2481e7ee95a70318 (patch) | |
tree | a9f21d60c7c4c1910696553a6f8273edcca03c64 /kernel/perf_event.c | |
parent | 67a3e12b05e055c0415c556a315a3d3eb637e29e (diff) |
perf_events: Fix races and clean up perf_event and perf_mmap_data interaction
In order to move toward separate buffer objects, rework the whole
perf_mmap_data construct to be a more self-sufficient entity, one
with its own lifetime rules.
This greatly sanitizes the whole output redirection code, which
was riddled with bugs and races.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r-- | kernel/perf_event.c | 224 |
1 files changed, 126 insertions, 98 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c | |||
index bd7ce8ca5bb9..848d49a043e9 100644 | |||
--- a/kernel/perf_event.c | |||
+++ b/kernel/perf_event.c | |||
@@ -1841,6 +1841,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
1841 | } | 1841 | } |
1842 | 1842 | ||
1843 | static void perf_pending_sync(struct perf_event *event); | 1843 | static void perf_pending_sync(struct perf_event *event); |
1844 | static void perf_mmap_data_put(struct perf_mmap_data *data); | ||
1844 | 1845 | ||
1845 | static void free_event(struct perf_event *event) | 1846 | static void free_event(struct perf_event *event) |
1846 | { | 1847 | { |
@@ -1856,9 +1857,9 @@ static void free_event(struct perf_event *event) | |||
1856 | atomic_dec(&nr_task_events); | 1857 | atomic_dec(&nr_task_events); |
1857 | } | 1858 | } |
1858 | 1859 | ||
1859 | if (event->output) { | 1860 | if (event->data) { |
1860 | fput(event->output->filp); | 1861 | perf_mmap_data_put(event->data); |
1861 | event->output = NULL; | 1862 | event->data = NULL; |
1862 | } | 1863 | } |
1863 | 1864 | ||
1864 | if (event->destroy) | 1865 | if (event->destroy) |
@@ -2175,7 +2176,27 @@ unlock: | |||
2175 | return ret; | 2176 | return ret; |
2176 | } | 2177 | } |
2177 | 2178 | ||
2178 | static int perf_event_set_output(struct perf_event *event, int output_fd); | 2179 | static const struct file_operations perf_fops; |
2180 | |||
2181 | static struct perf_event *perf_fget_light(int fd, int *fput_needed) | ||
2182 | { | ||
2183 | struct file *file; | ||
2184 | |||
2185 | file = fget_light(fd, fput_needed); | ||
2186 | if (!file) | ||
2187 | return ERR_PTR(-EBADF); | ||
2188 | |||
2189 | if (file->f_op != &perf_fops) { | ||
2190 | fput_light(file, *fput_needed); | ||
2191 | *fput_needed = 0; | ||
2192 | return ERR_PTR(-EBADF); | ||
2193 | } | ||
2194 | |||
2195 | return file->private_data; | ||
2196 | } | ||
2197 | |||
2198 | static int perf_event_set_output(struct perf_event *event, | ||
2199 | struct perf_event *output_event); | ||
2179 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 2200 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
2180 | 2201 | ||
2181 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | 2202 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
@@ -2202,7 +2223,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
2202 | return perf_event_period(event, (u64 __user *)arg); | 2223 | return perf_event_period(event, (u64 __user *)arg); |
2203 | 2224 | ||
2204 | case PERF_EVENT_IOC_SET_OUTPUT: | 2225 | case PERF_EVENT_IOC_SET_OUTPUT: |
2205 | return perf_event_set_output(event, arg); | 2226 | { |
2227 | struct perf_event *output_event = NULL; | ||
2228 | int fput_needed = 0; | ||
2229 | int ret; | ||
2230 | |||
2231 | if (arg != -1) { | ||
2232 | output_event = perf_fget_light(arg, &fput_needed); | ||
2233 | if (IS_ERR(output_event)) | ||
2234 | return PTR_ERR(output_event); | ||
2235 | } | ||
2236 | |||
2237 | ret = perf_event_set_output(event, output_event); | ||
2238 | if (output_event) | ||
2239 | fput_light(output_event->filp, fput_needed); | ||
2240 | |||
2241 | return ret; | ||
2242 | } | ||
2206 | 2243 | ||
2207 | case PERF_EVENT_IOC_SET_FILTER: | 2244 | case PERF_EVENT_IOC_SET_FILTER: |
2208 | return perf_event_set_filter(event, (void __user *)arg); | 2245 | return perf_event_set_filter(event, (void __user *)arg); |
@@ -2335,8 +2372,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | |||
2335 | unsigned long size; | 2372 | unsigned long size; |
2336 | int i; | 2373 | int i; |
2337 | 2374 | ||
2338 | WARN_ON(atomic_read(&event->mmap_count)); | ||
2339 | |||
2340 | size = sizeof(struct perf_mmap_data); | 2375 | size = sizeof(struct perf_mmap_data); |
2341 | size += nr_pages * sizeof(void *); | 2376 | size += nr_pages * sizeof(void *); |
2342 | 2377 | ||
@@ -2452,8 +2487,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | |||
2452 | unsigned long size; | 2487 | unsigned long size; |
2453 | void *all_buf; | 2488 | void *all_buf; |
2454 | 2489 | ||
2455 | WARN_ON(atomic_read(&event->mmap_count)); | ||
2456 | |||
2457 | size = sizeof(struct perf_mmap_data); | 2490 | size = sizeof(struct perf_mmap_data); |
2458 | size += sizeof(void *); | 2491 | size += sizeof(void *); |
2459 | 2492 | ||
@@ -2536,7 +2569,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) | |||
2536 | if (!data->watermark) | 2569 | if (!data->watermark) |
2537 | data->watermark = max_size / 2; | 2570 | data->watermark = max_size / 2; |
2538 | 2571 | ||
2539 | 2572 | atomic_set(&data->refcount, 1); | |
2540 | rcu_assign_pointer(event->data, data); | 2573 | rcu_assign_pointer(event->data, data); |
2541 | } | 2574 | } |
2542 | 2575 | ||
@@ -2548,13 +2581,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) | |||
2548 | perf_mmap_data_free(data); | 2581 | perf_mmap_data_free(data); |
2549 | } | 2582 | } |
2550 | 2583 | ||
2551 | static void perf_mmap_data_release(struct perf_event *event) | 2584 | static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) |
2552 | { | 2585 | { |
2553 | struct perf_mmap_data *data = event->data; | 2586 | struct perf_mmap_data *data; |
2587 | |||
2588 | rcu_read_lock(); | ||
2589 | data = rcu_dereference(event->data); | ||
2590 | if (data) { | ||
2591 | if (!atomic_inc_not_zero(&data->refcount)) | ||
2592 | data = NULL; | ||
2593 | } | ||
2594 | rcu_read_unlock(); | ||
2595 | |||
2596 | return data; | ||
2597 | } | ||
2554 | 2598 | ||
2555 | WARN_ON(atomic_read(&event->mmap_count)); | 2599 | static void perf_mmap_data_put(struct perf_mmap_data *data) |
2600 | { | ||
2601 | if (!atomic_dec_and_test(&data->refcount)) | ||
2602 | return; | ||
2556 | 2603 | ||
2557 | rcu_assign_pointer(event->data, NULL); | ||
2558 | call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); | 2604 | call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); |
2559 | } | 2605 | } |
2560 | 2606 | ||
@@ -2569,15 +2615,18 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
2569 | { | 2615 | { |
2570 | struct perf_event *event = vma->vm_file->private_data; | 2616 | struct perf_event *event = vma->vm_file->private_data; |
2571 | 2617 | ||
2572 | WARN_ON_ONCE(event->ctx->parent_ctx); | ||
2573 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 2618 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { |
2574 | unsigned long size = perf_data_size(event->data); | 2619 | unsigned long size = perf_data_size(event->data); |
2575 | struct user_struct *user = current_user(); | 2620 | struct user_struct *user = event->mmap_user; |
2621 | struct perf_mmap_data *data = event->data; | ||
2576 | 2622 | ||
2577 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 2623 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
2578 | vma->vm_mm->locked_vm -= event->data->nr_locked; | 2624 | vma->vm_mm->locked_vm -= event->mmap_locked; |
2579 | perf_mmap_data_release(event); | 2625 | rcu_assign_pointer(event->data, NULL); |
2580 | mutex_unlock(&event->mmap_mutex); | 2626 | mutex_unlock(&event->mmap_mutex); |
2627 | |||
2628 | perf_mmap_data_put(data); | ||
2629 | free_uid(user); | ||
2581 | } | 2630 | } |
2582 | } | 2631 | } |
2583 | 2632 | ||
@@ -2629,13 +2678,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2629 | 2678 | ||
2630 | WARN_ON_ONCE(event->ctx->parent_ctx); | 2679 | WARN_ON_ONCE(event->ctx->parent_ctx); |
2631 | mutex_lock(&event->mmap_mutex); | 2680 | mutex_lock(&event->mmap_mutex); |
2632 | if (event->output) { | 2681 | if (event->data) { |
2633 | ret = -EINVAL; | 2682 | if (event->data->nr_pages == nr_pages) |
2634 | goto unlock; | 2683 | atomic_inc(&event->data->refcount); |
2635 | } | 2684 | else |
2636 | |||
2637 | if (atomic_inc_not_zero(&event->mmap_count)) { | ||
2638 | if (nr_pages != event->data->nr_pages) | ||
2639 | ret = -EINVAL; | 2685 | ret = -EINVAL; |
2640 | goto unlock; | 2686 | goto unlock; |
2641 | } | 2687 | } |
@@ -2667,21 +2713,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2667 | WARN_ON(event->data); | 2713 | WARN_ON(event->data); |
2668 | 2714 | ||
2669 | data = perf_mmap_data_alloc(event, nr_pages); | 2715 | data = perf_mmap_data_alloc(event, nr_pages); |
2670 | ret = -ENOMEM; | 2716 | if (!data) { |
2671 | if (!data) | 2717 | ret = -ENOMEM; |
2672 | goto unlock; | 2718 | goto unlock; |
2719 | } | ||
2673 | 2720 | ||
2674 | ret = 0; | ||
2675 | perf_mmap_data_init(event, data); | 2721 | perf_mmap_data_init(event, data); |
2676 | |||
2677 | atomic_set(&event->mmap_count, 1); | ||
2678 | atomic_long_add(user_extra, &user->locked_vm); | ||
2679 | vma->vm_mm->locked_vm += extra; | ||
2680 | event->data->nr_locked = extra; | ||
2681 | if (vma->vm_flags & VM_WRITE) | 2722 | if (vma->vm_flags & VM_WRITE) |
2682 | event->data->writable = 1; | 2723 | event->data->writable = 1; |
2683 | 2724 | ||
2725 | atomic_long_add(user_extra, &user->locked_vm); | ||
2726 | event->mmap_locked = extra; | ||
2727 | event->mmap_user = get_current_user(); | ||
2728 | vma->vm_mm->locked_vm += event->mmap_locked; | ||
2729 | |||
2684 | unlock: | 2730 | unlock: |
2731 | if (!ret) | ||
2732 | atomic_inc(&event->mmap_count); | ||
2685 | mutex_unlock(&event->mmap_mutex); | 2733 | mutex_unlock(&event->mmap_mutex); |
2686 | 2734 | ||
2687 | vma->vm_flags |= VM_RESERVED; | 2735 | vma->vm_flags |= VM_RESERVED; |
@@ -2993,7 +3041,6 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
2993 | struct perf_event *event, unsigned int size, | 3041 | struct perf_event *event, unsigned int size, |
2994 | int nmi, int sample) | 3042 | int nmi, int sample) |
2995 | { | 3043 | { |
2996 | struct perf_event *output_event; | ||
2997 | struct perf_mmap_data *data; | 3044 | struct perf_mmap_data *data; |
2998 | unsigned long tail, offset, head; | 3045 | unsigned long tail, offset, head; |
2999 | int have_lost; | 3046 | int have_lost; |
@@ -3010,10 +3057,6 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3010 | if (event->parent) | 3057 | if (event->parent) |
3011 | event = event->parent; | 3058 | event = event->parent; |
3012 | 3059 | ||
3013 | output_event = rcu_dereference(event->output); | ||
3014 | if (output_event) | ||
3015 | event = output_event; | ||
3016 | |||
3017 | data = rcu_dereference(event->data); | 3060 | data = rcu_dereference(event->data); |
3018 | if (!data) | 3061 | if (!data) |
3019 | goto out; | 3062 | goto out; |
@@ -4912,39 +4955,17 @@ err_size: | |||
4912 | goto out; | 4955 | goto out; |
4913 | } | 4956 | } |
4914 | 4957 | ||
4915 | static int perf_event_set_output(struct perf_event *event, int output_fd) | 4958 | static int |
4959 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | ||
4916 | { | 4960 | { |
4917 | struct perf_event *output_event = NULL; | 4961 | struct perf_mmap_data *data = NULL, *old_data = NULL; |
4918 | struct file *output_file = NULL; | ||
4919 | struct perf_event *old_output; | ||
4920 | int fput_needed = 0; | ||
4921 | int ret = -EINVAL; | 4962 | int ret = -EINVAL; |
4922 | 4963 | ||
4923 | /* | 4964 | if (!output_event) |
4924 | * Don't allow output of inherited per-task events. This would | ||
4925 | * create performance issues due to cross cpu access. | ||
4926 | */ | ||
4927 | if (event->cpu == -1 && event->attr.inherit) | ||
4928 | return -EINVAL; | ||
4929 | |||
4930 | if (!output_fd) | ||
4931 | goto set; | 4965 | goto set; |
4932 | 4966 | ||
4933 | output_file = fget_light(output_fd, &fput_needed); | 4967 | /* don't allow circular references */ |
4934 | if (!output_file) | 4968 | if (event == output_event) |
4935 | return -EBADF; | ||
4936 | |||
4937 | if (output_file->f_op != &perf_fops) | ||
4938 | goto out; | ||
4939 | |||
4940 | output_event = output_file->private_data; | ||
4941 | |||
4942 | /* Don't chain output fds */ | ||
4943 | if (output_event->output) | ||
4944 | goto out; | ||
4945 | |||
4946 | /* Don't set an output fd when we already have an output channel */ | ||
4947 | if (event->data) | ||
4948 | goto out; | 4969 | goto out; |
4949 | 4970 | ||
4950 | /* | 4971 | /* |
@@ -4959,26 +4980,28 @@ static int perf_event_set_output(struct perf_event *event, int output_fd) | |||
4959 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 4980 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) |
4960 | goto out; | 4981 | goto out; |
4961 | 4982 | ||
4962 | atomic_long_inc(&output_file->f_count); | ||
4963 | |||
4964 | set: | 4983 | set: |
4965 | mutex_lock(&event->mmap_mutex); | 4984 | mutex_lock(&event->mmap_mutex); |
4966 | old_output = event->output; | 4985 | /* Can't redirect output if we've got an active mmap() */ |
4967 | rcu_assign_pointer(event->output, output_event); | 4986 | if (atomic_read(&event->mmap_count)) |
4968 | mutex_unlock(&event->mmap_mutex); | 4987 | goto unlock; |
4969 | 4988 | ||
4970 | if (old_output) { | 4989 | if (output_event) { |
4971 | /* | 4990 | /* get the buffer we want to redirect to */ |
4972 | * we need to make sure no existing perf_output_*() | 4991 | data = perf_mmap_data_get(output_event); |
4973 | * is still referencing this event. | 4992 | if (!data) |
4974 | */ | 4993 | goto unlock; |
4975 | synchronize_rcu(); | ||
4976 | fput(old_output->filp); | ||
4977 | } | 4994 | } |
4978 | 4995 | ||
4996 | old_data = event->data; | ||
4997 | rcu_assign_pointer(event->data, data); | ||
4979 | ret = 0; | 4998 | ret = 0; |
4999 | unlock: | ||
5000 | mutex_unlock(&event->mmap_mutex); | ||
5001 | |||
5002 | if (old_data) | ||
5003 | perf_mmap_data_put(old_data); | ||
4980 | out: | 5004 | out: |
4981 | fput_light(output_file, fput_needed); | ||
4982 | return ret; | 5005 | return ret; |
4983 | } | 5006 | } |
4984 | 5007 | ||
@@ -4994,7 +5017,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
4994 | struct perf_event_attr __user *, attr_uptr, | 5017 | struct perf_event_attr __user *, attr_uptr, |
4995 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | 5018 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
4996 | { | 5019 | { |
4997 | struct perf_event *event, *group_leader; | 5020 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; |
4998 | struct perf_event_attr attr; | 5021 | struct perf_event_attr attr; |
4999 | struct perf_event_context *ctx; | 5022 | struct perf_event_context *ctx; |
5000 | struct file *event_file = NULL; | 5023 | struct file *event_file = NULL; |
@@ -5034,19 +5057,25 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5034 | goto err_fd; | 5057 | goto err_fd; |
5035 | } | 5058 | } |
5036 | 5059 | ||
5060 | if (group_fd != -1) { | ||
5061 | group_leader = perf_fget_light(group_fd, &fput_needed); | ||
5062 | if (IS_ERR(group_leader)) { | ||
5063 | err = PTR_ERR(group_leader); | ||
5064 | goto err_put_context; | ||
5065 | } | ||
5066 | group_file = group_leader->filp; | ||
5067 | if (flags & PERF_FLAG_FD_OUTPUT) | ||
5068 | output_event = group_leader; | ||
5069 | if (flags & PERF_FLAG_FD_NO_GROUP) | ||
5070 | group_leader = NULL; | ||
5071 | } | ||
5072 | |||
5037 | /* | 5073 | /* |
5038 | * Look up the group leader (we will attach this event to it): | 5074 | * Look up the group leader (we will attach this event to it): |
5039 | */ | 5075 | */ |
5040 | group_leader = NULL; | 5076 | if (group_leader) { |
5041 | if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { | ||
5042 | err = -EINVAL; | 5077 | err = -EINVAL; |
5043 | group_file = fget_light(group_fd, &fput_needed); | ||
5044 | if (!group_file) | ||
5045 | goto err_put_context; | ||
5046 | if (group_file->f_op != &perf_fops) | ||
5047 | goto err_put_context; | ||
5048 | 5078 | ||
5049 | group_leader = group_file->private_data; | ||
5050 | /* | 5079 | /* |
5051 | * Do not allow a recursive hierarchy (this new sibling | 5080 | * Do not allow a recursive hierarchy (this new sibling |
5052 | * becoming part of another group-sibling): | 5081 | * becoming part of another group-sibling): |
@@ -5068,9 +5097,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5068 | 5097 | ||
5069 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, | 5098 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, |
5070 | NULL, NULL, GFP_KERNEL); | 5099 | NULL, NULL, GFP_KERNEL); |
5071 | err = PTR_ERR(event); | 5100 | if (IS_ERR(event)) { |
5072 | if (IS_ERR(event)) | 5101 | err = PTR_ERR(event); |
5073 | goto err_put_context; | 5102 | goto err_put_context; |
5103 | } | ||
5104 | |||
5105 | if (output_event) { | ||
5106 | err = perf_event_set_output(event, output_event); | ||
5107 | if (err) | ||
5108 | goto err_free_put_context; | ||
5109 | } | ||
5074 | 5110 | ||
5075 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 5111 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
5076 | if (IS_ERR(event_file)) { | 5112 | if (IS_ERR(event_file)) { |
@@ -5078,12 +5114,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5078 | goto err_free_put_context; | 5114 | goto err_free_put_context; |
5079 | } | 5115 | } |
5080 | 5116 | ||
5081 | if (flags & PERF_FLAG_FD_OUTPUT) { | ||
5082 | err = perf_event_set_output(event, group_fd); | ||
5083 | if (err) | ||
5084 | goto err_fput_free_put_context; | ||
5085 | } | ||
5086 | |||
5087 | event->filp = event_file; | 5117 | event->filp = event_file; |
5088 | WARN_ON_ONCE(ctx->parent_ctx); | 5118 | WARN_ON_ONCE(ctx->parent_ctx); |
5089 | mutex_lock(&ctx->mutex); | 5119 | mutex_lock(&ctx->mutex); |
@@ -5101,8 +5131,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5101 | fd_install(event_fd, event_file); | 5131 | fd_install(event_fd, event_file); |
5102 | return event_fd; | 5132 | return event_fd; |
5103 | 5133 | ||
5104 | err_fput_free_put_context: | ||
5105 | fput(event_file); | ||
5106 | err_free_put_context: | 5134 | err_free_put_context: |
5107 | free_event(event); | 5135 | free_event(event); |
5108 | err_put_context: | 5136 | err_put_context: |