diff options
author | Peter Zijlstra <peterz@infradead.org> | 2014-03-14 05:50:33 -0400 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2014-05-19 08:44:56 -0400 |
commit | b69cf53640da2b86439596118cfa95233154ee76 (patch) | |
tree | fcb809f430c17449edebf179ec4dd330950af94a /kernel/events | |
parent | 39af6b1678afa5880dda7e375cf3f9d395087f6d (diff) |
perf: Fix a race between ring_buffer_detach() and ring_buffer_attach()
Alexander noticed that we use RCU iteration on rb->event_list but do
not use list_{add,del}_rcu() to add,remove entries to that list, nor
do we observe proper grace periods when re-using the entries.
Merge ring_buffer_detach() into ring_buffer_attach() such that
attaching to the NULL buffer is detaching.
Furthermore, ensure that between any 'detach' and 'attach' of the same
event we observe the required grace period, but only when strictly
required. In effect this means that only ioctl(.request =
PERF_EVENT_IOC_SET_OUTPUT) will wait for a grace period, while the
normal initial attach and final detach will not be delayed.
This patch should, I think, do the right thing under all
circumstances, the 'normal' cases all should never see the extra grace
period, but the two cases:
1) PERF_EVENT_IOC_SET_OUTPUT on an event which already has a
ring_buffer set, will now observe the required grace period between
removing itself from the old and attaching itself to the new buffer.
This case is 'simple' in that both buffers are present in
perf_event_set_output() one could think an unconditional
synchronize_rcu() would be sufficient; however...
2) an event that has a buffer attached, the buffer is destroyed
(munmap) and then the event is attached to a new/different buffer
using PERF_EVENT_IOC_SET_OUTPUT.
This case is more complex because the buffer destruction does:
ring_buffer_attach(.rb = NULL)
followed by the ioctl() doing:
ring_buffer_attach(.rb = foo);
and we still need to observe the grace period between these two
calls due to us reusing the event->rb_entry list_head.
In order to make 2 happen we use Paul's latest cond_synchronize_rcu()
call.
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Reported-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140507123526.GD13658@twins.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 109 |
1 files changed, 49 insertions, 60 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index feb1329ca331..440eefc67397 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -3192,7 +3192,8 @@ static void free_event_rcu(struct rcu_head *head) | |||
3192 | } | 3192 | } |
3193 | 3193 | ||
3194 | static void ring_buffer_put(struct ring_buffer *rb); | 3194 | static void ring_buffer_put(struct ring_buffer *rb); |
3195 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | 3195 | static void ring_buffer_attach(struct perf_event *event, |
3196 | struct ring_buffer *rb); | ||
3196 | 3197 | ||
3197 | static void unaccount_event_cpu(struct perf_event *event, int cpu) | 3198 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
3198 | { | 3199 | { |
@@ -3252,8 +3253,6 @@ static void free_event(struct perf_event *event) | |||
3252 | unaccount_event(event); | 3253 | unaccount_event(event); |
3253 | 3254 | ||
3254 | if (event->rb) { | 3255 | if (event->rb) { |
3255 | struct ring_buffer *rb; | ||
3256 | |||
3257 | /* | 3256 | /* |
3258 | * Can happen when we close an event with re-directed output. | 3257 | * Can happen when we close an event with re-directed output. |
3259 | * | 3258 | * |
@@ -3261,12 +3260,7 @@ static void free_event(struct perf_event *event) | |||
3261 | * over us; possibly making our ring_buffer_put() the last. | 3260 | * over us; possibly making our ring_buffer_put() the last. |
3262 | */ | 3261 | */ |
3263 | mutex_lock(&event->mmap_mutex); | 3262 | mutex_lock(&event->mmap_mutex); |
3264 | rb = event->rb; | 3263 | ring_buffer_attach(event, NULL); |
3265 | if (rb) { | ||
3266 | rcu_assign_pointer(event->rb, NULL); | ||
3267 | ring_buffer_detach(event, rb); | ||
3268 | ring_buffer_put(rb); /* could be last */ | ||
3269 | } | ||
3270 | mutex_unlock(&event->mmap_mutex); | 3264 | mutex_unlock(&event->mmap_mutex); |
3271 | } | 3265 | } |
3272 | 3266 | ||
@@ -3850,28 +3844,47 @@ unlock: | |||
3850 | static void ring_buffer_attach(struct perf_event *event, | 3844 | static void ring_buffer_attach(struct perf_event *event, |
3851 | struct ring_buffer *rb) | 3845 | struct ring_buffer *rb) |
3852 | { | 3846 | { |
3847 | struct ring_buffer *old_rb = NULL; | ||
3853 | unsigned long flags; | 3848 | unsigned long flags; |
3854 | 3849 | ||
3855 | if (!list_empty(&event->rb_entry)) | 3850 | if (event->rb) { |
3856 | return; | 3851 | /* |
3852 | * Should be impossible, we set this when removing | ||
3853 | * event->rb_entry and wait/clear when adding event->rb_entry. | ||
3854 | */ | ||
3855 | WARN_ON_ONCE(event->rcu_pending); | ||
3857 | 3856 | ||
3858 | spin_lock_irqsave(&rb->event_lock, flags); | 3857 | old_rb = event->rb; |
3859 | if (list_empty(&event->rb_entry)) | 3858 | event->rcu_batches = get_state_synchronize_rcu(); |
3860 | list_add(&event->rb_entry, &rb->event_list); | 3859 | event->rcu_pending = 1; |
3861 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3862 | } | ||
3863 | 3860 | ||
3864 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) | 3861 | spin_lock_irqsave(&old_rb->event_lock, flags); |
3865 | { | 3862 | list_del_rcu(&event->rb_entry); |
3866 | unsigned long flags; | 3863 | spin_unlock_irqrestore(&old_rb->event_lock, flags); |
3864 | } | ||
3867 | 3865 | ||
3868 | if (list_empty(&event->rb_entry)) | 3866 | if (event->rcu_pending && rb) { |
3869 | return; | 3867 | cond_synchronize_rcu(event->rcu_batches); |
3868 | event->rcu_pending = 0; | ||
3869 | } | ||
3870 | 3870 | ||
3871 | spin_lock_irqsave(&rb->event_lock, flags); | 3871 | if (rb) { |
3872 | list_del_init(&event->rb_entry); | 3872 | spin_lock_irqsave(&rb->event_lock, flags); |
3873 | wake_up_all(&event->waitq); | 3873 | list_add_rcu(&event->rb_entry, &rb->event_list); |
3874 | spin_unlock_irqrestore(&rb->event_lock, flags); | 3874 | spin_unlock_irqrestore(&rb->event_lock, flags); |
3875 | } | ||
3876 | |||
3877 | rcu_assign_pointer(event->rb, rb); | ||
3878 | |||
3879 | if (old_rb) { | ||
3880 | ring_buffer_put(old_rb); | ||
3881 | /* | ||
3882 | * Since we detached before setting the new rb, so that we | ||
3883 | * could attach the new rb, we could have missed a wakeup. | ||
3884 | * Provide it now. | ||
3885 | */ | ||
3886 | wake_up_all(&event->waitq); | ||
3887 | } | ||
3875 | } | 3888 | } |
3876 | 3889 | ||
3877 | static void ring_buffer_wakeup(struct perf_event *event) | 3890 | static void ring_buffer_wakeup(struct perf_event *event) |
@@ -3940,7 +3953,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
3940 | { | 3953 | { |
3941 | struct perf_event *event = vma->vm_file->private_data; | 3954 | struct perf_event *event = vma->vm_file->private_data; |
3942 | 3955 | ||
3943 | struct ring_buffer *rb = event->rb; | 3956 | struct ring_buffer *rb = ring_buffer_get(event); |
3944 | struct user_struct *mmap_user = rb->mmap_user; | 3957 | struct user_struct *mmap_user = rb->mmap_user; |
3945 | int mmap_locked = rb->mmap_locked; | 3958 | int mmap_locked = rb->mmap_locked; |
3946 | unsigned long size = perf_data_size(rb); | 3959 | unsigned long size = perf_data_size(rb); |
@@ -3948,18 +3961,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
3948 | atomic_dec(&rb->mmap_count); | 3961 | atomic_dec(&rb->mmap_count); |
3949 | 3962 | ||
3950 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 3963 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) |
3951 | return; | 3964 | goto out_put; |
3952 | 3965 | ||
3953 | /* Detach current event from the buffer. */ | 3966 | ring_buffer_attach(event, NULL); |
3954 | rcu_assign_pointer(event->rb, NULL); | ||
3955 | ring_buffer_detach(event, rb); | ||
3956 | mutex_unlock(&event->mmap_mutex); | 3967 | mutex_unlock(&event->mmap_mutex); |
3957 | 3968 | ||
3958 | /* If there's still other mmap()s of this buffer, we're done. */ | 3969 | /* If there's still other mmap()s of this buffer, we're done. */ |
3959 | if (atomic_read(&rb->mmap_count)) { | 3970 | if (atomic_read(&rb->mmap_count)) |
3960 | ring_buffer_put(rb); /* can't be last */ | 3971 | goto out_put; |
3961 | return; | ||
3962 | } | ||
3963 | 3972 | ||
3964 | /* | 3973 | /* |
3965 | * No other mmap()s, detach from all other events that might redirect | 3974 | * No other mmap()s, detach from all other events that might redirect |
@@ -3989,11 +3998,9 @@ again: | |||
3989 | * still restart the iteration to make sure we're not now | 3998 | * still restart the iteration to make sure we're not now |
3990 | * iterating the wrong list. | 3999 | * iterating the wrong list. |
3991 | */ | 4000 | */ |
3992 | if (event->rb == rb) { | 4001 | if (event->rb == rb) |
3993 | rcu_assign_pointer(event->rb, NULL); | 4002 | ring_buffer_attach(event, NULL); |
3994 | ring_buffer_detach(event, rb); | 4003 | |
3995 | ring_buffer_put(rb); /* can't be last, we still have one */ | ||
3996 | } | ||
3997 | mutex_unlock(&event->mmap_mutex); | 4004 | mutex_unlock(&event->mmap_mutex); |
3998 | put_event(event); | 4005 | put_event(event); |
3999 | 4006 | ||
@@ -4018,6 +4025,7 @@ again: | |||
4018 | vma->vm_mm->pinned_vm -= mmap_locked; | 4025 | vma->vm_mm->pinned_vm -= mmap_locked; |
4019 | free_uid(mmap_user); | 4026 | free_uid(mmap_user); |
4020 | 4027 | ||
4028 | out_put: | ||
4021 | ring_buffer_put(rb); /* could be last */ | 4029 | ring_buffer_put(rb); /* could be last */ |
4022 | } | 4030 | } |
4023 | 4031 | ||
@@ -4135,7 +4143,6 @@ again: | |||
4135 | vma->vm_mm->pinned_vm += extra; | 4143 | vma->vm_mm->pinned_vm += extra; |
4136 | 4144 | ||
4137 | ring_buffer_attach(event, rb); | 4145 | ring_buffer_attach(event, rb); |
4138 | rcu_assign_pointer(event->rb, rb); | ||
4139 | 4146 | ||
4140 | perf_event_init_userpage(event); | 4147 | perf_event_init_userpage(event); |
4141 | perf_event_update_userpage(event); | 4148 | perf_event_update_userpage(event); |
@@ -6934,7 +6941,7 @@ err_size: | |||
6934 | static int | 6941 | static int |
6935 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 6942 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
6936 | { | 6943 | { |
6937 | struct ring_buffer *rb = NULL, *old_rb = NULL; | 6944 | struct ring_buffer *rb = NULL; |
6938 | int ret = -EINVAL; | 6945 | int ret = -EINVAL; |
6939 | 6946 | ||
6940 | if (!output_event) | 6947 | if (!output_event) |
@@ -6962,8 +6969,6 @@ set: | |||
6962 | if (atomic_read(&event->mmap_count)) | 6969 | if (atomic_read(&event->mmap_count)) |
6963 | goto unlock; | 6970 | goto unlock; |
6964 | 6971 | ||
6965 | old_rb = event->rb; | ||
6966 | |||
6967 | if (output_event) { | 6972 | if (output_event) { |
6968 | /* get the rb we want to redirect to */ | 6973 | /* get the rb we want to redirect to */ |
6969 | rb = ring_buffer_get(output_event); | 6974 | rb = ring_buffer_get(output_event); |
@@ -6971,23 +6976,7 @@ set: | |||
6971 | goto unlock; | 6976 | goto unlock; |
6972 | } | 6977 | } |
6973 | 6978 | ||
6974 | if (old_rb) | 6979 | ring_buffer_attach(event, rb); |
6975 | ring_buffer_detach(event, old_rb); | ||
6976 | |||
6977 | if (rb) | ||
6978 | ring_buffer_attach(event, rb); | ||
6979 | |||
6980 | rcu_assign_pointer(event->rb, rb); | ||
6981 | |||
6982 | if (old_rb) { | ||
6983 | ring_buffer_put(old_rb); | ||
6984 | /* | ||
6985 | * Since we detached before setting the new rb, so that we | ||
6986 | * could attach the new rb, we could have missed a wakeup. | ||
6987 | * Provide it now. | ||
6988 | */ | ||
6989 | wake_up_all(&event->waitq); | ||
6990 | } | ||
6991 | 6980 | ||
6992 | ret = 0; | 6981 | ret = 0; |
6993 | unlock: | 6982 | unlock: |