author    Peter Zijlstra <peterz@infradead.org>	2013-06-04 04:44:21 -0400
committer Ingo Molnar <mingo@kernel.org>	2013-06-19 06:44:13 -0400
commit    9bb5d40cd93c9dd4be74834b1dcb1ba03629716b (patch)
tree      28226bb0f9c0e8f43e2680f4ca34d76068b88ceb /kernel
parent    26cb63ad11e04047a64309362674bcbbd6a6f246 (diff)
perf: Fix mmap() accounting hole
Vince's fuzzer once again found holes. This time it spotted a leak in
the locked page accounting.

When an event had redirected output and its close() was the last
reference to the buffer we didn't have a vm context to undo accounting.

Change the code to destroy the buffer on the last munmap() and detach
all redirected events at that time. This provides us the right context
to undo the vm accounting.

Reported-and-tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/events/core.c	| 228
-rw-r--r--	kernel/events/internal.h	|   3
2 files changed, 159 insertions(+), 72 deletions(-)
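To make the hole concrete: the leak needs an event whose output has been redirected into another event's buffer via PERF_EVENT_IOC_SET_OUTPUT. Below is a minimal userspace sketch of that shape; it is illustrative only (this is not Vince's actual fuzzer input) and omits all error handling.

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	size_t len = (1 + 8) * sysconf(_SC_PAGESIZE); /* user page + 8 data pages */
	void *buf;
	int a, b;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	a = perf_event_open(&attr, 0, -1, -1, 0);
	b = perf_event_open(&attr, 0, -1, -1, 0);

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, a, 0);

	/* Redirect b's output into a's ring buffer. */
	ioctl(b, PERF_EVENT_IOC_SET_OUTPUT, a);

	/*
	 * Pre-patch: at munmap() time b still holds a buffer reference, so
	 * the ring_buffer_put() in perf_mmap_close() is not the last one and
	 * the locked_vm accounting is skipped. The final put then happens in
	 * close(b), where there is no mm to unaccount against: the charge
	 * leaks.
	 */
	munmap(buf, len);
	close(a);
	close(b);
	return 0;
}

Run in a loop, this steadily inflates the user's locked_vm until RLIMIT_MEMLOCK-bounded perf mmap()s start failing.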
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae752cd4a086..b391907d5352 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void) { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
2917 kfree(event); 2914 kfree(event);
2918} 2915}
2919 2916
2920static bool ring_buffer_put(struct ring_buffer *rb); 2917static void ring_buffer_put(struct ring_buffer *rb);
2918static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
2921 2919
2922static void free_event(struct perf_event *event) 2920static void free_event(struct perf_event *event)
2923{ 2921{
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
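ring_buffer_put() can shrink like this because, after the patch, every path that drops an rb reference detaches the event first, so the final put never has to walk or wake rb->event_list. A userspace rendering of the resulting rule follows; it is a sketch with stdatomic stand-ins, and every name in it is hypothetical, not kernel API.

#include <assert.h>
#include <stdatomic.h>
#include <stdlib.h>

struct buf {
	atomic_int refcount;
	int detached;		/* plays list_empty(&rb->event_list) */
};

static void buf_put(struct buf *b)
{
	/* atomic_dec_and_test() analogue: only the 1 -> 0 transition frees. */
	if (atomic_fetch_sub(&b->refcount, 1) != 1)
		return;

	/*
	 * Every ref-dropping path detaches first, so by the time the last
	 * reference goes away the event list is empty; the old wakeup walk
	 * in ring_buffer_put() could become a WARN_ON_ONCE().
	 */
	assert(b->detached);
	free(b);		/* plays call_rcu(&rb->rcu_head, rb_free_rcu) */
}

int main(void)
{
	struct buf *b = calloc(1, sizeof(*b));

	atomic_store(&b->refcount, 2);	/* e.g. one mmap ref, one event ref */
	b->detached = 1;		/* everything already detached */

	buf_put(b);	/* 2 -> 1: not last, no-op */
	buf_put(b);	/* 1 -> 0: asserts emptiness, frees */
	return 0;
}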
@@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
-		}
-	}
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
+		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
+	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
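The again: loop above is the heart of the patch and worth restating: mmap_mutex is a sleeping lock, so it cannot be taken inside the RCU read-side section that walks rb->event_list. The loop therefore pins one event with atomic_long_inc_not_zero(), leaves RCU, takes the mutex, revalidates event->rb, and restarts the walk from scratch. Below is a compressed userspace rendering of that shape, with pthread mutexes standing in for both locks; every name in it is hypothetical.

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct buf;

struct node {			/* plays struct perf_event */
	struct node *next;
	atomic_int refcount;
	pthread_mutex_t mutex;	/* plays event->mmap_mutex */
	struct buf *owner;	/* plays event->rb */
};

struct buf {			/* plays struct ring_buffer */
	pthread_mutex_t list_lock;	/* plays RCU / rb->event_lock */
	struct node *head;		/* plays rb->event_list */
};

static int get_ref(struct node *n)	/* plays atomic_long_inc_not_zero() */
{
	int old = atomic_load(&n->refcount);

	while (old != 0)
		if (atomic_compare_exchange_weak(&n->refcount, &old, old + 1))
			return 1;
	return 0;	/* already dying; its own teardown will unlink it */
}

static void unlink_node(struct buf *b, struct node *n)
{
	struct node **pp;

	for (pp = &b->head; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;
			break;
		}
	}
}

static void detach_all(struct buf *b)
{
	struct node *n;
again:
	pthread_mutex_lock(&b->list_lock);
	for (n = b->head; n; n = n->next) {
		if (!get_ref(n))
			continue;	/* 0-ref node: skip, don't touch */

		/* node->mutex must be taken *outside* list_lock. */
		pthread_mutex_unlock(&b->list_lock);

		pthread_mutex_lock(&n->mutex);
		if (n->owner == b) {	/* revalidate after the gap */
			n->owner = NULL;
			pthread_mutex_lock(&b->list_lock); /* allowed nesting */
			unlink_node(b, n);
			pthread_mutex_unlock(&b->list_lock);
		}
		pthread_mutex_unlock(&n->mutex);
		atomic_fetch_sub(&n->refcount, 1);	/* plays put_event() */

		goto again;	/* our snapshot of the list is stale */
	}
	pthread_mutex_unlock(&b->list_lock);
}

int main(void)
{
	struct buf b = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct node n = { NULL, 1, PTHREAD_MUTEX_INITIALIZER, &b };

	b.head = &n;
	detach_all(&b);	/* clears n.owner and unlinks n */
	return 0;
}

The restart looks expensive, but each successful pass either removes an entry or finds that a racer (set_output() in the kernel) already moved the node to another list under its mutex, so the walk terminates.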
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
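From userspace, the rules enforced here show up as plain mmap() semantics: a perf buffer is one metadata page plus a power-of-two number of data pages, and any additional mmap() of the same event must request exactly the same size. A hedged sketch (no error handling; the perf_event_open() wrapper is the usual raw-syscall one):

#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long page = sysconf(_SC_PAGESIZE);
	void *p1, *p2;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = perf_event_open(&attr, 0, -1, -1, 0);

	/* First mapping picks the size: 1 user page + 4 data pages. */
	p1 = mmap(NULL, 5 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* Same size again: succeeds, takes rb->mmap_count past 1. */
	p2 = mmap(NULL, 5 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* A different size here would fail with EINVAL (nr_pages mismatch). */

	munmap(p2, 5 * page);	/* perf_mmap_close(): not the last mapping */
	munmap(p1, 5 * page);	/* last one: this undoes the VM accounting */
	close(fd);
	return 0;
}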
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
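The new comment deserves a userspace footnote: VM_DONTCOPY is the same flag that madvise(MADV_DONTFORK) sets, and its effect is directly observable, the mapping simply does not exist in a fork()ed child. A short sketch:

#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	madvise(p, page, MADV_DONTFORK);	/* sets VM_DONTCOPY on the vma */

	if (fork() == 0) {
		/* In the child, p is not mapped; touching it would fault. */
		_exit(0);
	}
	munmap(p, page);
	return 0;
}

For the perf vma this matters because the pinned_vm charge lives in the mapping process's mm; a forked copy of the vma would never be unaccounted.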
@@ -6415,6 +6487,8 @@ set:
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
@@ -6422,16 +6496,28 @@ set:
 			goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
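The reordering in perf_event_set_output() closes the companion race: event->rb_entry is a single list_head, so the event can sit on only one event_list at a time, the detach must precede the attach, and any wakeup delivered in that window targets a list the event has already left. The unconditional wake_up_all() afterwards compensates. A skeletal userspace rendering of just the ordering follows; detach/attach are stubs and every name is hypothetical.

#include <pthread.h>

struct waitq {
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static void wake_all(struct waitq *q)	/* plays wake_up_all() */
{
	pthread_mutex_lock(&q->lock);
	pthread_cond_broadcast(&q->cond);
	pthread_mutex_unlock(&q->lock);
}

struct event {
	struct waitq waitq;
	void *rb;	/* plays event->rb */
};

static void detach(struct event *e, void *rb) { (void)e; (void)rb; }
static void attach(struct event *e, void *rb) { (void)e; (void)rb; }

static void set_output(struct event *e, void *new_rb)
{
	void *old_rb = e->rb;

	if (old_rb)
		detach(e, old_rb);	/* 1: leave the old wakeup list */
	if (new_rb)
		attach(e, new_rb);	/* 2: join the new one */
	e->rb = new_rb;			/* 3: publish (rcu_assign_pointer) */

	if (old_rb) {
		/* ring_buffer_put(old_rb) would go here */
		wake_all(&e->waitq);	/* 4: replay any wakeup lost in 1-2 */
	}
}

int main(void)
{
	struct event e = {
		{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER },
		NULL,
	};

	set_output(&e, (void *)&e);	/* attach-only: no old rb, no wake */
	set_output(&e, NULL);		/* detach path: compensating wake */
	return 0;
}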
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5bc6c8e9b851..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,7 +31,8 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
-	int				mmap_locked;
+	atomic_t			mmap_count;
+	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
 	struct perf_event_mmap_page	*user_page;