diff options
author | Peter Zijlstra <peterz@infradead.org> | 2013-06-04 04:44:21 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2013-06-19 06:44:13 -0400 |
commit | 9bb5d40cd93c9dd4be74834b1dcb1ba03629716b (patch) | |
tree | 28226bb0f9c0e8f43e2680f4ca34d76068b88ceb /kernel | |
parent | 26cb63ad11e04047a64309362674bcbbd6a6f246 (diff) |
perf: Fix mmap() accounting hole
Vince's fuzzer once again found holes. This time it spotted a leak in
the locked page accounting.
When an event had redirected output and its close() was the last
reference to the buffer we didn't have a vm context to undo accounting.
Change the code to destroy the buffer on the last munmap() and detach
all redirected events at that time. This provides us the right context
to undo the vm accounting.
Reported-and-tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/events/core.c | 228 | ||||
-rw-r--r-- | kernel/events/internal.h | 3 |
2 files changed, 159 insertions, 72 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index ae752cd4a086..b391907d5352 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
196 | static void update_context_time(struct perf_event_context *ctx); | 196 | static void update_context_time(struct perf_event_context *ctx); |
197 | static u64 perf_event_time(struct perf_event *event); | 197 | static u64 perf_event_time(struct perf_event *event); |
198 | 198 | ||
199 | static void ring_buffer_attach(struct perf_event *event, | ||
200 | struct ring_buffer *rb); | ||
201 | |||
202 | void __weak perf_event_print_debug(void) { } | 199 | void __weak perf_event_print_debug(void) { } |
203 | 200 | ||
204 | extern __weak const char *perf_pmu_name(void) | 201 | extern __weak const char *perf_pmu_name(void) |
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head) | |||
2917 | kfree(event); | 2914 | kfree(event); |
2918 | } | 2915 | } |
2919 | 2916 | ||
2920 | static bool ring_buffer_put(struct ring_buffer *rb); | 2917 | static void ring_buffer_put(struct ring_buffer *rb); |
2918 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | ||
2921 | 2919 | ||
2922 | static void free_event(struct perf_event *event) | 2920 | static void free_event(struct perf_event *event) |
2923 | { | 2921 | { |
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event) | |||
2942 | if (has_branch_stack(event)) { | 2940 | if (has_branch_stack(event)) { |
2943 | static_key_slow_dec_deferred(&perf_sched_events); | 2941 | static_key_slow_dec_deferred(&perf_sched_events); |
2944 | /* is system-wide event */ | 2942 | /* is system-wide event */ |
2945 | if (!(event->attach_state & PERF_ATTACH_TASK)) | 2943 | if (!(event->attach_state & PERF_ATTACH_TASK)) { |
2946 | atomic_dec(&per_cpu(perf_branch_stack_events, | 2944 | atomic_dec(&per_cpu(perf_branch_stack_events, |
2947 | event->cpu)); | 2945 | event->cpu)); |
2946 | } | ||
2948 | } | 2947 | } |
2949 | } | 2948 | } |
2950 | 2949 | ||
2951 | if (event->rb) { | 2950 | if (event->rb) { |
2952 | ring_buffer_put(event->rb); | 2951 | struct ring_buffer *rb; |
2953 | event->rb = NULL; | 2952 | |
2953 | /* | ||
2954 | * Can happen when we close an event with re-directed output. | ||
2955 | * | ||
2956 | * Since we have a 0 refcount, perf_mmap_close() will skip | ||
2957 | * over us; possibly making our ring_buffer_put() the last. | ||
2958 | */ | ||
2959 | mutex_lock(&event->mmap_mutex); | ||
2960 | rb = event->rb; | ||
2961 | if (rb) { | ||
2962 | rcu_assign_pointer(event->rb, NULL); | ||
2963 | ring_buffer_detach(event, rb); | ||
2964 | ring_buffer_put(rb); /* could be last */ | ||
2965 | } | ||
2966 | mutex_unlock(&event->mmap_mutex); | ||
2954 | } | 2967 | } |
2955 | 2968 | ||
2956 | if (is_cgroup_event(event)) | 2969 | if (is_cgroup_event(event)) |
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
3188 | unsigned int events = POLL_HUP; | 3201 | unsigned int events = POLL_HUP; |
3189 | 3202 | ||
3190 | /* | 3203 | /* |
3191 | * Race between perf_event_set_output() and perf_poll(): perf_poll() | 3204 | * Pin the event->rb by taking event->mmap_mutex; otherwise |
3192 | * grabs the rb reference but perf_event_set_output() overrides it. | 3205 | * perf_event_set_output() can swizzle our rb and make us miss wakeups. |
3193 | * Here is the timeline for two threads T1, T2: | ||
3194 | * t0: T1, rb = rcu_dereference(event->rb) | ||
3195 | * t1: T2, old_rb = event->rb | ||
3196 | * t2: T2, event->rb = new rb | ||
3197 | * t3: T2, ring_buffer_detach(old_rb) | ||
3198 | * t4: T1, ring_buffer_attach(rb1) | ||
3199 | * t5: T1, poll_wait(event->waitq) | ||
3200 | * | ||
3201 | * To avoid this problem, we grab mmap_mutex in perf_poll() | ||
3202 | * thereby ensuring that the assignment of the new ring buffer | ||
3203 | * and the detachment of the old buffer appear atomic to perf_poll() | ||
3204 | */ | 3206 | */ |
3205 | mutex_lock(&event->mmap_mutex); | 3207 | mutex_lock(&event->mmap_mutex); |
3206 | 3208 | rb = event->rb; | |
3207 | rcu_read_lock(); | 3209 | if (rb) |
3208 | rb = rcu_dereference(event->rb); | ||
3209 | if (rb) { | ||
3210 | ring_buffer_attach(event, rb); | ||
3211 | events = atomic_xchg(&rb->poll, 0); | 3210 | events = atomic_xchg(&rb->poll, 0); |
3212 | } | ||
3213 | rcu_read_unlock(); | ||
3214 | |||
3215 | mutex_unlock(&event->mmap_mutex); | 3211 | mutex_unlock(&event->mmap_mutex); |
3216 | 3212 | ||
3217 | poll_wait(file, &event->waitq, wait); | 3213 | poll_wait(file, &event->waitq, wait); |
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event, | |||
3521 | return; | 3517 | return; |
3522 | 3518 | ||
3523 | spin_lock_irqsave(&rb->event_lock, flags); | 3519 | spin_lock_irqsave(&rb->event_lock, flags); |
3524 | if (!list_empty(&event->rb_entry)) | 3520 | if (list_empty(&event->rb_entry)) |
3525 | goto unlock; | 3521 | list_add(&event->rb_entry, &rb->event_list); |
3526 | |||
3527 | list_add(&event->rb_entry, &rb->event_list); | ||
3528 | unlock: | ||
3529 | spin_unlock_irqrestore(&rb->event_lock, flags); | 3522 | spin_unlock_irqrestore(&rb->event_lock, flags); |
3530 | } | 3523 | } |
3531 | 3524 | ||
3532 | static void ring_buffer_detach(struct perf_event *event, | 3525 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) |
3533 | struct ring_buffer *rb) | ||
3534 | { | 3526 | { |
3535 | unsigned long flags; | 3527 | unsigned long flags; |
3536 | 3528 | ||
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event) | |||
3549 | 3541 | ||
3550 | rcu_read_lock(); | 3542 | rcu_read_lock(); |
3551 | rb = rcu_dereference(event->rb); | 3543 | rb = rcu_dereference(event->rb); |
3552 | if (!rb) | 3544 | if (rb) { |
3553 | goto unlock; | 3545 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) |
3554 | 3546 | wake_up_all(&event->waitq); | |
3555 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) | 3547 | } |
3556 | wake_up_all(&event->waitq); | ||
3557 | |||
3558 | unlock: | ||
3559 | rcu_read_unlock(); | 3548 | rcu_read_unlock(); |
3560 | } | 3549 | } |
3561 | 3550 | ||
@@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
3582 | return rb; | 3571 | return rb; |
3583 | } | 3572 | } |
3584 | 3573 | ||
3585 | static bool ring_buffer_put(struct ring_buffer *rb) | 3574 | static void ring_buffer_put(struct ring_buffer *rb) |
3586 | { | 3575 | { |
3587 | struct perf_event *event, *n; | ||
3588 | unsigned long flags; | ||
3589 | |||
3590 | if (!atomic_dec_and_test(&rb->refcount)) | 3576 | if (!atomic_dec_and_test(&rb->refcount)) |
3591 | return false; | 3577 | return; |
3592 | 3578 | ||
3593 | spin_lock_irqsave(&rb->event_lock, flags); | 3579 | WARN_ON_ONCE(!list_empty(&rb->event_list)); |
3594 | list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { | ||
3595 | list_del_init(&event->rb_entry); | ||
3596 | wake_up_all(&event->waitq); | ||
3597 | } | ||
3598 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3599 | 3580 | ||
3600 | call_rcu(&rb->rcu_head, rb_free_rcu); | 3581 | call_rcu(&rb->rcu_head, rb_free_rcu); |
3601 | return true; | ||
3602 | } | 3582 | } |
3603 | 3583 | ||
3604 | static void perf_mmap_open(struct vm_area_struct *vma) | 3584 | static void perf_mmap_open(struct vm_area_struct *vma) |
@@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
3606 | struct perf_event *event = vma->vm_file->private_data; | 3586 | struct perf_event *event = vma->vm_file->private_data; |
3607 | 3587 | ||
3608 | atomic_inc(&event->mmap_count); | 3588 | atomic_inc(&event->mmap_count); |
3589 | atomic_inc(&event->rb->mmap_count); | ||
3609 | } | 3590 | } |
3610 | 3591 | ||
3592 | /* | ||
3593 | * A buffer can be mmap()ed multiple times; either directly through the same | ||
3594 | * event, or through other events by use of perf_event_set_output(). | ||
3595 | * | ||
3596 | * In order to undo the VM accounting done by perf_mmap() we need to destroy | ||
3597 | * the buffer here, where we still have a VM context. This means we need | ||
3598 | * to detach all events redirecting to us. | ||
3599 | */ | ||
3611 | static void perf_mmap_close(struct vm_area_struct *vma) | 3600 | static void perf_mmap_close(struct vm_area_struct *vma) |
3612 | { | 3601 | { |
3613 | struct perf_event *event = vma->vm_file->private_data; | 3602 | struct perf_event *event = vma->vm_file->private_data; |
3614 | 3603 | ||
3615 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 3604 | struct ring_buffer *rb = event->rb; |
3616 | struct ring_buffer *rb = event->rb; | 3605 | struct user_struct *mmap_user = rb->mmap_user; |
3617 | struct user_struct *mmap_user = rb->mmap_user; | 3606 | int mmap_locked = rb->mmap_locked; |
3618 | int mmap_locked = rb->mmap_locked; | 3607 | unsigned long size = perf_data_size(rb); |
3619 | unsigned long size = perf_data_size(rb); | ||
3620 | 3608 | ||
3621 | rcu_assign_pointer(event->rb, NULL); | 3609 | atomic_dec(&rb->mmap_count); |
3622 | ring_buffer_detach(event, rb); | 3610 | |
3623 | mutex_unlock(&event->mmap_mutex); | 3611 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) |
3612 | return; | ||
3613 | |||
3614 | /* Detach current event from the buffer. */ | ||
3615 | rcu_assign_pointer(event->rb, NULL); | ||
3616 | ring_buffer_detach(event, rb); | ||
3617 | mutex_unlock(&event->mmap_mutex); | ||
3618 | |||
3619 | /* If there's still other mmap()s of this buffer, we're done. */ | ||
3620 | if (atomic_read(&rb->mmap_count)) { | ||
3621 | ring_buffer_put(rb); /* can't be last */ | ||
3622 | return; | ||
3623 | } | ||
3624 | 3624 | ||
3625 | if (ring_buffer_put(rb)) { | 3625 | /* |
3626 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); | 3626 | * No other mmap()s, detach from all other events that might redirect |
3627 | vma->vm_mm->pinned_vm -= mmap_locked; | 3627 | * into the now unreachable buffer. Somewhat complicated by the |
3628 | free_uid(mmap_user); | 3628 | * fact that rb::event_lock otherwise nests inside mmap_mutex. |
3629 | */ | ||
3630 | again: | ||
3631 | rcu_read_lock(); | ||
3632 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { | ||
3633 | if (!atomic_long_inc_not_zero(&event->refcount)) { | ||
3634 | /* | ||
3635 | * This event is en-route to free_event() which will | ||
3636 | * detach it and remove it from the list. | ||
3637 | */ | ||
3638 | continue; | ||
3629 | } | 3639 | } |
3640 | rcu_read_unlock(); | ||
3641 | |||
3642 | mutex_lock(&event->mmap_mutex); | ||
3643 | /* | ||
3644 | * Check we didn't race with perf_event_set_output() which can | ||
3645 | * swizzle the rb from under us while we were waiting to | ||
3646 | * acquire mmap_mutex. | ||
3647 | * | ||
3648 | * If we find a different rb; ignore this event, a next | ||
3649 | * iteration will no longer find it on the list. We have to | ||
3650 | * still restart the iteration to make sure we're not now | ||
3651 | * iterating the wrong list. | ||
3652 | */ | ||
3653 | if (event->rb == rb) { | ||
3654 | rcu_assign_pointer(event->rb, NULL); | ||
3655 | ring_buffer_detach(event, rb); | ||
3656 | ring_buffer_put(rb); /* can't be last, we still have one */ | ||
3657 | } | ||
3658 | mutex_unlock(&event->mmap_mutex); | ||
3659 | put_event(event); | ||
3660 | |||
3661 | /* | ||
3662 | * Restart the iteration; either we're on the wrong list or | ||
3663 | * destroyed its integrity by doing a deletion. | ||
3664 | */ | ||
3665 | goto again; | ||
3630 | } | 3666 | } |
3667 | rcu_read_unlock(); | ||
3668 | |||
3669 | /* | ||
3670 | * It could be there's still a few 0-ref events on the list; they'll | ||
3671 | * get cleaned up by free_event() -- they'll also still have their | ||
3672 | * ref on the rb and will free it whenever they are done with it. | ||
3673 | * | ||
3674 | * Aside from that, this buffer is 'fully' detached and unmapped, | ||
3675 | * undo the VM accounting. | ||
3676 | */ | ||
3677 | |||
3678 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); | ||
3679 | vma->vm_mm->pinned_vm -= mmap_locked; | ||
3680 | free_uid(mmap_user); | ||
3681 | |||
3682 | ring_buffer_put(rb); /* could be last */ | ||
3631 | } | 3683 | } |
3632 | 3684 | ||
3633 | static const struct vm_operations_struct perf_mmap_vmops = { | 3685 | static const struct vm_operations_struct perf_mmap_vmops = { |
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3677 | return -EINVAL; | 3729 | return -EINVAL; |
3678 | 3730 | ||
3679 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3731 | WARN_ON_ONCE(event->ctx->parent_ctx); |
3732 | again: | ||
3680 | mutex_lock(&event->mmap_mutex); | 3733 | mutex_lock(&event->mmap_mutex); |
3681 | if (event->rb) { | 3734 | if (event->rb) { |
3682 | if (event->rb->nr_pages != nr_pages) | 3735 | if (event->rb->nr_pages != nr_pages) { |
3683 | ret = -EINVAL; | 3736 | ret = -EINVAL; |
3737 | goto unlock; | ||
3738 | } | ||
3739 | |||
3740 | if (!atomic_inc_not_zero(&event->rb->mmap_count)) { | ||
3741 | /* | ||
3742 | * Raced against perf_mmap_close() through | ||
3743 | * perf_event_set_output(). Try again, hope for better | ||
3744 | * luck. | ||
3745 | */ | ||
3746 | mutex_unlock(&event->mmap_mutex); | ||
3747 | goto again; | ||
3748 | } | ||
3749 | |||
3684 | goto unlock; | 3750 | goto unlock; |
3685 | } | 3751 | } |
3686 | 3752 | ||
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3722 | goto unlock; | 3788 | goto unlock; |
3723 | } | 3789 | } |
3724 | 3790 | ||
3791 | atomic_set(&rb->mmap_count, 1); | ||
3725 | rb->mmap_locked = extra; | 3792 | rb->mmap_locked = extra; |
3726 | rb->mmap_user = get_current_user(); | 3793 | rb->mmap_user = get_current_user(); |
3727 | 3794 | ||
3728 | atomic_long_add(user_extra, &user->locked_vm); | 3795 | atomic_long_add(user_extra, &user->locked_vm); |
3729 | vma->vm_mm->pinned_vm += extra; | 3796 | vma->vm_mm->pinned_vm += extra; |
3730 | 3797 | ||
3798 | ring_buffer_attach(event, rb); | ||
3731 | rcu_assign_pointer(event->rb, rb); | 3799 | rcu_assign_pointer(event->rb, rb); |
3732 | 3800 | ||
3733 | perf_event_update_userpage(event); | 3801 | perf_event_update_userpage(event); |
@@ -3737,6 +3805,10 @@ unlock: | |||
3737 | atomic_inc(&event->mmap_count); | 3805 | atomic_inc(&event->mmap_count); |
3738 | mutex_unlock(&event->mmap_mutex); | 3806 | mutex_unlock(&event->mmap_mutex); |
3739 | 3807 | ||
3808 | /* | ||
3809 | * Since pinned accounting is per vm we cannot allow fork() to copy our | ||
3810 | * vma. | ||
3811 | */ | ||
3740 | vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; | 3812 | vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; |
3741 | vma->vm_ops = &perf_mmap_vmops; | 3813 | vma->vm_ops = &perf_mmap_vmops; |
3742 | 3814 | ||
@@ -6415,6 +6487,8 @@ set: | |||
6415 | if (atomic_read(&event->mmap_count)) | 6487 | if (atomic_read(&event->mmap_count)) |
6416 | goto unlock; | 6488 | goto unlock; |
6417 | 6489 | ||
6490 | old_rb = event->rb; | ||
6491 | |||
6418 | if (output_event) { | 6492 | if (output_event) { |
6419 | /* get the rb we want to redirect to */ | 6493 | /* get the rb we want to redirect to */ |
6420 | rb = ring_buffer_get(output_event); | 6494 | rb = ring_buffer_get(output_event); |
@@ -6422,16 +6496,28 @@ set: | |||
6422 | goto unlock; | 6496 | goto unlock; |
6423 | } | 6497 | } |
6424 | 6498 | ||
6425 | old_rb = event->rb; | ||
6426 | rcu_assign_pointer(event->rb, rb); | ||
6427 | if (old_rb) | 6499 | if (old_rb) |
6428 | ring_buffer_detach(event, old_rb); | 6500 | ring_buffer_detach(event, old_rb); |
6501 | |||
6502 | if (rb) | ||
6503 | ring_buffer_attach(event, rb); | ||
6504 | |||
6505 | rcu_assign_pointer(event->rb, rb); | ||
6506 | |||
6507 | if (old_rb) { | ||
6508 | ring_buffer_put(old_rb); | ||
6509 | /* | ||
6510 | * Since we detached before setting the new rb, so that we | ||
6511 | * could attach the new rb, we could have missed a wakeup. | ||
6512 | * Provide it now. | ||
6513 | */ | ||
6514 | wake_up_all(&event->waitq); | ||
6515 | } | ||
6516 | |||
6429 | ret = 0; | 6517 | ret = 0; |
6430 | unlock: | 6518 | unlock: |
6431 | mutex_unlock(&event->mmap_mutex); | 6519 | mutex_unlock(&event->mmap_mutex); |
6432 | 6520 | ||
6433 | if (old_rb) | ||
6434 | ring_buffer_put(old_rb); | ||
6435 | out: | 6521 | out: |
6436 | return ret; | 6522 | return ret; |
6437 | } | 6523 | } |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5bc6c8e9b851..ca6599723be5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -31,7 +31,8 @@ struct ring_buffer { | |||
31 | spinlock_t event_lock; | 31 | spinlock_t event_lock; |
32 | struct list_head event_list; | 32 | struct list_head event_list; |
33 | 33 | ||
34 | int mmap_locked; | 34 | atomic_t mmap_count; |
35 | unsigned long mmap_locked; | ||
35 | struct user_struct *mmap_user; | 36 | struct user_struct *mmap_user; |
36 | 37 | ||
37 | struct perf_event_mmap_page *user_page; | 38 | struct perf_event_mmap_page *user_page; |