-rw-r--r--	kernel/events/core.c	| 228
-rw-r--r--	kernel/events/internal.h	|   3
2 files changed, 159 insertions, 72 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae752cd4a086..b391907d5352 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void) { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
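The free_event() hunk above establishes the teardown order the rest of the patch relies on: take mmap_mutex, unpublish the event's rb pointer, detach the event from the buffer's list, then drop a reference that may be the last. Below is a rough userspace sketch of that shape only; the names (struct buf, struct ev, buf_put, ev_free) are invented for illustration, and pthread mutexes plus C11 atomics stand in for the kernel primitives.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct buf {
	atomic_int refcount;
	/* event list, lock, data pages ... elided */
};

struct ev {
	pthread_mutex_t mmap_mutex;
	struct buf *rb;			/* published buffer pointer, may be NULL */
};

/* Drop one reference; the last one frees the buffer. */
static void buf_put(struct buf *rb)
{
	if (atomic_fetch_sub(&rb->refcount, 1) == 1)
		free(rb);
}

/* Tear an event down: unpublish its buffer before dropping the reference. */
static void ev_free(struct ev *event)
{
	struct buf *rb;

	pthread_mutex_lock(&event->mmap_mutex);
	rb = event->rb;
	if (rb) {
		event->rb = NULL;	/* no new lookups can reach the buffer */
		buf_put(rb);		/* could be the last reference */
	}
	pthread_mutex_unlock(&event->mmap_mutex);
	free(event);
}

Clearing the pointer before the put is the point of the ordering: once the pointer is gone, nobody can pick up a buffer whose last reference is about to disappear.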
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
 		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
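The new perf_mmap_close() walk uses a common pattern for mutating elements of an RCU-protected list when the per-element lock cannot nest inside the list protection: pin each element with a get-unless-zero reference, drop the list protection, take the element's own mutex, re-check that it still points at this buffer, and restart the whole walk after any modification. The following is a simplified userspace analogue only, with a plain mutex standing in for RCU and all names (struct rb, struct ev, ref_get_unless_zero, rb_detach_all) invented for the sketch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct rb;

struct ev {
	struct ev *next;		/* linkage on the buffer's event list */
	atomic_long refcount;
	pthread_mutex_t mmap_mutex;
	struct rb *rb;			/* buffer this event currently points at */
};

struct rb {
	pthread_mutex_t event_lock;	/* stand-in for RCU protection of the list */
	struct ev *event_list;
};

/* Take a reference only if the object is not already on its way out. */
static bool ref_get_unless_zero(atomic_long *ref)
{
	long v = atomic_load(ref);

	while (v != 0)
		if (atomic_compare_exchange_weak(ref, &v, v + 1))
			return true;
	return false;
}

static void ev_put(struct ev *ev)
{
	atomic_fetch_sub(&ev->refcount, 1);
}

static void ev_detach(struct ev *ev, struct rb *rb)
{
	(void)ev; (void)rb;		/* unlink ev, drop its buffer reference (elided) */
}

/* Detach every event still pointing at rb, restarting after each change. */
static void rb_detach_all(struct rb *rb)
{
	struct ev *ev;

again:
	pthread_mutex_lock(&rb->event_lock);
	for (ev = rb->event_list; ev; ev = ev->next) {
		if (!ref_get_unless_zero(&ev->refcount))
			continue;	/* already headed for its own teardown */

		pthread_mutex_unlock(&rb->event_lock);

		pthread_mutex_lock(&ev->mmap_mutex);
		if (ev->rb == rb)	/* re-check: it may have been redirected */
			ev_detach(ev, rb);
		pthread_mutex_unlock(&ev->mmap_mutex);

		ev_put(ev);
		goto again;		/* the list may have changed; start over */
	}
	pthread_mutex_unlock(&rb->event_lock);
}

Restarting from the top after every detach is what keeps the traversal valid once the list has been modified, and the re-check of ev->rb == rb covers a concurrent redirection that happened while waiting for the per-event mutex.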
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
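The again: loop added to perf_mmap() handles the window where an existing buffer is being unmapped concurrently: its mmap_count has already hit zero and cannot be raised, so the only safe option is to release mmap_mutex, let the teardown finish, and retry. A minimal, self-contained userspace sketch of that retry follows; the types and names (struct buf, struct ev, count_get_unless_zero, pin_existing_buffer) are invented for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct buf {
	atomic_int mmap_count;		/* live mmap()s of this buffer */
};

struct ev {
	pthread_mutex_t mmap_mutex;
	struct buf *rb;			/* existing buffer, or NULL */
};

/* Raise the count only if it has not already dropped to zero. */
static bool count_get_unless_zero(atomic_int *cnt)
{
	int v = atomic_load(cnt);

	while (v != 0)
		if (atomic_compare_exchange_weak(cnt, &v, v + 1))
			return true;
	return false;
}

/* Returns true if an existing buffer was pinned, false if none remains. */
static bool pin_existing_buffer(struct ev *event)
{
	bool pinned;

again:
	pthread_mutex_lock(&event->mmap_mutex);
	if (event->rb && !count_get_unless_zero(&event->rb->mmap_count)) {
		/*
		 * The buffer is being torn down by a concurrent unmap; drop
		 * the mutex so that teardown can finish, then try again.
		 */
		pthread_mutex_unlock(&event->mmap_mutex);
		goto again;
	}
	pinned = event->rb != NULL;	/* NULL: caller sets up a fresh buffer */
	pthread_mutex_unlock(&event->mmap_mutex);
	return pinned;
}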
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,6 +6487,8 @@ set:
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
@@ -6422,16 +6496,28 @@ set:
 		goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
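perf_event_set_output() now detaches from the old buffer before attaching to the new one, which opens a short window in which no list carries the event and a wakeup aimed at the old buffer would be lost; the unconditional wake_up_all() after the switch covers exactly that window. A hedged userspace sketch of the ordering follows, with invented names (struct buf, struct ev, list_detach, list_attach, buf_put, ev_redirect) and a condition variable in place of the kernel waitqueue.

#include <pthread.h>

struct ev;

struct buf {
	pthread_mutex_t event_lock;
	struct ev *event_list;		/* events whose wakeups route here */
};

struct ev {
	pthread_mutex_t mmap_mutex;
	pthread_cond_t waitq;		/* stand-in for the kernel waitqueue */
	struct buf *rb;
};

static void list_detach(struct buf *rb, struct ev *ev) { (void)rb; (void)ev; /* elided */ }
static void list_attach(struct buf *rb, struct ev *ev) { (void)rb; (void)ev; /* elided */ }
static void buf_put(struct buf *rb) { (void)rb; /* drop reference, elided */ }

/* Redirect an event's output from its current buffer to new_rb (may be NULL). */
static void ev_redirect(struct ev *event, struct buf *new_rb)
{
	struct buf *old_rb;

	pthread_mutex_lock(&event->mmap_mutex);
	old_rb = event->rb;

	if (old_rb)
		list_detach(old_rb, event);	/* off the old buffer's wait list */
	if (new_rb)
		list_attach(new_rb, event);	/* onto the new buffer's wait list */

	event->rb = new_rb;

	if (old_rb) {
		buf_put(old_rb);
		/*
		 * Between detach and attach no list carried this event, so a
		 * wakeup could have been missed; deliver one unconditionally.
		 */
		pthread_cond_broadcast(&event->waitq);
	}
	pthread_mutex_unlock(&event->mmap_mutex);
}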
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5bc6c8e9b851..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,7 +31,8 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
-	int				mmap_locked;
+	atomic_t			mmap_count;
+	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
 	struct perf_event_mmap_page	*user_page;