-rw-r--r--  kernel/events/core.c      228
-rw-r--r--  kernel/events/internal.h    3
2 files changed, 159 insertions, 72 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae752cd4a086..b391907d5352 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void) { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
 		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ unlock:
 	atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,6 +6487,8 @@ set:
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
@@ -6422,16 +6496,28 @@ set:
 		goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
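
[Editor's note] The perf_mmap_close() hunk above walks rb->event_list under RCU, but it must drop the RCU read lock before it can take each event's mmap_mutex, so it pins the event with atomic_long_inc_not_zero() and restarts the walk from the top after every detach. Below is a minimal userspace sketch of that lock-ordering pattern using pthreads and C11 atomics; the names (struct node, struct buf, buf_detach_all(), node_try_get()) are made up for illustration, and none of this is the kernel implementation.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* Made-up stand-ins for the perf_event / ring_buffer list linkage. */
struct node {
	atomic_int	refs;		/* like event->refcount */
	pthread_mutex_t	lock;		/* like event->mmap_mutex */
	bool		attached;	/* like event->rb == rb */
	struct node	*next;
};

struct buf {
	pthread_mutex_t	list_lock;	/* the kernel uses RCU + rb->event_lock */
	struct node	*head;
};

/* Only pin a node that is still alive, like atomic_long_inc_not_zero(). */
static bool node_try_get(struct node *n)
{
	int old = atomic_load(&n->refs);

	while (old != 0)
		if (atomic_compare_exchange_weak(&n->refs, &old, old + 1))
			return true;
	return false;
}

static void node_put(struct node *n)
{
	atomic_fetch_sub(&n->refs, 1);	/* freeing elided in this sketch */
}

/* Unlink; the list lock ranks below the per-node lock, like rb->event_lock. */
static void list_del_node(struct buf *b, struct node *n)
{
	pthread_mutex_lock(&b->list_lock);
	for (struct node **p = &b->head; *p; p = &(*p)->next) {
		if (*p == n) {
			*p = n->next;
			break;
		}
	}
	pthread_mutex_unlock(&b->list_lock);
}

/*
 * Detach every node from the buffer. We cannot take n->lock while
 * holding the list lock, so pin the node, drop the list lock, do the
 * detach under n->lock, and restart the walk -- the shape of the
 * "again:" loop in perf_mmap_close().
 */
static void buf_detach_all(struct buf *b)
{
again:
	pthread_mutex_lock(&b->list_lock);
	for (struct node *n = b->head; n; n = n->next) {
		if (!node_try_get(n))
			continue;	/* dying; its teardown will unlink it */

		pthread_mutex_unlock(&b->list_lock);

		pthread_mutex_lock(&n->lock);
		if (n->attached) {	/* could have been re-pointed meanwhile */
			n->attached = false;
			list_del_node(b, n);
		}
		pthread_mutex_unlock(&n->lock);

		node_put(n);
		goto again;	/* the iterator is stale once the lock was dropped */
	}
	pthread_mutex_unlock(&b->list_lock);
}

int main(void)
{
	struct buf b = { .list_lock = PTHREAD_MUTEX_INITIALIZER };
	struct node n = { .lock = PTHREAD_MUTEX_INITIALIZER, .attached = true };

	atomic_init(&n.refs, 1);
	b.head = &n;

	buf_detach_all(&b);
	return b.head == NULL ? 0 : 1;	/* 0: the node got unlinked */
}

The restart is what keeps the walk correct: once the list lock has been dropped the iterator can no longer be trusted, and every pass either unlinks a node or finds the list already shrunk by someone else.
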
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5bc6c8e9b851..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,7 +31,8 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
-	int				mmap_locked;
+	atomic_t			mmap_count;
+	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
 	struct perf_event_mmap_page	*user_page;
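
[Editor's note] The new ring_buffer::mmap_count added in internal.h is what lets perf_mmap() refuse to piggy-back on a buffer whose last mapping is concurrently being torn down: it may only take a reference while the count is still non-zero, and otherwise drops mmap_mutex and retries at the "again:" label. A small C11 sketch of that "increment only if not zero" idea, with hypothetical names (struct buf, buf_try_get(), buf_put()) rather than the kernel API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Made-up stand-in for the ring buffer and its new mmap_count. */
struct buf {
	atomic_int refs;	/* starts at 1 for the mmap() that created it */
};

/*
 * Take a reference only while the buffer is still mapped somewhere,
 * in the spirit of atomic_inc_not_zero(): once the count has hit 0
 * the buffer is being torn down and must not be revived.
 */
static bool buf_try_get(struct buf *b)
{
	int old = atomic_load(&b->refs);

	while (old != 0)
		if (atomic_compare_exchange_weak(&b->refs, &old, old + 1))
			return true;
	return false;
}

static void buf_put(struct buf *b)
{
	if (atomic_fetch_sub(&b->refs, 1) == 1) {
		/* last mapping gone: undo accounting, free the pages */
		printf("tearing down buffer\n");
		free(b);
	}
}

int main(void)
{
	struct buf *b = malloc(sizeof(*b));

	if (!b)
		return 1;
	atomic_init(&b->refs, 1);	/* first mmap() */

	if (buf_try_get(b))		/* a second mapping piggy-backs */
		printf("second mapping attached\n");

	buf_put(b);			/* second mapping unmapped */
	buf_put(b);			/* last unmap tears everything down */
	return 0;
}

When the try-get fails, a new mapper cannot reuse the dying buffer; in the patch above perf_mmap() handles that by releasing mmap_mutex and retrying, by which time perf_mmap_close() will have finished detaching the old buffer and a fresh one can be allocated.
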