author		Ingo Molnar <mingo@kernel.org>	2013-06-19 06:44:41 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-06-19 06:44:41 -0400
commit		eff2108f020f30eb90462205ecf3ce10a420938b (patch)
tree		27d1fc8ab23cd81e4863c29cddc0fbe8cfe68431 /kernel/events
parent		afb71193a4d8e4a3c4c52a80a8cbee76582f0e90 (diff)
parent		f1a527899ef0a8a241eb3bea619eb2e29d797f44 (diff)
Merge branch 'perf/urgent' into perf/core
Merge in the latest fixes, to avoid conflicts with ongoing work.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/events')
-rw-r--r--	kernel/events/core.c	233
-rw-r--r--	kernel/events/internal.h	4
2 files changed, 165 insertions, 72 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a0780b3a3d50..d0e0d0d2025f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -198,9 +198,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void) { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -3023,6 +3020,7 @@ static void free_event_rcu(struct rcu_head *head)
 }
 
 static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -3047,15 +3045,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
@@ -3293,30 +3306,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
@@ -3626,16 +3622,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3654,13 +3646,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3689,18 +3678,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 
 static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
 }
@@ -3710,26 +3691,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		unsigned long size = perf_data_size(event->rb);
-		struct user_struct *user = event->mmap_user;
-		struct ring_buffer *rb = event->rb;
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
+
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
 
-		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-		vma->vm_mm->pinned_vm -= event->mmap_locked;
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
+
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
+		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
 		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
 
-		ring_buffer_put(rb);
-		free_uid(user);
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3779,12 +3834,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages == nr_pages)
-			atomic_inc(&event->rb->refcount);
-		else
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
@@ -3825,12 +3892,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		ret = -ENOMEM;
 		goto unlock;
 	}
-	rcu_assign_pointer(event->rb, rb);
+
+	atomic_set(&rb->mmap_count, 1);
+	rb->mmap_locked = extra;
+	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
-	event->mmap_locked = extra;
-	event->mmap_user = get_current_user();
-	vma->vm_mm->pinned_vm += event->mmap_locked;
+	vma->vm_mm->pinned_vm += extra;
+
+	ring_buffer_attach(event, rb);
+	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
 
@@ -3839,7 +3910,11 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
-	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
 	return ret;
@@ -6565,6 +6640,8 @@ set:
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
@@ -6572,16 +6649,28 @@ set:
 			goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
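
The perf_mmap()/perf_mmap_close() hunks above hinge on one rule: a new mapping may only reuse an existing buffer while rb->mmap_count is still non-zero; if it races with the final unmap it must back off and start over. Below is a minimal userspace sketch of that inc-not-zero pattern using C11 atomics; the struct, helper name and single-threaded demo are illustrative stand-ins, not the kernel's atomic_t API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the rb->mmap_count field introduced by the patch. */
struct buf {
	atomic_int mmap_count;
};

/*
 * Succeed only while the count is non-zero, i.e. while at least one
 * mapping still owns the buffer (analogous to atomic_inc_not_zero()).
 */
static bool mmap_count_inc_not_zero(struct buf *rb)
{
	int old = atomic_load(&rb->mmap_count);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&rb->mmap_count, &old, old + 1))
			return true;	/* piggy-back on the existing buffer */
	}
	return false;	/* lost the race with the final unmap; caller retries */
}

int main(void)
{
	struct buf rb = { .mmap_count = 1 };	/* one live mapping */

	printf("reuse while mapped: %s\n",
	       mmap_count_inc_not_zero(&rb) ? "ok" : "retry");

	atomic_store(&rb.mmap_count, 0);	/* last mapping torn down */
	printf("reuse after teardown: %s\n",
	       mmap_count_inc_not_zero(&rb) ? "ok" : "retry (allocate a fresh buffer)");
	return 0;
}

In the patch itself, perf_mmap() performs this attempt under event->mmap_mutex and jumps back to its again: label when the increment fails, which is why the failure path above is labelled "retry".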
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59df..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,10 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
+	atomic_t			mmap_count;
+	unsigned long			mmap_locked;
+	struct user_struct		*mmap_user;
+
 	struct perf_event_mmap_page	*user_page;
 	void				*data_pages[0];
 };
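
The three fields added to struct ring_buffer let the last perf_mmap_close() of the buffer, rather than of any particular event, undo the pinned-memory accounting. The following is a rough, single-threaded C model of that ownership split under simplified assumptions: the types and helpers are stand-ins, and the locking, RCU and detach-all loop from the real patch are elided.

#include <stdbool.h>
#include <stdio.h>

struct user_struct;			/* opaque stand-in for the kernel type */

/* Mirrors the fields added to struct ring_buffer in internal.h. */
struct ring_buffer {
	int			mmap_count;	/* mappings across all events */
	unsigned long		mmap_locked;	/* pages charged to mmap_user */
	struct user_struct	*mmap_user;	/* credited back on teardown */
};

struct perf_event {
	int			mmap_count;	/* mappings of this event's fd */
	struct ring_buffer	*rb;
};

/* Like perf_mmap_open() in the patch: bump both the event and buffer counts. */
static void mmap_open(struct perf_event *event)
{
	event->mmap_count++;
	event->rb->mmap_count++;
}

/*
 * Like perf_mmap_close() in the patch: detaching is per-event work, but the
 * pinned-memory accounting is only undone once the buffer itself has no
 * mappings left, whichever event they were established through.
 */
static bool mmap_close(struct perf_event *event)
{
	struct ring_buffer *rb = event->rb;

	rb->mmap_count--;
	if (--event->mmap_count == 0)
		event->rb = NULL;	/* this event lets go of the buffer */

	if (rb->mmap_count)
		return false;		/* other mmap()s are still alive */

	/* last mapping: uncharge rb->mmap_locked against rb->mmap_user here */
	return true;
}

int main(void)
{
	struct ring_buffer rb = { 0 };
	struct perf_event a = { 0, &rb }, b = { 0, &rb };

	mmap_open(&a);
	mmap_open(&b);	/* a second mapping of the same buffer via another event */

	printf("close a -> teardown? %s\n", mmap_close(&a) ? "yes" : "no");
	printf("close b -> teardown? %s\n", mmap_close(&b) ? "yes" : "no");
	return 0;
}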