Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--  kernel/events/core.c | 511
1 file changed, 418 insertions(+), 93 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
 /*
  * max perf event sample rate
  */
 #define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
-        DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT 25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+        ATOMIC_INIT(DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
+void update_perf_cpu_limits(void)
+{
+        u64 tmp = perf_sample_period_ns;
+
+        tmp *= sysctl_perf_cpu_time_max_percent;
+        do_div(tmp, 100);
+        atomic_set(&perf_sample_allowed_ns, tmp);
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
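
The defaults above fix the sampling budget: a 100000 Hz ceiling gives a 10000 ns sample period, of which 25% (2500 ns) may be spent in the sampling path. The following userspace sketch checks that arithmetic using only the patch's own constants; the main() harness is illustrative, not kernel code.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC                    1000000000ULL
#define DEFAULT_MAX_SAMPLE_RATE         100000
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT    25

int main(void)
{
        /* same expression as the ATOMIC_INIT() initializer in the patch */
        uint64_t allowed_ns = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

        printf("sample period:      %llu ns\n",
               (unsigned long long)DEFAULT_SAMPLE_PERIOD_NS);   /* 10000 */
        printf("allowed per sample: %llu ns\n",
               (unsigned long long)allowed_ns);                 /* 2500 */
        return 0;
}
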
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
                 return ret;
 
         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+        update_perf_cpu_limits();
 
         return 0;
 }
 
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+                                void __user *buffer, size_t *lenp,
+                                loff_t *ppos)
+{
+        int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+        if (ret || !write)
+                return ret;
+
+        update_perf_cpu_limits();
+
+        return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+        u64 avg_local_sample_len;
+        u64 local_samples_len = __get_cpu_var(running_sample_length);
+
+        if (atomic_read(&perf_sample_allowed_ns) == 0)
+                return;
+
+        /* decay the counter by 1 average sample */
+        local_samples_len = __get_cpu_var(running_sample_length);
+        local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+        local_samples_len += sample_len_ns;
+        __get_cpu_var(running_sample_length) = local_samples_len;
+
+        /*
+         * note: this will be biased artificially low until we have
+         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+         * from having to maintain a count.
+         */
+        avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+        if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+                return;
+
+        if (max_samples_per_tick <= 1)
+                return;
+
+        max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+        sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+        printk_ratelimited(KERN_WARNING
+                        "perf samples too long (%lld > %d), lowering "
+                        "kernel.perf_event_max_sample_rate to %d\n",
+                        avg_local_sample_len,
+                        atomic_read(&perf_sample_allowed_ns),
+                        sysctl_perf_event_sample_rate);
+
+        update_perf_cpu_limits();
+}
+
 static atomic64_t perf_event_id;
 
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
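
perf_sample_event_took() above keeps a per-CPU decaying sum: each call drops 1/NR_ACCUMULATED_SAMPLES of the accumulated length and adds the new sample, so the average converges without maintaining a separate sample count. The standalone userspace sketch below reproduces that arithmetic; the 15 us samples and the 10 us budget in the harness are invented for illustration, only NR_ACCUMULATED_SAMPLES matches the patch.

#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

static uint64_t running_sample_length;  /* per-CPU in the real code */

/* Returns 1 when the decayed average exceeds allowed_ns, 0 otherwise. */
static int sample_took(uint64_t sample_len_ns, uint64_t allowed_ns)
{
        uint64_t avg;

        /* decay the accumulator by one average sample, then add the new one */
        running_sample_length -= running_sample_length / NR_ACCUMULATED_SAMPLES;
        running_sample_length += sample_len_ns;

        avg = running_sample_length / NR_ACCUMULATED_SAMPLES;
        return avg > allowed_ns;
}

int main(void)
{
        /* feed samples of 15 us against a hypothetical 10 us budget */
        for (int i = 0; i < 1000; i++) {
                if (sample_took(15000, 10000)) {
                        printf("would throttle after %d samples\n", i + 1);
                        break;
                }
        }
        return 0;
}
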
@@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-                               struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void) { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disabled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+        struct perf_cpu_context *cpuctx;
+        enum hrtimer_restart ret = HRTIMER_NORESTART;
+        int rotations = 0;
+
+        WARN_ON(!irqs_disabled());
+
+        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+        rotations = perf_rotate_context(cpuctx);
+
+        /*
+         * arm timer if needed
+         */
+        if (rotations) {
+                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+                ret = HRTIMER_RESTART;
+        }
+
+        return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+        struct perf_cpu_context *cpuctx;
+        struct pmu *pmu;
+        unsigned long flags;
+
+        if (WARN_ON(cpu != smp_processor_id()))
+                return;
+
+        local_irq_save(flags);
+
+        rcu_read_lock();
+
+        list_for_each_entry_rcu(pmu, &pmus, entry) {
+                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+                if (pmu->task_ctx_nr == perf_sw_context)
+                        continue;
+
+                hrtimer_cancel(&cpuctx->hrtimer);
+        }
+
+        rcu_read_unlock();
+
+        local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+        struct hrtimer *hr = &cpuctx->hrtimer;
+        struct pmu *pmu = cpuctx->ctx.pmu;
+        int timer;
+
+        /* no multiplexing needed for SW PMU */
+        if (pmu->task_ctx_nr == perf_sw_context)
+                return;
+
+        /*
+         * check default is sane, if not set then force to
+         * default interval (1/tick)
+         */
+        timer = pmu->hrtimer_interval_ms;
+        if (timer < 1)
+                timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+        hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+        hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+        struct hrtimer *hr = &cpuctx->hrtimer;
+        struct pmu *pmu = cpuctx->ctx.pmu;
+
+        /* not for SW PMU */
+        if (pmu->task_ctx_nr == perf_sw_context)
+                return;
+
+        if (hrtimer_active(hr))
+                return;
+
+        if (!hrtimer_callback_running(hr))
+                __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+                                         0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
         int *count = this_cpu_ptr(pmu->pmu_disable_count);
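
perf_cpu_hrtimer_handler() above receives only the struct hrtimer pointer and recovers its struct perf_cpu_context with container_of(). A minimal userspace illustration of that pattern follows, with hypothetical stand-in types rather than the real kernel structures.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_timer {                     /* stands in for struct hrtimer */
        long expires_ns;
};

struct fake_cpu_context {               /* stands in for struct perf_cpu_context */
        int cpu;
        struct fake_timer hrtimer;      /* embedded member, as in the patch */
};

static void timer_callback(struct fake_timer *t)
{
        /* step back from the member pointer to the enclosing structure */
        struct fake_cpu_context *ctx = container_of(t, struct fake_cpu_context, hrtimer);

        printf("callback ran for cpu %d\n", ctx->cpu);
}

int main(void)
{
        struct fake_cpu_context ctx = { .cpu = 3 };

        timer_callback(&ctx.hrtimer);   /* prints: callback ran for cpu 3 */
        return 0;
}
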
@@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event,
 
         if (event_sched_in(group_event, cpuctx, ctx)) {
                 pmu->cancel_txn(pmu);
+                perf_cpu_hrtimer_restart(cpuctx);
                 return -EAGAIN;
         }
 
@@ -1552,6 +1736,8 @@ group_error:
 
         pmu->cancel_txn(pmu);
 
+        perf_cpu_hrtimer_restart(cpuctx);
+
         return -EAGAIN;
 }
 
@@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info)
          * If this event can't go on and it's part of a
          * group, then the whole group has to come off.
          */
-        if (leader != event)
+        if (leader != event) {
                 group_sched_out(leader, cpuctx, ctx);
+                perf_cpu_hrtimer_restart(cpuctx);
+        }
         if (leader->attr.pinned) {
                 update_group_times(leader);
                 leader->state = PERF_EVENT_STATE_ERROR;
@@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
  * because they're strictly cpu affine and rotate_start is called with IRQs
  * disabled, while rotate_context is called from IRQ context.
  */
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
         struct perf_event_context *ctx = NULL;
         int rotate = 0, remove = 1;
@@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 done:
         if (remove)
                 list_del_init(&cpuctx->rotation_list);
+
+        return rotate;
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2625,10 +2815,6 @@ void perf_event_task_tick(void)
                 ctx = cpuctx->task_ctx;
                 if (ctx)
                         perf_adjust_freq_unthr_context(ctx, throttled);
-
-                if (cpuctx->jiffies_interval == 1 ||
-                                !(jiffies % cpuctx->jiffies_interval))
-                        perf_rotate_context(cpuctx);
         }
 }
 
@@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head)
 }
 
 static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event)
                 if (has_branch_stack(event)) {
                         static_key_slow_dec_deferred(&perf_sched_events);
                         /* is system-wide event */
-                        if (!(event->attach_state & PERF_ATTACH_TASK))
+                        if (!(event->attach_state & PERF_ATTACH_TASK)) {
                                 atomic_dec(&per_cpu(perf_branch_stack_events,
                                                     event->cpu));
+                        }
                 }
         }
 
         if (event->rb) {
-                ring_buffer_put(event->rb);
-                event->rb = NULL;
+                struct ring_buffer *rb;
+
+                /*
+                 * Can happen when we close an event with re-directed output.
+                 *
+                 * Since we have a 0 refcount, perf_mmap_close() will skip
+                 * over us; possibly making our ring_buffer_put() the last.
+                 */
+                mutex_lock(&event->mmap_mutex);
+                rb = event->rb;
+                if (rb) {
+                        rcu_assign_pointer(event->rb, NULL);
+                        ring_buffer_detach(event, rb);
+                        ring_buffer_put(rb); /* could be last */
+                }
+                mutex_unlock(&event->mmap_mutex);
         }
 
         if (is_cgroup_event(event))
@@ -3188,30 +3390,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
         unsigned int events = POLL_HUP;
 
         /*
-         * Race between perf_event_set_output() and perf_poll(): perf_poll()
-         * grabs the rb reference but perf_event_set_output() overrides it.
-         * Here is the timeline for two threads T1, T2:
-         * t0: T1, rb = rcu_dereference(event->rb)
-         * t1: T2, old_rb = event->rb
-         * t2: T2, event->rb = new rb
-         * t3: T2, ring_buffer_detach(old_rb)
-         * t4: T1, ring_buffer_attach(rb1)
-         * t5: T1, poll_wait(event->waitq)
-         *
-         * To avoid this problem, we grab mmap_mutex in perf_poll()
-         * thereby ensuring that the assignment of the new ring buffer
-         * and the detachment of the old buffer appear atomic to perf_poll()
+         * Pin the event->rb by taking event->mmap_mutex; otherwise
+         * perf_event_set_output() can swizzle our rb and make us miss wakeups.
          */
         mutex_lock(&event->mmap_mutex);
-
-        rcu_read_lock();
-        rb = rcu_dereference(event->rb);
-        if (rb) {
-                ring_buffer_attach(event, rb);
+        rb = event->rb;
+        if (rb)
                 events = atomic_xchg(&rb->poll, 0);
-        }
-        rcu_read_unlock();
-
         mutex_unlock(&event->mmap_mutex);
 
         poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3706,12 @@ static void ring_buffer_attach(struct perf_event *event,
                 return;
 
         spin_lock_irqsave(&rb->event_lock, flags);
-        if (!list_empty(&event->rb_entry))
-                goto unlock;
-
-        list_add(&event->rb_entry, &rb->event_list);
-unlock:
+        if (list_empty(&event->rb_entry))
+                list_add(&event->rb_entry, &rb->event_list);
         spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-                               struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
         unsigned long flags;
 
@@ -3549,13 +3730,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
         rcu_read_lock();
         rb = rcu_dereference(event->rb);
-        if (!rb)
-                goto unlock;
-
-        list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-                wake_up_all(&event->waitq);
-
-unlock:
+        if (rb) {
+                list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+                        wake_up_all(&event->waitq);
+        }
         rcu_read_unlock();
 }
 
@@ -3584,18 +3762,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 
 static void ring_buffer_put(struct ring_buffer *rb)
 {
-        struct perf_event *event, *n;
-        unsigned long flags;
-
         if (!atomic_dec_and_test(&rb->refcount))
                 return;
 
-        spin_lock_irqsave(&rb->event_lock, flags);
-        list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-                list_del_init(&event->rb_entry);
-                wake_up_all(&event->waitq);
-        }
-        spin_unlock_irqrestore(&rb->event_lock, flags);
+        WARN_ON_ONCE(!list_empty(&rb->event_list));
 
         call_rcu(&rb->rcu_head, rb_free_rcu);
 }
@@ -3605,26 +3775,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
         struct perf_event *event = vma->vm_file->private_data;
 
         atomic_inc(&event->mmap_count);
+        atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
         struct perf_event *event = vma->vm_file->private_data;
 
-        if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-                unsigned long size = perf_data_size(event->rb);
-                struct user_struct *user = event->mmap_user;
-                struct ring_buffer *rb = event->rb;
+        struct ring_buffer *rb = event->rb;
+        struct user_struct *mmap_user = rb->mmap_user;
+        int mmap_locked = rb->mmap_locked;
+        unsigned long size = perf_data_size(rb);
+
+        atomic_dec(&rb->mmap_count);
+
+        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+                return;
 
-                atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-                vma->vm_mm->pinned_vm -= event->mmap_locked;
-                rcu_assign_pointer(event->rb, NULL);
-                ring_buffer_detach(event, rb);
+        /* Detach current event from the buffer. */
+        rcu_assign_pointer(event->rb, NULL);
+        ring_buffer_detach(event, rb);
+        mutex_unlock(&event->mmap_mutex);
+
+        /* If there's still other mmap()s of this buffer, we're done. */
+        if (atomic_read(&rb->mmap_count)) {
+                ring_buffer_put(rb); /* can't be last */
+                return;
+        }
+
+        /*
+         * No other mmap()s, detach from all other events that might redirect
+         * into the now unreachable buffer. Somewhat complicated by the
+         * fact that rb::event_lock otherwise nests inside mmap_mutex.
+         */
+again:
+        rcu_read_lock();
+        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+                if (!atomic_long_inc_not_zero(&event->refcount)) {
+                        /*
+                         * This event is en-route to free_event() which will
+                         * detach it and remove it from the list.
+                         */
+                        continue;
+                }
+                rcu_read_unlock();
+
+                mutex_lock(&event->mmap_mutex);
+                /*
+                 * Check we didn't race with perf_event_set_output() which can
+                 * swizzle the rb from under us while we were waiting to
+                 * acquire mmap_mutex.
+                 *
+                 * If we find a different rb; ignore this event, a next
+                 * iteration will no longer find it on the list. We have to
+                 * still restart the iteration to make sure we're not now
+                 * iterating the wrong list.
+                 */
+                if (event->rb == rb) {
+                        rcu_assign_pointer(event->rb, NULL);
+                        ring_buffer_detach(event, rb);
+                        ring_buffer_put(rb); /* can't be last, we still have one */
+                }
                 mutex_unlock(&event->mmap_mutex);
+                put_event(event);
 
-                ring_buffer_put(rb);
-                free_uid(user);
+                /*
+                 * Restart the iteration; either we're on the wrong list or
+                 * destroyed its integrity by doing a deletion.
+                 */
+                goto again;
         }
+        rcu_read_unlock();
+
+        /*
+         * It could be there's still a few 0-ref events on the list; they'll
+         * get cleaned up by free_event() -- they'll also still have their
+         * ref on the rb and will free it whenever they are done with it.
+         *
+         * Aside from that, this buffer is 'fully' detached and unmapped,
+         * undo the VM accounting.
+         */
+
+        atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+        vma->vm_mm->pinned_vm -= mmap_locked;
+        free_uid(mmap_user);
+
+        ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
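
The loop in perf_mmap_close() above uses atomic_long_inc_not_zero() so it only takes a reference on events that are still live, skipping ones already on their way to free_event(). A userspace sketch of that "increment unless zero" idiom in C11 atomics follows; the names are invented and this is not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>

static int inc_not_zero(atomic_long *refcount)
{
        long old = atomic_load(refcount);

        while (old != 0) {
                /* try to bump old -> old + 1; on failure, old is reloaded */
                if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                        return 1;       /* got a reference */
        }
        return 0;                       /* already zero: do not touch the object */
}

int main(void)
{
        atomic_long live, dying;

        atomic_init(&live, 2);
        atomic_init(&dying, 0);

        printf("live:  %d\n", inc_not_zero(&live));     /* 1, refcount now 3 */
        printf("dying: %d\n", inc_not_zero(&dying));    /* 0, left at zero */
        return 0;
}
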
@@ -3674,12 +3918,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                 return -EINVAL;
 
         WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
         mutex_lock(&event->mmap_mutex);
         if (event->rb) {
-                if (event->rb->nr_pages == nr_pages)
-                        atomic_inc(&event->rb->refcount);
-                else
+                if (event->rb->nr_pages != nr_pages) {
                         ret = -EINVAL;
+                        goto unlock;
+                }
+
+                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+                        /*
+                         * Raced against perf_mmap_close() through
+                         * perf_event_set_output(). Try again, hope for better
+                         * luck.
+                         */
+                        mutex_unlock(&event->mmap_mutex);
+                        goto again;
+                }
+
                 goto unlock;
         }
 
@@ -3720,12 +3976,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                 ret = -ENOMEM;
                 goto unlock;
         }
-        rcu_assign_pointer(event->rb, rb);
+
+        atomic_set(&rb->mmap_count, 1);
+        rb->mmap_locked = extra;
+        rb->mmap_user = get_current_user();
 
         atomic_long_add(user_extra, &user->locked_vm);
-        event->mmap_locked = extra;
-        event->mmap_user = get_current_user();
-        vma->vm_mm->pinned_vm += event->mmap_locked;
+        vma->vm_mm->pinned_vm += extra;
+
+        ring_buffer_attach(event, rb);
+        rcu_assign_pointer(event->rb, rb);
 
         perf_event_update_userpage(event);
 
@@ -3734,7 +3994,11 @@ unlock:
                 atomic_inc(&event->mmap_count);
         mutex_unlock(&event->mmap_mutex);
 
-        vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+        /*
+         * Since pinned accounting is per vm we cannot allow fork() to copy our
+         * vma.
+         */
+        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
         vma->vm_ops = &perf_mmap_vmops;
 
         return ret;
@@ -4961,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
  * sign as trigger.
  */
 
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
 {
         struct hw_perf_event *hwc = &event->hw;
         u64 period = hwc->last_period;
@@ -5904,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
 
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+                                struct device_attribute *attr,
+                                char *page)
+{
+        struct pmu *pmu = dev_get_drvdata(dev);
+
+        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t count)
+{
+        struct pmu *pmu = dev_get_drvdata(dev);
+        int timer, cpu, ret;
+
+        ret = kstrtoint(buf, 0, &timer);
+        if (ret)
+                return ret;
+
+        if (timer < 1)
+                return -EINVAL;
+
+        /* same value, nothing to do */
+        if (timer == pmu->hrtimer_interval_ms)
+                return count;
+
+        pmu->hrtimer_interval_ms = timer;
+
+        /* update all cpuctx for this PMU */
+        for_each_possible_cpu(cpu) {
+                struct perf_cpu_context *cpuctx;
+                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+                if (hrtimer_active(&cpuctx->hrtimer))
+                        hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+        }
+
+        return count;
+}
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
 static struct device_attribute pmu_dev_attrs[] = {
         __ATTR_RO(type),
-        __ATTR_NULL,
+        __ATTR_RW(perf_event_mux_interval_ms),
+        __ATTR_NULL,
 };
 
 static int pmu_bus_running;
@@ -5952,7 +6263,7 @@ free_dev:
 static struct lock_class_key cpuctx_mutex;
 static struct lock_class_key cpuctx_lock;
 
-int perf_pmu_register(struct pmu *pmu, char *name, int type)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 {
         int cpu, ret;
 
@@ -6001,7 +6312,9 @@ skip_type:
                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                 cpuctx->ctx.type = cpu_context;
                 cpuctx->ctx.pmu = pmu;
-                cpuctx->jiffies_interval = 1;
+
+                __perf_cpu_hrtimer_init(cpuctx, cpu);
+
                 INIT_LIST_HEAD(&cpuctx->rotation_list);
                 cpuctx->unique_pmu = pmu;
         }
@@ -6327,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
                         return -EINVAL;
 
-                /* kernel level capture: check permissions */
-                if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
-                    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-                        return -EACCES;
-
                 /* propagate priv level, when not set for branch */
                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
 
@@ -6349,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          */
                         attr->branch_sample_type = mask;
                 }
+                /* privileged levels capture (kernel, hv): check permissions */
+                if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+                    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+                        return -EACCES;
         }
 
         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -6412,6 +6724,8 @@ set:
         if (atomic_read(&event->mmap_count))
                 goto unlock;
 
+        old_rb = event->rb;
+
         if (output_event) {
                 /* get the rb we want to redirect to */
                 rb = ring_buffer_get(output_event);
@@ -6419,16 +6733,28 @@ set:
                         goto unlock;
         }
 
-        old_rb = event->rb;
-        rcu_assign_pointer(event->rb, rb);
         if (old_rb)
                 ring_buffer_detach(event, old_rb);
+
+        if (rb)
+                ring_buffer_attach(event, rb);
+
+        rcu_assign_pointer(event->rb, rb);
+
+        if (old_rb) {
+                ring_buffer_put(old_rb);
+                /*
+                 * Since we detached before setting the new rb, so that we
+                 * could attach the new rb, we could have missed a wakeup.
+                 * Provide it now.
+                 */
+                wake_up_all(&event->waitq);
+        }
+
         ret = 0;
 unlock:
         mutex_unlock(&event->mmap_mutex);
 
-        if (old_rb)
-                ring_buffer_put(old_rb);
 out:
         return ret;
 }
@@ -7387,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
         case CPU_DOWN_PREPARE:
                 perf_event_exit_cpu(cpu);
                 break;
-
         default:
                 break;
         }