Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 511 |
1 file changed, 418 insertions, 93 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' | |||
165 | /* | 165 | /* |
166 | * max perf event sample rate | 166 | * max perf event sample rate |
167 | */ | 167 | */ |
168 | #define DEFAULT_MAX_SAMPLE_RATE 100000 | 168 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
169 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | 169 | #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) |
170 | static int max_samples_per_tick __read_mostly = | 170 | #define DEFAULT_CPU_TIME_MAX_PERCENT 25 |
171 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | 171 | |
172 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
173 | |||
174 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
175 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | ||
176 | |||
177 | static atomic_t perf_sample_allowed_ns __read_mostly = | ||
178 | ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); | ||
179 | |||
180 | void update_perf_cpu_limits(void) | ||
181 | { | ||
182 | u64 tmp = perf_sample_period_ns; | ||
183 | |||
184 | tmp *= sysctl_perf_cpu_time_max_percent; | ||
185 | do_div(tmp, 100); | ||
186 | atomic_set(&perf_sample_allowed_ns, tmp); | ||
187 | } | ||
188 | |||
189 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | ||
172 | 190 | ||
173 | int perf_proc_update_handler(struct ctl_table *table, int write, | 191 | int perf_proc_update_handler(struct ctl_table *table, int write, |
174 | void __user *buffer, size_t *lenp, | 192 | void __user *buffer, size_t *lenp, |
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
180 | return ret; | 198 | return ret; |
181 | 199 | ||
182 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | 200 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); |
201 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | ||
202 | update_perf_cpu_limits(); | ||
183 | 203 | ||
184 | return 0; | 204 | return 0; |
185 | } | 205 | } |
186 | 206 | ||
207 | int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; | ||
208 | |||
209 | int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | ||
210 | void __user *buffer, size_t *lenp, | ||
211 | loff_t *ppos) | ||
212 | { | ||
213 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
214 | |||
215 | if (ret || !write) | ||
216 | return ret; | ||
217 | |||
218 | update_perf_cpu_limits(); | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | /* | ||
224 | * perf samples are done in some very critical code paths (NMIs). | ||
225 | * If they take too much CPU time, the system can lock up and not | ||
226 | * get any real work done. This will drop the sample rate when | ||
227 | * we detect that events are taking too long. | ||
228 | */ | ||
229 | #define NR_ACCUMULATED_SAMPLES 128 | ||
230 | DEFINE_PER_CPU(u64, running_sample_length); | ||
231 | |||
232 | void perf_sample_event_took(u64 sample_len_ns) | ||
233 | { | ||
234 | u64 avg_local_sample_len; | ||
235 | u64 local_samples_len = __get_cpu_var(running_sample_length); | ||
236 | |||
237 | if (atomic_read(&perf_sample_allowed_ns) == 0) | ||
238 | return; | ||
239 | |||
240 | /* decay the counter by 1 average sample */ | ||
241 | local_samples_len = __get_cpu_var(running_sample_length); | ||
242 | local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; | ||
243 | local_samples_len += sample_len_ns; | ||
244 | __get_cpu_var(running_sample_length) = local_samples_len; | ||
245 | |||
246 | /* | ||
247 | * note: this will be biased artificially low until we have | ||
248 | * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us | ||
249 | * from having to maintain a count. | ||
250 | */ | ||
251 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | ||
252 | |||
253 | if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) | ||
254 | return; | ||
255 | |||
256 | if (max_samples_per_tick <= 1) | ||
257 | return; | ||
258 | |||
259 | max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); | ||
260 | sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; | ||
261 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | ||
262 | |||
263 | printk_ratelimited(KERN_WARNING | ||
264 | "perf samples too long (%lld > %d), lowering " | ||
265 | "kernel.perf_event_max_sample_rate to %d\n", | ||
266 | avg_local_sample_len, | ||
267 | atomic_read(&perf_sample_allowed_ns), | ||
268 | sysctl_perf_event_sample_rate); | ||
269 | |||
270 | update_perf_cpu_limits(); | ||
271 | } | ||
272 | |||
187 | static atomic64_t perf_event_id; | 273 | static atomic64_t perf_event_id; |
188 | 274 | ||
189 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 275 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
@@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
196 | static void update_context_time(struct perf_event_context *ctx); | 282 | static void update_context_time(struct perf_event_context *ctx); |
197 | static u64 perf_event_time(struct perf_event *event); | 283 | static u64 perf_event_time(struct perf_event *event); |
198 | 284 | ||
199 | static void ring_buffer_attach(struct perf_event *event, | ||
200 | struct ring_buffer *rb); | ||
201 | |||
202 | void __weak perf_event_print_debug(void) { } | 285 | void __weak perf_event_print_debug(void) { } |
203 | 286 | ||
204 | extern __weak const char *perf_pmu_name(void) | 287 | extern __weak const char *perf_pmu_name(void) |
@@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event, | |||
658 | } | 741 | } |
659 | #endif | 742 | #endif |
660 | 743 | ||
744 | /* | ||
745 | * set default to be dependent on timer tick just | ||
746 | * like original code | ||
747 | */ | ||
748 | #define PERF_CPU_HRTIMER (1000 / HZ) | ||
749 | /* | ||
750 | * function must be called with interrupts disabled | ||
751 | */ | ||
752 | static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) | ||
753 | { | ||
754 | struct perf_cpu_context *cpuctx; | ||
755 | enum hrtimer_restart ret = HRTIMER_NORESTART; | ||
756 | int rotations = 0; | ||
757 | |||
758 | WARN_ON(!irqs_disabled()); | ||
759 | |||
760 | cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); | ||
761 | |||
762 | rotations = perf_rotate_context(cpuctx); | ||
763 | |||
764 | /* | ||
765 | * arm timer if needed | ||
766 | */ | ||
767 | if (rotations) { | ||
768 | hrtimer_forward_now(hr, cpuctx->hrtimer_interval); | ||
769 | ret = HRTIMER_RESTART; | ||
770 | } | ||
771 | |||
772 | return ret; | ||
773 | } | ||
774 | |||
775 | /* CPU is going down */ | ||
776 | void perf_cpu_hrtimer_cancel(int cpu) | ||
777 | { | ||
778 | struct perf_cpu_context *cpuctx; | ||
779 | struct pmu *pmu; | ||
780 | unsigned long flags; | ||
781 | |||
782 | if (WARN_ON(cpu != smp_processor_id())) | ||
783 | return; | ||
784 | |||
785 | local_irq_save(flags); | ||
786 | |||
787 | rcu_read_lock(); | ||
788 | |||
789 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
790 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
791 | |||
792 | if (pmu->task_ctx_nr == perf_sw_context) | ||
793 | continue; | ||
794 | |||
795 | hrtimer_cancel(&cpuctx->hrtimer); | ||
796 | } | ||
797 | |||
798 | rcu_read_unlock(); | ||
799 | |||
800 | local_irq_restore(flags); | ||
801 | } | ||
802 | |||
803 | static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) | ||
804 | { | ||
805 | struct hrtimer *hr = &cpuctx->hrtimer; | ||
806 | struct pmu *pmu = cpuctx->ctx.pmu; | ||
807 | int timer; | ||
808 | |||
809 | /* no multiplexing needed for SW PMU */ | ||
810 | if (pmu->task_ctx_nr == perf_sw_context) | ||
811 | return; | ||
812 | |||
813 | /* | ||
814 | * check that the default is sane; if not set, force to the | ||
815 | * default interval (1/tick) | ||
816 | */ | ||
817 | timer = pmu->hrtimer_interval_ms; | ||
818 | if (timer < 1) | ||
819 | timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; | ||
820 | |||
821 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); | ||
822 | |||
823 | hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); | ||
824 | hr->function = perf_cpu_hrtimer_handler; | ||
825 | } | ||
826 | |||
827 | static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) | ||
828 | { | ||
829 | struct hrtimer *hr = &cpuctx->hrtimer; | ||
830 | struct pmu *pmu = cpuctx->ctx.pmu; | ||
831 | |||
832 | /* not for SW PMU */ | ||
833 | if (pmu->task_ctx_nr == perf_sw_context) | ||
834 | return; | ||
835 | |||
836 | if (hrtimer_active(hr)) | ||
837 | return; | ||
838 | |||
839 | if (!hrtimer_callback_running(hr)) | ||
840 | __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, | ||
841 | 0, HRTIMER_MODE_REL_PINNED, 0); | ||
842 | } | ||
843 | |||
661 | void perf_pmu_disable(struct pmu *pmu) | 844 | void perf_pmu_disable(struct pmu *pmu) |
662 | { | 845 | { |
663 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 846 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
@@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event, | |||
1506 | 1689 | ||
1507 | if (event_sched_in(group_event, cpuctx, ctx)) { | 1690 | if (event_sched_in(group_event, cpuctx, ctx)) { |
1508 | pmu->cancel_txn(pmu); | 1691 | pmu->cancel_txn(pmu); |
1692 | perf_cpu_hrtimer_restart(cpuctx); | ||
1509 | return -EAGAIN; | 1693 | return -EAGAIN; |
1510 | } | 1694 | } |
1511 | 1695 | ||
@@ -1552,6 +1736,8 @@ group_error: | |||
1552 | 1736 | ||
1553 | pmu->cancel_txn(pmu); | 1737 | pmu->cancel_txn(pmu); |
1554 | 1738 | ||
1739 | perf_cpu_hrtimer_restart(cpuctx); | ||
1740 | |||
1555 | return -EAGAIN; | 1741 | return -EAGAIN; |
1556 | } | 1742 | } |
1557 | 1743 | ||
@@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info) | |||
1807 | * If this event can't go on and it's part of a | 1993 | * If this event can't go on and it's part of a |
1808 | * group, then the whole group has to come off. | 1994 | * group, then the whole group has to come off. |
1809 | */ | 1995 | */ |
1810 | if (leader != event) | 1996 | if (leader != event) { |
1811 | group_sched_out(leader, cpuctx, ctx); | 1997 | group_sched_out(leader, cpuctx, ctx); |
1998 | perf_cpu_hrtimer_restart(cpuctx); | ||
1999 | } | ||
1812 | if (leader->attr.pinned) { | 2000 | if (leader->attr.pinned) { |
1813 | update_group_times(leader); | 2001 | update_group_times(leader); |
1814 | leader->state = PERF_EVENT_STATE_ERROR; | 2002 | leader->state = PERF_EVENT_STATE_ERROR; |
@@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
2555 | * because they're strictly cpu affine and rotate_start is called with IRQs | 2743 | * because they're strictly cpu affine and rotate_start is called with IRQs |
2556 | * disabled, while rotate_context is called from IRQ context. | 2744 | * disabled, while rotate_context is called from IRQ context. |
2557 | */ | 2745 | */ |
2558 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | 2746 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) |
2559 | { | 2747 | { |
2560 | struct perf_event_context *ctx = NULL; | 2748 | struct perf_event_context *ctx = NULL; |
2561 | int rotate = 0, remove = 1; | 2749 | int rotate = 0, remove = 1; |
@@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2594 | done: | 2782 | done: |
2595 | if (remove) | 2783 | if (remove) |
2596 | list_del_init(&cpuctx->rotation_list); | 2784 | list_del_init(&cpuctx->rotation_list); |
2785 | |||
2786 | return rotate; | ||
2597 | } | 2787 | } |
2598 | 2788 | ||
2599 | #ifdef CONFIG_NO_HZ_FULL | 2789 | #ifdef CONFIG_NO_HZ_FULL |
@@ -2625,10 +2815,6 @@ void perf_event_task_tick(void) | |||
2625 | ctx = cpuctx->task_ctx; | 2815 | ctx = cpuctx->task_ctx; |
2626 | if (ctx) | 2816 | if (ctx) |
2627 | perf_adjust_freq_unthr_context(ctx, throttled); | 2817 | perf_adjust_freq_unthr_context(ctx, throttled); |
2628 | |||
2629 | if (cpuctx->jiffies_interval == 1 || | ||
2630 | !(jiffies % cpuctx->jiffies_interval)) | ||
2631 | perf_rotate_context(cpuctx); | ||
2632 | } | 2818 | } |
2633 | } | 2819 | } |
2634 | 2820 | ||
@@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
2918 | } | 3104 | } |
2919 | 3105 | ||
2920 | static void ring_buffer_put(struct ring_buffer *rb); | 3106 | static void ring_buffer_put(struct ring_buffer *rb); |
3107 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | ||
2921 | 3108 | ||
2922 | static void free_event(struct perf_event *event) | 3109 | static void free_event(struct perf_event *event) |
2923 | { | 3110 | { |
@@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event) | |||
2942 | if (has_branch_stack(event)) { | 3129 | if (has_branch_stack(event)) { |
2943 | static_key_slow_dec_deferred(&perf_sched_events); | 3130 | static_key_slow_dec_deferred(&perf_sched_events); |
2944 | /* is system-wide event */ | 3131 | /* is system-wide event */ |
2945 | if (!(event->attach_state & PERF_ATTACH_TASK)) | 3132 | if (!(event->attach_state & PERF_ATTACH_TASK)) { |
2946 | atomic_dec(&per_cpu(perf_branch_stack_events, | 3133 | atomic_dec(&per_cpu(perf_branch_stack_events, |
2947 | event->cpu)); | 3134 | event->cpu)); |
3135 | } | ||
2948 | } | 3136 | } |
2949 | } | 3137 | } |
2950 | 3138 | ||
2951 | if (event->rb) { | 3139 | if (event->rb) { |
2952 | ring_buffer_put(event->rb); | 3140 | struct ring_buffer *rb; |
2953 | event->rb = NULL; | 3141 | |
3142 | /* | ||
3143 | * Can happen when we close an event with re-directed output. | ||
3144 | * | ||
3145 | * Since we have a 0 refcount, perf_mmap_close() will skip | ||
3146 | * over us; possibly making our ring_buffer_put() the last. | ||
3147 | */ | ||
3148 | mutex_lock(&event->mmap_mutex); | ||
3149 | rb = event->rb; | ||
3150 | if (rb) { | ||
3151 | rcu_assign_pointer(event->rb, NULL); | ||
3152 | ring_buffer_detach(event, rb); | ||
3153 | ring_buffer_put(rb); /* could be last */ | ||
3154 | } | ||
3155 | mutex_unlock(&event->mmap_mutex); | ||
2954 | } | 3156 | } |
2955 | 3157 | ||
2956 | if (is_cgroup_event(event)) | 3158 | if (is_cgroup_event(event)) |
@@ -3188,30 +3390,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
3188 | unsigned int events = POLL_HUP; | 3390 | unsigned int events = POLL_HUP; |
3189 | 3391 | ||
3190 | /* | 3392 | /* |
3191 | * Race between perf_event_set_output() and perf_poll(): perf_poll() | 3393 | * Pin the event->rb by taking event->mmap_mutex; otherwise |
3192 | * grabs the rb reference but perf_event_set_output() overrides it. | 3394 | * perf_event_set_output() can swizzle our rb and make us miss wakeups. |
3193 | * Here is the timeline for two threads T1, T2: | ||
3194 | * t0: T1, rb = rcu_dereference(event->rb) | ||
3195 | * t1: T2, old_rb = event->rb | ||
3196 | * t2: T2, event->rb = new rb | ||
3197 | * t3: T2, ring_buffer_detach(old_rb) | ||
3198 | * t4: T1, ring_buffer_attach(rb1) | ||
3199 | * t5: T1, poll_wait(event->waitq) | ||
3200 | * | ||
3201 | * To avoid this problem, we grab mmap_mutex in perf_poll() | ||
3202 | * thereby ensuring that the assignment of the new ring buffer | ||
3203 | * and the detachment of the old buffer appear atomic to perf_poll() | ||
3204 | */ | 3395 | */ |
3205 | mutex_lock(&event->mmap_mutex); | 3396 | mutex_lock(&event->mmap_mutex); |
3206 | 3397 | rb = event->rb; | |
3207 | rcu_read_lock(); | 3398 | if (rb) |
3208 | rb = rcu_dereference(event->rb); | ||
3209 | if (rb) { | ||
3210 | ring_buffer_attach(event, rb); | ||
3211 | events = atomic_xchg(&rb->poll, 0); | 3399 | events = atomic_xchg(&rb->poll, 0); |
3212 | } | ||
3213 | rcu_read_unlock(); | ||
3214 | |||
3215 | mutex_unlock(&event->mmap_mutex); | 3400 | mutex_unlock(&event->mmap_mutex); |
3216 | 3401 | ||
3217 | poll_wait(file, &event->waitq, wait); | 3402 | poll_wait(file, &event->waitq, wait); |
@@ -3521,16 +3706,12 @@ static void ring_buffer_attach(struct perf_event *event, | |||
3521 | return; | 3706 | return; |
3522 | 3707 | ||
3523 | spin_lock_irqsave(&rb->event_lock, flags); | 3708 | spin_lock_irqsave(&rb->event_lock, flags); |
3524 | if (!list_empty(&event->rb_entry)) | 3709 | if (list_empty(&event->rb_entry)) |
3525 | goto unlock; | 3710 | list_add(&event->rb_entry, &rb->event_list); |
3526 | |||
3527 | list_add(&event->rb_entry, &rb->event_list); | ||
3528 | unlock: | ||
3529 | spin_unlock_irqrestore(&rb->event_lock, flags); | 3711 | spin_unlock_irqrestore(&rb->event_lock, flags); |
3530 | } | 3712 | } |
3531 | 3713 | ||
3532 | static void ring_buffer_detach(struct perf_event *event, | 3714 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) |
3533 | struct ring_buffer *rb) | ||
3534 | { | 3715 | { |
3535 | unsigned long flags; | 3716 | unsigned long flags; |
3536 | 3717 | ||
@@ -3549,13 +3730,10 @@ static void ring_buffer_wakeup(struct perf_event *event) | |||
3549 | 3730 | ||
3550 | rcu_read_lock(); | 3731 | rcu_read_lock(); |
3551 | rb = rcu_dereference(event->rb); | 3732 | rb = rcu_dereference(event->rb); |
3552 | if (!rb) | 3733 | if (rb) { |
3553 | goto unlock; | 3734 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) |
3554 | 3735 | wake_up_all(&event->waitq); | |
3555 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) | 3736 | } |
3556 | wake_up_all(&event->waitq); | ||
3557 | |||
3558 | unlock: | ||
3559 | rcu_read_unlock(); | 3737 | rcu_read_unlock(); |
3560 | } | 3738 | } |
3561 | 3739 | ||
@@ -3584,18 +3762,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
3584 | 3762 | ||
3585 | static void ring_buffer_put(struct ring_buffer *rb) | 3763 | static void ring_buffer_put(struct ring_buffer *rb) |
3586 | { | 3764 | { |
3587 | struct perf_event *event, *n; | ||
3588 | unsigned long flags; | ||
3589 | |||
3590 | if (!atomic_dec_and_test(&rb->refcount)) | 3765 | if (!atomic_dec_and_test(&rb->refcount)) |
3591 | return; | 3766 | return; |
3592 | 3767 | ||
3593 | spin_lock_irqsave(&rb->event_lock, flags); | 3768 | WARN_ON_ONCE(!list_empty(&rb->event_list)); |
3594 | list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { | ||
3595 | list_del_init(&event->rb_entry); | ||
3596 | wake_up_all(&event->waitq); | ||
3597 | } | ||
3598 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3599 | 3769 | ||
3600 | call_rcu(&rb->rcu_head, rb_free_rcu); | 3770 | call_rcu(&rb->rcu_head, rb_free_rcu); |
3601 | } | 3771 | } |
@@ -3605,26 +3775,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
3605 | struct perf_event *event = vma->vm_file->private_data; | 3775 | struct perf_event *event = vma->vm_file->private_data; |
3606 | 3776 | ||
3607 | atomic_inc(&event->mmap_count); | 3777 | atomic_inc(&event->mmap_count); |
3778 | atomic_inc(&event->rb->mmap_count); | ||
3608 | } | 3779 | } |
3609 | 3780 | ||
3781 | /* | ||
3782 | * A buffer can be mmap()ed multiple times; either directly through the same | ||
3783 | * event, or through other events by use of perf_event_set_output(). | ||
3784 | * | ||
3785 | * In order to undo the VM accounting done by perf_mmap() we need to destroy | ||
3786 | * the buffer here, where we still have a VM context. This means we need | ||
3787 | * to detach all events redirecting to us. | ||
3788 | */ | ||
3610 | static void perf_mmap_close(struct vm_area_struct *vma) | 3789 | static void perf_mmap_close(struct vm_area_struct *vma) |
3611 | { | 3790 | { |
3612 | struct perf_event *event = vma->vm_file->private_data; | 3791 | struct perf_event *event = vma->vm_file->private_data; |
3613 | 3792 | ||
3614 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 3793 | struct ring_buffer *rb = event->rb; |
3615 | unsigned long size = perf_data_size(event->rb); | 3794 | struct user_struct *mmap_user = rb->mmap_user; |
3616 | struct user_struct *user = event->mmap_user; | 3795 | int mmap_locked = rb->mmap_locked; |
3617 | struct ring_buffer *rb = event->rb; | 3796 | unsigned long size = perf_data_size(rb); |
3797 | |||
3798 | atomic_dec(&rb->mmap_count); | ||
3799 | |||
3800 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | ||
3801 | return; | ||
3618 | 3802 | ||
3619 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 3803 | /* Detach current event from the buffer. */ |
3620 | vma->vm_mm->pinned_vm -= event->mmap_locked; | 3804 | rcu_assign_pointer(event->rb, NULL); |
3621 | rcu_assign_pointer(event->rb, NULL); | 3805 | ring_buffer_detach(event, rb); |
3622 | ring_buffer_detach(event, rb); | 3806 | mutex_unlock(&event->mmap_mutex); |
3807 | |||
3808 | /* If there are still other mmap()s of this buffer, we're done. */ | ||
3809 | if (atomic_read(&rb->mmap_count)) { | ||
3810 | ring_buffer_put(rb); /* can't be last */ | ||
3811 | return; | ||
3812 | } | ||
3813 | |||
3814 | /* | ||
3815 | * No other mmap()s, detach from all other events that might redirect | ||
3816 | * into the now unreachable buffer. Somewhat complicated by the | ||
3817 | * fact that rb::event_lock otherwise nests inside mmap_mutex. | ||
3818 | */ | ||
3819 | again: | ||
3820 | rcu_read_lock(); | ||
3821 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { | ||
3822 | if (!atomic_long_inc_not_zero(&event->refcount)) { | ||
3823 | /* | ||
3824 | * This event is en-route to free_event() which will | ||
3825 | * detach it and remove it from the list. | ||
3826 | */ | ||
3827 | continue; | ||
3828 | } | ||
3829 | rcu_read_unlock(); | ||
3830 | |||
3831 | mutex_lock(&event->mmap_mutex); | ||
3832 | /* | ||
3833 | * Check we didn't race with perf_event_set_output() which can | ||
3834 | * swizzle the rb from under us while we were waiting to | ||
3835 | * acquire mmap_mutex. | ||
3836 | * | ||
3837 | * If we find a different rb, ignore this event; the next | ||
3838 | * iteration will no longer find it on the list. We have to | ||
3839 | * still restart the iteration to make sure we're not now | ||
3840 | * iterating the wrong list. | ||
3841 | */ | ||
3842 | if (event->rb == rb) { | ||
3843 | rcu_assign_pointer(event->rb, NULL); | ||
3844 | ring_buffer_detach(event, rb); | ||
3845 | ring_buffer_put(rb); /* can't be last, we still have one */ | ||
3846 | } | ||
3623 | mutex_unlock(&event->mmap_mutex); | 3847 | mutex_unlock(&event->mmap_mutex); |
3848 | put_event(event); | ||
3624 | 3849 | ||
3625 | ring_buffer_put(rb); | 3850 | /* |
3626 | free_uid(user); | 3851 | * Restart the iteration; either we're on the wrong list or |
3852 | * destroyed its integrity by doing a deletion. | ||
3853 | */ | ||
3854 | goto again; | ||
3627 | } | 3855 | } |
3856 | rcu_read_unlock(); | ||
3857 | |||
3858 | /* | ||
3859 | * It could be that there are still a few 0-ref events on the list; they'll | ||
3860 | * get cleaned up by free_event() -- they'll also still have their | ||
3861 | * ref on the rb and will free it whenever they are done with it. | ||
3862 | * | ||
3863 | * Aside from that, this buffer is 'fully' detached and unmapped, | ||
3864 | * undo the VM accounting. | ||
3865 | */ | ||
3866 | |||
3867 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); | ||
3868 | vma->vm_mm->pinned_vm -= mmap_locked; | ||
3869 | free_uid(mmap_user); | ||
3870 | |||
3871 | ring_buffer_put(rb); /* could be last */ | ||
3628 | } | 3872 | } |
3629 | 3873 | ||
3630 | static const struct vm_operations_struct perf_mmap_vmops = { | 3874 | static const struct vm_operations_struct perf_mmap_vmops = { |
@@ -3674,12 +3918,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3674 | return -EINVAL; | 3918 | return -EINVAL; |
3675 | 3919 | ||
3676 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3920 | WARN_ON_ONCE(event->ctx->parent_ctx); |
3921 | again: | ||
3677 | mutex_lock(&event->mmap_mutex); | 3922 | mutex_lock(&event->mmap_mutex); |
3678 | if (event->rb) { | 3923 | if (event->rb) { |
3679 | if (event->rb->nr_pages == nr_pages) | 3924 | if (event->rb->nr_pages != nr_pages) { |
3680 | atomic_inc(&event->rb->refcount); | ||
3681 | else | ||
3682 | ret = -EINVAL; | 3925 | ret = -EINVAL; |
3926 | goto unlock; | ||
3927 | } | ||
3928 | |||
3929 | if (!atomic_inc_not_zero(&event->rb->mmap_count)) { | ||
3930 | /* | ||
3931 | * Raced against perf_mmap_close() through | ||
3932 | * perf_event_set_output(). Try again, hope for better | ||
3933 | * luck. | ||
3934 | */ | ||
3935 | mutex_unlock(&event->mmap_mutex); | ||
3936 | goto again; | ||
3937 | } | ||
3938 | |||
3683 | goto unlock; | 3939 | goto unlock; |
3684 | } | 3940 | } |
3685 | 3941 | ||
@@ -3720,12 +3976,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3720 | ret = -ENOMEM; | 3976 | ret = -ENOMEM; |
3721 | goto unlock; | 3977 | goto unlock; |
3722 | } | 3978 | } |
3723 | rcu_assign_pointer(event->rb, rb); | 3979 | |
3980 | atomic_set(&rb->mmap_count, 1); | ||
3981 | rb->mmap_locked = extra; | ||
3982 | rb->mmap_user = get_current_user(); | ||
3724 | 3983 | ||
3725 | atomic_long_add(user_extra, &user->locked_vm); | 3984 | atomic_long_add(user_extra, &user->locked_vm); |
3726 | event->mmap_locked = extra; | 3985 | vma->vm_mm->pinned_vm += extra; |
3727 | event->mmap_user = get_current_user(); | 3986 | |
3728 | vma->vm_mm->pinned_vm += event->mmap_locked; | 3987 | ring_buffer_attach(event, rb); |
3988 | rcu_assign_pointer(event->rb, rb); | ||
3729 | 3989 | ||
3730 | perf_event_update_userpage(event); | 3990 | perf_event_update_userpage(event); |
3731 | 3991 | ||
@@ -3734,7 +3994,11 @@ unlock: | |||
3734 | atomic_inc(&event->mmap_count); | 3994 | atomic_inc(&event->mmap_count); |
3735 | mutex_unlock(&event->mmap_mutex); | 3995 | mutex_unlock(&event->mmap_mutex); |
3736 | 3996 | ||
3737 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; | 3997 | /* |
3998 | * Since pinned accounting is per vm we cannot allow fork() to copy our | ||
3999 | * vma. | ||
4000 | */ | ||
4001 | vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; | ||
3738 | vma->vm_ops = &perf_mmap_vmops; | 4002 | vma->vm_ops = &perf_mmap_vmops; |
3739 | 4003 | ||
3740 | return ret; | 4004 | return ret; |
@@ -4961,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | |||
4961 | * sign as trigger. | 5225 | * sign as trigger. |
4962 | */ | 5226 | */ |
4963 | 5227 | ||
4964 | static u64 perf_swevent_set_period(struct perf_event *event) | 5228 | u64 perf_swevent_set_period(struct perf_event *event) |
4965 | { | 5229 | { |
4966 | struct hw_perf_event *hwc = &event->hw; | 5230 | struct hw_perf_event *hwc = &event->hw; |
4967 | u64 period = hwc->last_period; | 5231 | u64 period = hwc->last_period; |
@@ -5904,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) | |||
5904 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | 6168 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
5905 | } | 6169 | } |
5906 | 6170 | ||
6171 | static ssize_t | ||
6172 | perf_event_mux_interval_ms_show(struct device *dev, | ||
6173 | struct device_attribute *attr, | ||
6174 | char *page) | ||
6175 | { | ||
6176 | struct pmu *pmu = dev_get_drvdata(dev); | ||
6177 | |||
6178 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); | ||
6179 | } | ||
6180 | |||
6181 | static ssize_t | ||
6182 | perf_event_mux_interval_ms_store(struct device *dev, | ||
6183 | struct device_attribute *attr, | ||
6184 | const char *buf, size_t count) | ||
6185 | { | ||
6186 | struct pmu *pmu = dev_get_drvdata(dev); | ||
6187 | int timer, cpu, ret; | ||
6188 | |||
6189 | ret = kstrtoint(buf, 0, &timer); | ||
6190 | if (ret) | ||
6191 | return ret; | ||
6192 | |||
6193 | if (timer < 1) | ||
6194 | return -EINVAL; | ||
6195 | |||
6196 | /* same value, nothing to do */ | ||
6197 | if (timer == pmu->hrtimer_interval_ms) | ||
6198 | return count; | ||
6199 | |||
6200 | pmu->hrtimer_interval_ms = timer; | ||
6201 | |||
6202 | /* update all cpuctx for this PMU */ | ||
6203 | for_each_possible_cpu(cpu) { | ||
6204 | struct perf_cpu_context *cpuctx; | ||
6205 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
6206 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); | ||
6207 | |||
6208 | if (hrtimer_active(&cpuctx->hrtimer)) | ||
6209 | hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); | ||
6210 | } | ||
6211 | |||
6212 | return count; | ||
6213 | } | ||
6214 | |||
6215 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) | ||
6216 | |||
5907 | static struct device_attribute pmu_dev_attrs[] = { | 6217 | static struct device_attribute pmu_dev_attrs[] = { |
5908 | __ATTR_RO(type), | 6218 | __ATTR_RO(type), |
5909 | __ATTR_NULL, | 6219 | __ATTR_RW(perf_event_mux_interval_ms), |
6220 | __ATTR_NULL, | ||
5910 | }; | 6221 | }; |
5911 | 6222 | ||
5912 | static int pmu_bus_running; | 6223 | static int pmu_bus_running; |
@@ -5952,7 +6263,7 @@ free_dev: | |||
5952 | static struct lock_class_key cpuctx_mutex; | 6263 | static struct lock_class_key cpuctx_mutex; |
5953 | static struct lock_class_key cpuctx_lock; | 6264 | static struct lock_class_key cpuctx_lock; |
5954 | 6265 | ||
5955 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | 6266 | int perf_pmu_register(struct pmu *pmu, const char *name, int type) |
5956 | { | 6267 | { |
5957 | int cpu, ret; | 6268 | int cpu, ret; |
5958 | 6269 | ||
@@ -6001,7 +6312,9 @@ skip_type: | |||
6001 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | 6312 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); |
6002 | cpuctx->ctx.type = cpu_context; | 6313 | cpuctx->ctx.type = cpu_context; |
6003 | cpuctx->ctx.pmu = pmu; | 6314 | cpuctx->ctx.pmu = pmu; |
6004 | cpuctx->jiffies_interval = 1; | 6315 | |
6316 | __perf_cpu_hrtimer_init(cpuctx, cpu); | ||
6317 | |||
6005 | INIT_LIST_HEAD(&cpuctx->rotation_list); | 6318 | INIT_LIST_HEAD(&cpuctx->rotation_list); |
6006 | cpuctx->unique_pmu = pmu; | 6319 | cpuctx->unique_pmu = pmu; |
6007 | } | 6320 | } |
@@ -6327,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6327 | if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) | 6640 | if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) |
6328 | return -EINVAL; | 6641 | return -EINVAL; |
6329 | 6642 | ||
6330 | /* kernel level capture: check permissions */ | ||
6331 | if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) | ||
6332 | && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
6333 | return -EACCES; | ||
6334 | |||
6335 | /* propagate priv level, when not set for branch */ | 6643 | /* propagate priv level, when not set for branch */ |
6336 | if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { | 6644 | if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { |
6337 | 6645 | ||
@@ -6349,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6349 | */ | 6657 | */ |
6350 | attr->branch_sample_type = mask; | 6658 | attr->branch_sample_type = mask; |
6351 | } | 6659 | } |
6660 | /* privileged levels capture (kernel, hv): check permissions */ | ||
6661 | if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) | ||
6662 | && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
6663 | return -EACCES; | ||
6352 | } | 6664 | } |
6353 | 6665 | ||
6354 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { | 6666 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { |
@@ -6412,6 +6724,8 @@ set: | |||
6412 | if (atomic_read(&event->mmap_count)) | 6724 | if (atomic_read(&event->mmap_count)) |
6413 | goto unlock; | 6725 | goto unlock; |
6414 | 6726 | ||
6727 | old_rb = event->rb; | ||
6728 | |||
6415 | if (output_event) { | 6729 | if (output_event) { |
6416 | /* get the rb we want to redirect to */ | 6730 | /* get the rb we want to redirect to */ |
6417 | rb = ring_buffer_get(output_event); | 6731 | rb = ring_buffer_get(output_event); |
@@ -6419,16 +6733,28 @@ set: | |||
6419 | goto unlock; | 6733 | goto unlock; |
6420 | } | 6734 | } |
6421 | 6735 | ||
6422 | old_rb = event->rb; | ||
6423 | rcu_assign_pointer(event->rb, rb); | ||
6424 | if (old_rb) | 6736 | if (old_rb) |
6425 | ring_buffer_detach(event, old_rb); | 6737 | ring_buffer_detach(event, old_rb); |
6738 | |||
6739 | if (rb) | ||
6740 | ring_buffer_attach(event, rb); | ||
6741 | |||
6742 | rcu_assign_pointer(event->rb, rb); | ||
6743 | |||
6744 | if (old_rb) { | ||
6745 | ring_buffer_put(old_rb); | ||
6746 | /* | ||
6747 | * Since we detached the old rb before attaching the new one, | ||
6748 | * we could have missed a wakeup. | ||
6749 | * Provide it now. | ||
6750 | */ | ||
6751 | wake_up_all(&event->waitq); | ||
6752 | } | ||
6753 | |||
6426 | ret = 0; | 6754 | ret = 0; |
6427 | unlock: | 6755 | unlock: |
6428 | mutex_unlock(&event->mmap_mutex); | 6756 | mutex_unlock(&event->mmap_mutex); |
6429 | 6757 | ||
6430 | if (old_rb) | ||
6431 | ring_buffer_put(old_rb); | ||
6432 | out: | 6758 | out: |
6433 | return ret; | 6759 | return ret; |
6434 | } | 6760 | } |
@@ -7387,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
7387 | case CPU_DOWN_PREPARE: | 7713 | case CPU_DOWN_PREPARE: |
7388 | perf_event_exit_cpu(cpu); | 7714 | perf_event_exit_cpu(cpu); |
7389 | break; | 7715 | break; |
7390 | |||
7391 | default: | 7716 | default: |
7392 | break; | 7717 | break; |
7393 | } | 7718 | } |