Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--  kernel/perf_counter.c  245
1 file changed, 156 insertions(+), 89 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 199ed4771315..b0b20a07f394 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1104,7 +1104,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,
 		__perf_counter_sync_stat(counter, next_counter);
 
 		counter = list_next_entry(counter, event_entry);
-		next_counter = list_next_entry(counter, event_entry);
+		next_counter = list_next_entry(next_counter, event_entry);
 	}
 }
 
@@ -2714,6 +2714,18 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += sizeof(u64);
 	}
 
+	if (sample_type & PERF_SAMPLE_RAW) {
+		int size = sizeof(u32);
+
+		if (data->raw)
+			size += data->raw->size;
+		else
+			size += sizeof(u32);
+
+		WARN_ON_ONCE(size & (sizeof(u64)-1));
+		header.size += size;
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
@@ -2777,6 +2789,22 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(&handle, data->raw->size);
+			perf_output_copy(&handle, data->raw->data, data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(&handle, raw);
+		}
+	}
+
 	perf_output_end(&handle);
 }
 
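The two hunks above define the on-the-wire layout of PERF_SAMPLE_RAW: a u32 length followed by that many bytes of raw tracepoint data (or a length of sizeof(u32) plus a zero u32 when no raw data is attached), padded so the overall sample record stays u64 aligned, as the WARN_ON_ONCE checks. A rough reader-side sketch, not part of this patch and using made-up names, of how a consumer of the mmap()ed ring buffer could walk that payload:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/*
	 * Hypothetical helper: given a pointer to the PERF_SAMPLE_RAW portion of a
	 * sample, return the start of the raw bytes and report their size. The
	 * layout (u32 size, then the data) is assumed from the output code above.
	 */
	static const void *parse_raw_sample(const void *p, uint32_t *out_size)
	{
		uint32_t size;

		memcpy(&size, p, sizeof(size));		/* the u32 written by perf_output_put() */
		*out_size = size;
		return (const char *)p + sizeof(size);	/* raw tracepoint bytes follow */
	}

	int main(void)
	{
		/* the "no raw data" case emitted above: size = sizeof(u32), data = 0 (little-endian) */
		unsigned char buf[8] = { 4, 0, 0, 0, 0, 0, 0, 0 };
		uint32_t size;
		const unsigned char *data = parse_raw_sample(buf, &size);

		printf("raw size %u, first byte %u\n", (unsigned)size, (unsigned)data[0]);
		return 0;
	}
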
@@ -2840,7 +2868,8 @@ perf_counter_read_event(struct perf_counter *counter,
  */
 
 struct perf_task_event {
 	struct task_struct		*task;
+	struct perf_counter_context	*task_ctx;
 
 	struct {
 		struct perf_event_header	header;
@@ -2900,24 +2929,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
 static void perf_counter_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
+	struct perf_counter_context *ctx = task_event->task_ctx;
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (!ctx)
+		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
 	if (ctx)
 		perf_counter_task_ctx(ctx, task_event);
 	rcu_read_unlock();
 }
 
-static void perf_counter_task(struct task_struct *task, int new)
+static void perf_counter_task(struct task_struct *task,
+			      struct perf_counter_context *task_ctx,
+			      int new)
 {
 	struct perf_task_event task_event;
 
@@ -2927,8 +2955,9 @@ static void perf_counter_task(struct task_struct *task, int new)
 		return;
 
 	task_event = (struct perf_task_event){
 		.task		= task,
-		.event	= {
+		.task_ctx	= task_ctx,
+		.event		= {
 			.header = {
 				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
 				.misc = 0,
@@ -2946,7 +2975,7 @@ static void perf_counter_task(struct task_struct *task, int new)
 
 void perf_counter_fork(struct task_struct *task)
 {
-	perf_counter_task(task, 1);
+	perf_counter_task(task, NULL, 1);
 }
 
 /*
@@ -3335,87 +3364,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
  * Generic software counter infrastructure
  */
 
-static void perf_swcounter_update(struct perf_counter *counter)
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	u64 prev, now;
-	s64 delta;
+	u64 period = hwc->last_period;
+	u64 nr, offset;
+	s64 old, val;
+
+	hwc->last_period = hwc->sample_period;
 
 again:
-	prev = atomic64_read(&hwc->prev_count);
-	now = atomic64_read(&hwc->count);
-	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-		goto again;
+	old = val = atomic64_read(&hwc->period_left);
+	if (val < 0)
+		return 0;
 
-	delta = now - prev;
+	nr = div64_u64(period + val, period);
+	offset = nr * period;
+	val -= offset;
+	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+		goto again;
 
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &hwc->period_left);
+	return nr;
 }
 
-static void perf_swcounter_set_period(struct perf_counter *counter)
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct perf_sample_data *data)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
+	u64 overflow;
 
-	if (unlikely(left <= -period)) {
-		left = period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-	}
+	data->period = counter->hw.last_period;
+	overflow = perf_swcounter_set_period(counter);
 
-	if (unlikely(left <= 0)) {
-		left += period;
-		atomic64_add(period, &hwc->period_left);
-		hwc->last_period = period;
-	}
+	if (hwc->interrupts == MAX_INTERRUPTS)
+		return;
 
-	atomic64_set(&hwc->prev_count, -left);
-	atomic64_set(&hwc->count, -left);
+	for (; overflow; overflow--) {
+		if (perf_counter_overflow(counter, nmi, data)) {
+			/*
+			 * We inhibit the overflow from happening when
+			 * hwc->interrupts == MAX_INTERRUPTS.
+			 */
+			break;
+		}
+	}
 }
 
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
 {
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct perf_counter *counter;
-	u64 period;
-
-	counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->pmu->read(counter);
-
-	data.addr = 0;
-	data.regs = get_irq_regs();
 	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
+	 * Nothing to do, we already reset hwc->interrupts.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
-	    !counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+}
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
-			ret = HRTIMER_NORESTART;
-	}
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+			       int nmi, struct perf_sample_data *data)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
 
-	period = max_t(u64, 10000, counter->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+	atomic64_add(nr, &counter->count);
 
-	return ret;
-}
+	if (!hwc->sample_period)
+		return;
 
-static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
-{
-	data->period = counter->hw.last_period;
+	if (!data->regs)
+		return;
 
-	perf_swcounter_update(counter);
-	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, data))
-		/* soft-disable the counter */
-		;
+	if (!atomic64_add_negative(nr, &hwc->period_left))
+		perf_swcounter_overflow(counter, nmi, data);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
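The hunk above keeps hwc->period_left in the range [-sample_period, 0): perf_swcounter_add() folds events in with atomic64_add_negative(), and only once the sum reaches zero or above does perf_swcounter_set_period() work out how many whole periods elapsed, nr = div64_u64(period + val, period), before pulling period_left back below zero by nr * period. A standalone sketch, not from the patch, using plain integers in place of atomic64_t to show that arithmetic:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int64_t period = 100;			/* hwc->sample_period */
		int64_t period_left = -period;		/* hwc->period_left starts at -period */
		int64_t batches[] = { 30, 50, 250, 10 };

		for (unsigned i = 0; i < sizeof(batches) / sizeof(batches[0]); i++) {
			period_left += batches[i];	/* perf_swcounter_add(): atomic64_add_negative() */
			if (period_left < 0)
				continue;		/* still negative: no overflow yet */

			/* perf_swcounter_set_period(): how many whole periods elapsed? */
			int64_t val = period_left;
			int64_t nr = (period + val) / period;	/* div64_u64() in the patch */

			period_left = val - nr * period;	/* back into [-period, 0) */
			printf("+%3lld events: %lld overflow(s), period_left = %lld\n",
			       (long long)batches[i], (long long)nr,
			       (long long)period_left);
		}
		return 0;
	}
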
@@ -3479,15 +3502,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	return 1;
 }
 
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
-{
-	int neg = atomic64_add_negative(nr, &counter->hw.count);
-
-	if (counter->hw.sample_period && !neg && data->regs)
-		perf_swcounter_overflow(counter, nmi, data);
-}
-
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_type_id type,
 				     u32 event, u64 nr, int nmi,
@@ -3566,27 +3580,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 
 static void perf_swcounter_read(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static int perf_swcounter_enable(struct perf_counter *counter)
 {
-	perf_swcounter_set_period(counter);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (hwc->sample_period) {
+		hwc->last_period = hwc->sample_period;
+		perf_swcounter_set_period(counter);
+	}
 	return 0;
 }
 
 static void perf_swcounter_disable(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static const struct pmu perf_ops_generic = {
 	.enable		= perf_swcounter_enable,
 	.disable	= perf_swcounter_disable,
 	.read		= perf_swcounter_read,
+	.unthrottle	= perf_swcounter_unthrottle,
 };
 
 /*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct perf_counter *counter;
+	u64 period;
+
+	counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->pmu->read(counter);
+
+	data.addr = 0;
+	data.regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((counter->attr.exclude_kernel || !data.regs) &&
+			!counter->attr.exclude_user)
+		data.regs = task_pt_regs(current);
+
+	if (data.regs) {
+		if (perf_counter_overflow(counter, 0, &data))
+			ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, counter->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return ret;
+}
+
+/*
  * Software counter: cpu wall time clock
  */
 
@@ -3703,17 +3756,24 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+			  int entry_size)
 {
+	struct perf_raw_record raw = {
+		.size = entry_size,
+		.data = record,
+	};
+
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
-		.addr = 0,
+		.addr = addr,
+		.raw = &raw,
 	};
 
 	if (!data.regs)
 		data.regs = task_pt_regs(current);
 
-	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
@@ -3727,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
+	/*
+	 * Raw tracepoint data is a severe data leak, only allow root to
+	 * have these.
+	 */
+	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	if (ftrace_profile_enable(counter->attr.config))
 		return NULL;
 
@@ -4269,7 +4337,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	unsigned long flags;
 
 	if (likely(!child->perf_counter_ctxp)) {
-		perf_counter_task(child, 0);
+		perf_counter_task(child, NULL, 0);
 		return;
 	}
 
@@ -4289,6 +4357,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	spin_lock(&child_ctx->lock);
+	child->perf_counter_ctxp = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -4302,9 +4371,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * won't get any samples after PERF_EVENT_EXIT. We can however still
 	 * get a few PERF_EVENT_READ events.
 	 */
-	perf_counter_task(child, 0);
-
-	child->perf_counter_ctxp = NULL;
+	perf_counter_task(child, child_ctx, 0);
 
 	/*
 	 * We can recurse on the same lock type through: