Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	239
1 file changed, 145 insertions(+), 94 deletions(-)
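
In short, the change below drops the tracepoint-only PERF_SAMPLE_TP_RECORD payload and emits a generic PERF_SAMPLE_RAW field instead: a u32 length followed by that many raw bytes, with the whole field expected to stay u64-aligned (the WARN_ON_ONCE() in perf_counter_output() checks exactly that) and a dummy 4-byte zero payload written when no raw data is attached. A rough consumer-side sketch of that layout, with illustrative names that are not part of the patch:

#include <stdint.h>
#include <stddef.h>

/*
 * Illustrative only: how a PERF_SAMPLE_RAW field sits in the sample
 * stream as written by perf_counter_output() below.
 */
struct raw_sample_field {
	uint32_t size;		/* number of raw bytes that follow */
	uint8_t  data[];	/* tracepoint record, or 4 zero bytes when absent */
};

/* Step over the raw field; the producer keeps size + payload u64-aligned. */
static const void *skip_raw_field(const struct raw_sample_field *raw)
{
	size_t total = sizeof(raw->size) + raw->size;

	/* normally a no-op, since the producer already pads to 8 bytes */
	total = (total + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1);
	return (const uint8_t *)raw + total;
}
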
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 868102172aa4..b0b20a07f394 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64			counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_tracepoint_record *tp;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2715,9 +2714,16 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_TP_RECORD) {
-		tp = data->private;
-		header.size += tp->size;
+	if (sample_type & PERF_SAMPLE_RAW) {
+		int size = sizeof(u32);
+
+		if (data->raw)
+			size += data->raw->size;
+		else
+			size += sizeof(u32);
+
+		WARN_ON_ONCE(size & (sizeof(u64)-1));
+		header.size += size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2783,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if (sample_type & PERF_SAMPLE_TP_RECORD)
-		perf_output_copy(&handle, tp->record, tp->size);
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(&handle, data->raw->size);
+			perf_output_copy(&handle, data->raw->data, data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(&handle, raw);
+		}
+	}
 
 	perf_output_end(&handle);
 }
@@ -2849,7 +2868,8 @@ perf_counter_read_event(struct perf_counter *counter,
  */
 
 struct perf_task_event {
 	struct task_struct		*task;
+	struct perf_counter_context	*task_ctx;
 
 	struct {
 		struct perf_event_header	header;
@@ -2909,24 +2929,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
 static void perf_counter_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
+	struct perf_counter_context *ctx = task_event->task_ctx;
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (!ctx)
+		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
 	if (ctx)
 		perf_counter_task_ctx(ctx, task_event);
 	rcu_read_unlock();
 }
 
-static void perf_counter_task(struct task_struct *task, int new)
+static void perf_counter_task(struct task_struct *task,
+			      struct perf_counter_context *task_ctx,
+			      int new)
 {
 	struct perf_task_event task_event;
 
@@ -2936,8 +2955,9 @@ static void perf_counter_task(struct task_struct *task, int new)
 		return;
 
 	task_event = (struct perf_task_event){
 		.task		= task,
-		.event		= {
+		.task_ctx	= task_ctx,
+		.event		= {
 			.header = {
 				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
 				.misc = 0,
@@ -2955,7 +2975,7 @@ static void perf_counter_task(struct task_struct *task, int new)
 
 void perf_counter_fork(struct task_struct *task)
 {
-	perf_counter_task(task, 1);
+	perf_counter_task(task, NULL, 1);
 }
 
 /*
@@ -3344,87 +3364,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
  * Generic software counter infrastructure
  */
 
-static void perf_swcounter_update(struct perf_counter *counter)
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	u64 prev, now;
-	s64 delta;
+	u64 period = hwc->last_period;
+	u64 nr, offset;
+	s64 old, val;
+
+	hwc->last_period = hwc->sample_period;
 
 again:
-	prev = atomic64_read(&hwc->prev_count);
-	now = atomic64_read(&hwc->count);
-	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-		goto again;
+	old = val = atomic64_read(&hwc->period_left);
+	if (val < 0)
+		return 0;
 
-	delta = now - prev;
+	nr = div64_u64(period + val, period);
+	offset = nr * period;
+	val -= offset;
+	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+		goto again;
 
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &hwc->period_left);
+	return nr;
 }
 
-static void perf_swcounter_set_period(struct perf_counter *counter)
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct perf_sample_data *data)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
+	u64 overflow;
 
-	if (unlikely(left <= -period)) {
-		left = period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-	}
+	data->period = counter->hw.last_period;
+	overflow = perf_swcounter_set_period(counter);
 
-	if (unlikely(left <= 0)) {
-		left += period;
-		atomic64_add(period, &hwc->period_left);
-		hwc->last_period = period;
-	}
+	if (hwc->interrupts == MAX_INTERRUPTS)
+		return;
 
-	atomic64_set(&hwc->prev_count, -left);
-	atomic64_set(&hwc->count, -left);
+	for (; overflow; overflow--) {
+		if (perf_counter_overflow(counter, nmi, data)) {
+			/*
+			 * We inhibit the overflow from happening when
+			 * hwc->interrupts == MAX_INTERRUPTS.
+			 */
+			break;
+		}
+	}
 }
 
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
 {
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct perf_counter *counter;
-	u64 period;
-
-	counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->pmu->read(counter);
-
-	data.addr = 0;
-	data.regs = get_irq_regs();
 	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
+	 * Nothing to do, we already reset hwc->interrupts.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
-			!counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+}
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
-			ret = HRTIMER_NORESTART;
-	}
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
			       int nmi, struct perf_sample_data *data)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
 
-	period = max_t(u64, 10000, counter->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+	atomic64_add(nr, &counter->count);
 
-	return ret;
-}
+	if (!hwc->sample_period)
+		return;
 
-static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
-{
-	data->period = counter->hw.last_period;
+	if (!data->regs)
+		return;
 
-	perf_swcounter_update(counter);
-	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, data))
-		/* soft-disable the counter */
-		;
+	if (!atomic64_add_negative(nr, &hwc->period_left))
+		perf_swcounter_overflow(counter, nmi, data);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3488,15 +3502,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	return 1;
 }
 
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
-{
-	int neg = atomic64_add_negative(nr, &counter->hw.count);
-
-	if (counter->hw.sample_period && !neg && data->regs)
-		perf_swcounter_overflow(counter, nmi, data);
-}
-
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
				     enum perf_type_id type,
				     u32 event, u64 nr, int nmi,
@@ -3575,27 +3580,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 
 static void perf_swcounter_read(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static int perf_swcounter_enable(struct perf_counter *counter)
 {
-	perf_swcounter_set_period(counter);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (hwc->sample_period) {
+		hwc->last_period = hwc->sample_period;
+		perf_swcounter_set_period(counter);
+	}
 	return 0;
 }
 
 static void perf_swcounter_disable(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static const struct pmu perf_ops_generic = {
 	.enable		= perf_swcounter_enable,
 	.disable	= perf_swcounter_disable,
 	.read		= perf_swcounter_read,
+	.unthrottle	= perf_swcounter_unthrottle,
 };
 
 /*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct perf_counter *counter;
+	u64 period;
+
+	counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->pmu->read(counter);
+
+	data.addr = 0;
+	data.regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((counter->attr.exclude_kernel || !data.regs) &&
+			!counter->attr.exclude_user)
+		data.regs = task_pt_regs(current);
+
+	if (data.regs) {
+		if (perf_counter_overflow(counter, 0, &data))
+			ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, counter->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return ret;
+}
+
+/*
  * Software counter: cpu wall time clock
  */
 
@@ -3715,15 +3759,15 @@ static const struct pmu perf_ops_task_clock = {
 void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
			  int entry_size)
 {
-	struct perf_tracepoint_record tp = {
+	struct perf_raw_record raw = {
 		.size = entry_size,
-		.record = record,
+		.data = record,
 	};
 
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
-		.private = &tp,
+		.raw = &raw,
 	};
 
 	if (!data.regs)
@@ -3743,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
+	/*
+	 * Raw tracepoint data is a severe data leak, only allow root to
+	 * have these.
+	 */
+	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	if (ftrace_profile_enable(counter->attr.config))
 		return NULL;
 
@@ -4285,7 +4337,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	unsigned long flags;
 
 	if (likely(!child->perf_counter_ctxp)) {
-		perf_counter_task(child, 0);
+		perf_counter_task(child, NULL, 0);
 		return;
 	}
 
@@ -4305,6 +4357,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	spin_lock(&child_ctx->lock);
+	child->perf_counter_ctxp = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -4318,9 +4371,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * won't get any samples after PERF_EVENT_EXIT. We can however still
 	 * get a few PERF_EVENT_READ events.
 	 */
-	perf_counter_task(child, 0);
-
-	child->perf_counter_ctxp = NULL;
+	perf_counter_task(child, child_ctx, 0);
 
 	/*
	 * We can recurse on the same lock type through:
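
For reference, the new perf_swcounter_set_period()/perf_swcounter_add() pair above keeps hwc->period_left in the range [-sample_period, 0] and uses its sign as the trigger: events are added with atomic64_add_negative(), and once the value turns non-negative, set_period() works out how many whole periods elapsed and rewinds period_left by that many periods, so fast-moving event sources can retire several periods in one go. A standalone, non-atomic sketch of that arithmetic (plain C with illustrative names; the kernel uses atomic64_* and div64_u64()):

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone model of the swcounter period logic: 'left' mirrors
 * hwc->period_left and stays in [-period, 0]; once it turns
 * non-negative, one or more sample periods have elapsed.
 */
struct swcounter_model {
	uint64_t period;	/* plays the role of hwc->sample_period */
	int64_t  left;		/* plays the role of hwc->period_left   */
};

/* Mirrors perf_swcounter_set_period(): how many periods just elapsed? */
static uint64_t model_set_period(struct swcounter_model *c)
{
	int64_t val = c->left;
	uint64_t nr, offset;

	if (val < 0)
		return 0;

	nr = ((uint64_t)val + c->period) / c->period;	/* div64_u64() in the kernel */
	offset = nr * c->period;
	c->left = val - (int64_t)offset;

	return nr;
}

/* Mirrors perf_swcounter_add(): count nr events, report any overflow. */
static void model_add(struct swcounter_model *c, uint64_t nr)
{
	c->left += (int64_t)nr;		/* atomic64_add_negative() in the kernel */
	if (c->left >= 0) {
		uint64_t overflows = model_set_period(c);

		printf("overflowed %llu period(s), left=%lld\n",
		       (unsigned long long)overflows, (long long)c->left);
	}
}

int main(void)
{
	/* sample period of 100 events, with a full period outstanding */
	struct swcounter_model c = { .period = 100, .left = -100 };

	model_add(&c, 30);	/* -70: still inside the period, no overflow */
	model_add(&c, 250);	/* fast source: two periods elapse at once   */
	return 0;
}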