From 3c502e7a0255d82621ff25d60cc816624830497e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 4 Nov 2010 17:33:01 -0500 Subject: perf,hw_breakpoint: Initialize hardware api earlier When using early debugging, the kernel does not initialize the hw_breakpoint API early enough and causes the late initialization of the kernel debugger to fail. The boot arguments are: earlyprintk=vga ekgdboc=kbd kgdbwait Then simply type "go" at the kdb prompt and boot. The kernel will later emit the message: kgdb: Could not allocate hwbreakpoints And at that point the kernel debugger will cease to work correctly. The solution is to initialize the hw_breakpoint at the same time that all the other perf call backs are initialized instead of using a core_initcall() initialization which happens well after the kernel debugger can make use of hardware breakpoints. Signed-off-by: Jason Wessel CC: Frederic Weisbecker CC: Ingo Molnar CC: Peter Zijlstra LKML-Reference: <4CD3396D.1090308@windriver.com> Signed-off-by: Frederic Weisbecker --- kernel/perf_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f4982..05b7d8c72c6c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -6295,6 +6296,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) void __init perf_event_init(void) { + int ret; + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent); @@ -6302,4 +6305,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_task_clock); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } -- cgit v1.2.2 From 8882135bcd332f294df5455747ea43ba9e6f77ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Nov 2010 19:01:43 +0100 Subject: perf: Fix 
owner-list vs exit Oleg noticed that a perf-fd keeping a reference on the creating task leads to a few funny side effects. There are two different aspects to this: - kernel based perf-events, these should not take out a reference on the creating task and appear on the task's event list since they're not bound to fds nor visible to userspace. - fork() and pthread_create(), these can lead to the creating task dying (and thus the task's event-list becoming useless) but keeping the list and ref alive until the event is closed. Combined they lead to malfunction of the ptrace hw_tracepoints. Cure this by not considering kernel based perf_events for the owner-list and destroying the owner-list when the owner dies. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Acked-by: Oleg Nesterov LKML-Reference: <1289576883.2084.286.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 63 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f818d9d2dc93..671f6c8c8a32 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2235,11 +2235,6 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); - mutex_lock(&event->owner->perf_event_mutex); - list_del_init(&event->owner_entry); - mutex_unlock(&event->owner->perf_event_mutex); - put_task_struct(event->owner); - free_event(event); return 0; @@ -2252,9 +2247,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); static int perf_release(struct inode *inode, struct file *file) { struct perf_event *event = file->private_data; + struct task_struct *owner; file->private_data = NULL; + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); + /* + * Matches the smp_wmb() in perf_event_exit_task(). 
If we observe + * !owner it means the list deletion is complete and we can indeed + * free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ + smp_read_barrier_depends(); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last + * task reference we can safely take a new reference + * while holding the rcu_read_lock(). + */ + get_task_struct(owner); + } + rcu_read_unlock(); + + if (owner) { + mutex_lock(&owner->perf_event_mutex); + /* + * We have to re-check the event->owner field, if it is cleared + * we raced with perf_event_exit_task(), acquiring the mutex + * ensured they're done, and we can proceed with freeing the + * event. + */ + if (event->owner) + list_del_init(&event->owner_entry); + mutex_unlock(&owner->perf_event_mutex); + put_task_struct(owner); + } + return perf_event_release_kernel(event); } @@ -5678,7 +5707,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); event->owner = current; - get_task_struct(current); + mutex_lock(&current->perf_event_mutex); list_add_tail(&event->owner_entry, &current->perf_event_list); mutex_unlock(&current->perf_event_mutex); @@ -5746,12 +5775,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, ++ctx->generation; mutex_unlock(&ctx->mutex); - event->owner = current; - get_task_struct(current); - mutex_lock(&current->perf_event_mutex); - list_add_tail(&event->owner_entry, &current->perf_event_list); - mutex_unlock(&current->perf_event_mutex); - return event; err_free: @@ -5902,8 +5925,24 @@ again: */ void perf_event_exit_task(struct task_struct *child) { + struct perf_event *event, *tmp; int ctxn; + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. 
+ */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); } -- cgit v1.2.2 From 61c32659b12c44e62de32fbf99f7e4ca783dc38b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 18 Nov 2010 01:39:17 +0100 Subject: tracing: New flag to allow non privileged users to use a trace event This adds a new trace event internal flag that allows them to be used in perf by non privileged users in case of task bound tracing. This is desired for syscall tracepoints because they don't leak global system information, like some other tracepoints. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Li Zefan Cc: Jason Baron --- kernel/perf_event.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f4982..ee1e903f983c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4747,15 +4747,6 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((event->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - err = perf_trace_init(event); if (err) return err; -- cgit v1.2.2 From dddd3379a619a4cb8247bfd3c94ca9ae3797aa2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 Nov 2010 10:05:55 +0100 Subject: perf: Fix inherit vs. context rotation bug It was found that sometimes children of tasks with inherited events had one extra event. Eventually it turned out to be due to the list rotation not being exclusive with the list iteration in the inheritance code. Cure this by temporarily disabling the rotation while we inherit the events. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 671f6c8c8a32..f365dd8ef8b0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1622,8 +1622,12 @@ static void rotate_ctx(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); - /* Rotate the first entry last of non-pinned groups */ - list_rotate_left(&ctx->flexible_groups); + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. + */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } @@ -6162,6 +6166,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; + unsigned long flags; int ret = 0; child->perf_event_ctxp[ctxn] = NULL; @@ -6202,6 +6207,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. 
+ */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); @@ -6209,6 +6223,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { -- cgit v1.2.2 From ee6dcfa40a50fe12a3ae0fb4d2653c66c3ed6556 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Nov 2010 13:49:04 +0100 Subject: perf: Fix the software context switch counter Stephane noticed that because the perf_sw_event() call is inside the perf_event_task_sched_out() call it won't get called unless we have a per-task counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f365dd8ef8b0..eac7e3364335 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1287,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); } -- cgit v1.2.2 From 6c7e550f13f8ad82efb6a5653ae628c2543c1768 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Tue, 23 Nov 2010 16:21:43 +0100 Subject: perf: Introduce is_sampling_event() and use it when appropriate. 
Signed-off-by: Franck Bui-Huu Signed-off-by: Peter Zijlstra LKML-Reference: <1290525705-6265-1-git-send-email-fbuihuu@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 43f757ccf831..880698488c91 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2514,7 +2514,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) int ret = 0; u64 value; - if (!event->attr.sample_period) + if (!is_sampling_event(event)) return -EINVAL; if (copy_from_user(&value, arg, sizeof(value))) @@ -4385,7 +4385,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!regs) return; - if (!hwc->sample_period) + if (!is_sampling_event(event)) return; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) @@ -4548,7 +4548,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; - if (hwc->sample_period) { + if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } @@ -4920,7 +4920,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { + if (is_sampling_event(event)) { s64 period = local64_read(&hwc->period_left); if (period) { @@ -4941,7 +4941,7 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (hwc->sample_period) { + if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); -- cgit v1.2.2 From 2e939d1da9b5628642314c1e68b4319e61263c94 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Tue, 23 Nov 2010 16:21:44 +0100 Subject: perf: Limit event 
refresh to sampling event Signed-off-by: Franck Bui-Huu Signed-off-by: Peter Zijlstra LKML-Reference: <1290525705-6265-2-git-send-email-fbuihuu@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 880698488c91..027c4d33b4d3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1073,7 +1073,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) /* * not supported on inherited events */ - if (event->attr.inherit) + if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); -- cgit v1.2.2 From 5d508e820a23d9b6e8a149dfaa8ba5cbedf3d95c Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Tue, 23 Nov 2010 16:21:45 +0100 Subject: perf: Don't bother to init the hrtimer for no SW sampling counters Signed-off-by: Franck Bui-Huu Signed-off-by: Peter Zijlstra LKML-Reference: <1290525705-6265-3-git-send-email-fbuihuu@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 027c4d33b4d3..98c5549c8e29 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4917,24 +4917,26 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + s64 period; + + if (!is_sampling_event(event)) + return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; - if (is_sampling_event(event)) { - s64 period = local64_read(&hwc->period_left); - if (period) { - if (period < 0) - period = 10000; + period = local64_read(&hwc->period_left); + if (period) { + if (period < 0) + period = 10000; - 
local64_set(&hwc->period_left, 0); - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL_PINNED, 0); - } } static void perf_swevent_cancel_hrtimer(struct perf_event *event) -- cgit v1.2.2 From 963988262c3c8f4234f64a0dde59446a295e07bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Nov 2010 18:55:29 +0100 Subject: perf: Ignore non-sampling overflows Some arch implementations call perf_event_overflow() by 'accident', ignore this. Reported-by: Francis Moreau Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 98c5549c8e29..af1e63f249f3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4240,6 +4240,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, struct hw_perf_event *hwc = &event->hw; int ret = 0; + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + if (!throttle) { hwc->interrupts++; } else { -- cgit v1.2.2 From c320c7b7d380e630f595de1236d9d085b035d5b4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Oct 2010 12:50:11 -0200 Subject: perf events: Precalculate the header space for PERF_SAMPLE_ fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PERF_SAMPLE_{CALLCHAIN,RAW} have variable lengths per sample, but the others can be precalculated, reducing a bit the per sample cost. 
Acked-by: Peter Zijlstra Cc: Frédéric Weisbecker Cc: Ian Munsie Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Stephane Eranian LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 150 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 59 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index af1e63f249f3..aede71245e9f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -312,9 +312,75 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + event->read_size = size; +} + +static void perf_event__header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + perf_event__read_size(event); + + if (sample_type & PERF_SAMPLE_IP) + size += sizeof(data->ip); + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu_entry); + + if 
(sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + static void perf_group_attach(struct perf_event *event) { - struct perf_event *group_leader = event->group_leader; + struct perf_event *group_leader = event->group_leader, *pos; /* * We can have double attach due to group movement in perf_event_open. @@ -333,6 +399,11 @@ static void perf_group_attach(struct perf_event *event) list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; + + perf_event__header_size(group_leader); + + list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + perf_event__header_size(pos); } /* @@ -391,7 +462,7 @@ static void perf_group_detach(struct perf_event *event) if (event->group_leader != event) { list_del_init(&event->group_entry); event->group_leader->nr_siblings--; - return; + goto out; } if (!list_empty(&event->group_entry)) @@ -410,6 +481,12 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_flags = event->group_flags; } + +out: + perf_event__header_size(event->group_leader); + + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + perf_event__header_size(tmp); } static inline int @@ -2289,31 +2366,6 @@ static int perf_release(struct inode *inode, struct file *file) return perf_event_release_kernel(event); } -static int perf_event_read_size(struct perf_event *event) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += 
entry * nr; - - return size; -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -2428,7 +2480,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) if (event->state == PERF_EVENT_STATE_ERROR) return 0; - if (count < perf_event_read_size(event)) + if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); @@ -3606,59 +3658,34 @@ void perf_prepare_sample(struct perf_event_header *header, data->type = sample_type; header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header); + header->size = sizeof(*header) + event->header_size; header->misc = 0; header->misc |= perf_misc_flags(regs); - if (sample_type & PERF_SAMPLE_IP) { + if (sample_type & PERF_SAMPLE_IP) data->ip = perf_instruction_pointer(regs); - header->size += sizeof(data->ip); - } - if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ data->tid_entry.pid = perf_event_pid(event, current); data->tid_entry.tid = perf_event_tid(event, current); - - header->size += sizeof(data->tid_entry); } - if (sample_type & PERF_SAMPLE_TIME) { + if (sample_type & PERF_SAMPLE_TIME) data->time = perf_clock(); - header->size += sizeof(data->time); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header->size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_ID) { + if (sample_type & PERF_SAMPLE_ID) data->id = primary_event_id(event); - header->size += sizeof(data->id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) { + if (sample_type & PERF_SAMPLE_STREAM_ID) data->stream_id = event->id; - header->size += sizeof(data->stream_id); - } - if (sample_type & PERF_SAMPLE_CPU) { data->cpu_entry.cpu = raw_smp_processor_id(); data->cpu_entry.reserved = 0; - - header->size += sizeof(data->cpu_entry); } - if (sample_type & PERF_SAMPLE_PERIOD) - header->size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - header->size += perf_event_read_size(event); - if (sample_type & PERF_SAMPLE_CALLCHAIN) { 
int size = 1; @@ -3726,7 +3753,7 @@ perf_event_read_event(struct perf_event *event, .header = { .type = PERF_RECORD_READ, .misc = 0, - .size = sizeof(read_event) + perf_event_read_size(event), + .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), @@ -5714,6 +5741,11 @@ SYSCALL_DEFINE5(perf_event_open, list_add_tail(&event->owner_entry, &current->perf_event_list); mutex_unlock(&current->perf_event_mutex); + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(event); + /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction -- cgit v1.2.2 From 614b6780eb0c393d2fb49ff62d61f29b877bd07e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Dec 2010 16:24:32 -0200 Subject: perf events: Fix event inherit fallout of precalculated headers The precalculated header size is not updated when an event is inherited. That results in bogus sample entries for all child events. Bug introduced in c320c7b. 
Cc: Frederic Weisbecker Cc: Ian Munsie Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Thomas Gleixner Signed-off-by: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index aede71245e9f..7961b27aceea 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6098,6 +6098,11 @@ inherit_event(struct perf_event *parent_event, child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(child_event); + /* * Link it up in the child's context: */ -- cgit v1.2.2 From 6844c09d849aeb00e8ddfe9525e8567a531c22d0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 3 Dec 2010 16:36:35 -0200 Subject: perf events: Separate the routines handling the PERF_SAMPLE_ identity fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those will be made available in sample like events like MMAP, EXEC, etc in a followup patch. So precalculate the extra id header space and have a separate routine to fill them up. 
V2: Thomas noticed that the id header needs to be precalculated at inherit_events too: LKML-Reference: Tested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Acked-by: Ian Munsie Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Frédéric Weisbecker Cc: Ian Munsie Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Stephane Eranian Cc: Thomas Gleixner LKML-Reference: <1291318772-30880-2-git-send-email-acme@infradead.org> Signed-off-by: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 129 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 75 insertions(+), 54 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7961b27aceea..a04799769566 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -133,6 +133,28 @@ static void unclone_ctx(struct perf_event_context *ctx) } } +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + /* * If we inherit events we want to return the parent event id * to userspace. 
@@ -351,15 +373,30 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + +static void perf_event__id_header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); - if (sample_type & PERF_SAMPLE_ADDR) - size += sizeof(data->addr); - if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); @@ -369,13 +406,7 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); - if (sample_type & PERF_SAMPLE_PERIOD) - size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - size += event->read_size; - - event->header_size = size; + event->id_header_size = size; } static void perf_group_attach(struct perf_event *event) @@ -3357,6 +3388,36 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, } while (len); } +static void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + header->size += event->id_header_size; + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + } + + if (sample_type & PERF_SAMPLE_TIME) + data->time = perf_clock(); + + if (sample_type & PERF_SAMPLE_ID) + data->id = primary_event_id(event); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + data->stream_id = event->id; 
+ + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + } +} + int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) @@ -3459,28 +3520,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_tgid_nr_ns(p, event->ns); -} - -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_pid_nr_ns(p, event->ns); -} - static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -3655,37 +3694,17 @@ void perf_prepare_sample(struct perf_event_header *header, { u64 sample_type = event->attr.sample_type; - data->type = sample_type; - header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; header->misc = 0; header->misc |= perf_misc_flags(regs); + perf_event_header__init_id(header, data, event); + if (sample_type & PERF_SAMPLE_IP) data->ip = perf_instruction_pointer(regs); - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_event_pid(event, current); - data->tid_entry.tid = perf_event_tid(event, current); - } - - if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); - - if (sample_type & PERF_SAMPLE_ID) - data->id = primary_event_id(event); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - data->stream_id = event->id; - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - } - if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; @@ 
-5745,6 +5764,7 @@ SYSCALL_DEFINE5(perf_event_open, * Precalculate sample_data sizes */ perf_event__header_size(event); + perf_event__id_header_size(event); /* * Drop the reference on the group_event after placing the @@ -6102,6 +6122,7 @@ inherit_event(struct perf_event *parent_event, * Precalculate sample_data sizes */ perf_event__header_size(child_event); + perf_event__id_header_size(child_event); /* * Link it up in the child's context: -- cgit v1.2.2 From c980d1091810df13f21aabbce545fd98f545bbf7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 4 Dec 2010 23:02:20 -0200 Subject: perf events: Make sample_type identity fields available in all PERF_RECORD_ events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If perf_event_attr.sample_id_all is set it will add the PERF_SAMPLE_ identity info: TID, TIME, ID, CPU, STREAM_ID As a trailer, so that older perf tools can process new files, just ignoring the extra payload. With this it's possible to do further analysis on problems in the event stream, like detecting reordering of MMAP and FORK events, etc. V2: Fixup header size in comm, mmap and task processing, as we have to take into account different sample_types for each matching event, noticed by Thomas Gleixner. Thomas also noticed a problem in v2 where if we didn't have space in the buffer we wouldn't restore the header size. 
Tested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Acked-by: Ian Munsie Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Frédéric Weisbecker Cc: Ian Munsie Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Stephane Eranian Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 108 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 17 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a04799769566..77ad22c00b9d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3388,9 +3388,9 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, } while (len); } -static void perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) +static void __perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) { u64 sample_type = event->attr.sample_type; @@ -3418,6 +3418,43 @@ static void perf_event_header__init_id(struct perf_event_header *header, } } +static void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + if (event->attr.sample_id_all) + __perf_event_header__init_id(header, data, event); +} + +static void __perf_event__output_id_sample(struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + u64 sample_type = data->type; + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); +} + +static void 
perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) +{ + if (event->attr.sample_id_all) + __perf_event__output_id_sample(handle, sample); +} + int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) @@ -3425,6 +3462,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; + struct perf_sample_data sample_data; struct { struct perf_event_header header; u64 id; @@ -3451,8 +3489,12 @@ int perf_output_begin(struct perf_output_handle *handle, goto out; have_lost = local_read(&buffer->lost); - if (have_lost) - size += sizeof(lost_event); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } perf_output_get_handle(handle); @@ -3483,11 +3525,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); } return 0; @@ -3700,7 +3742,7 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event); if (sample_type & PERF_SAMPLE_IP) data->ip = perf_instruction_pointer(regs); @@ -3768,6 +3810,7 @@ perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; + struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, @@ -3779,12 +3822,14 @@ perf_event_read_event(struct perf_event 
*event, }; int ret; + perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); + perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } @@ -3814,14 +3859,16 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; + struct perf_sample_data sample; struct task_struct *task = task_event->task; - int size, ret; + int ret, size = task_event->event_id.header.size; - size = task_event->event_id.header.size; - ret = perf_output_begin(&handle, event, size, 0, 0); + perf_event_header__init_id(&task_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + task_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.ppid = perf_event_pid(event, current); @@ -3831,7 +3878,11 @@ static void perf_event_task_output(struct perf_event *event, perf_output_put(&handle, task_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + task_event->event_id.header.size = size; } static int perf_event_task_match(struct perf_event *event) @@ -3944,11 +3995,16 @@ static void perf_event_comm_output(struct perf_event *event, struct perf_comm_event *comm_event) { struct perf_output_handle handle; + struct perf_sample_data sample; int size = comm_event->event_id.header.size; - int ret = perf_output_begin(&handle, event, size, 0, 0); + int ret; + + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + comm_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, 
comm_event->task); @@ -3956,7 +4012,12 @@ static void perf_event_comm_output(struct perf_event *event, perf_output_put(&handle, comm_event->event_id); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); + + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + comm_event->event_id.header.size = size; } static int perf_event_comm_match(struct perf_event *event) @@ -4001,7 +4062,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); @@ -4080,11 +4140,15 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_mmap_event *mmap_event) { struct perf_output_handle handle; + struct perf_sample_data sample; int size = mmap_event->event_id.header.size; - int ret = perf_output_begin(&handle, event, size, 0, 0); + int ret; + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + mmap_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); @@ -4092,7 +4156,12 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_put(&handle, mmap_event->event_id); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); + + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + mmap_event->event_id.header.size = size; } static int perf_event_mmap_match(struct perf_event *event, @@ -4245,6 +4314,7 @@ void perf_event_mmap(struct vm_area_struct *vma) static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; + struct perf_sample_data sample; int ret; struct { @@ -4266,11 +4336,15 @@ static void 
perf_log_throttle(struct perf_event *event, int enable) if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + perf_event_header__init_id(&throttle_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + throttle_event.header.size, 1, 0); if (ret) return; perf_output_put(&handle, throttle_event); + perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } -- cgit v1.2.2 From 5167695753c63444a9e6cbbef136200a16c7a225 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Dec 2010 14:18:20 +0100 Subject: perf: Fix duplicate events with multiple-pmu vs software events Because the multi-pmu bits can share contexts between struct pmu instances we could get duplicate events by iterating the pmu list. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index eac7e3364335..7b870174c56d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3824,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); ctx = task_event->task_ctx; @@ -3959,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; perf_event_comm_ctx(&cpuctx->ctx, comm_event); ctxn = pmu->task_ctx_nr; @@ -4144,6 +4148,8 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = 
get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); @@ -5145,20 +5151,36 @@ static void *find_pmu_context(int ctxn) return NULL; } -static void free_pmu_context(void * __percpu cpu_context) +static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) { - struct pmu *pmu; + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + + if (cpuctx->active_pmu == old_pmu) + cpuctx->active_pmu = pmu; + } +} + +static void free_pmu_context(struct pmu *pmu) +{ + struct pmu *i; mutex_lock(&pmus_lock); /* * Like a real lame refcount. */ - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->pmu_cpu_context == cpu_context) + list_for_each_entry(i, &pmus, entry) { + if (i->pmu_cpu_context == pmu->pmu_cpu_context) { + update_pmu_context(i, pmu); goto out; + } } - free_percpu(cpu_context); + free_percpu(pmu->pmu_cpu_context); out: mutex_unlock(&pmus_lock); } @@ -5190,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu) cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; INIT_LIST_HEAD(&cpuctx->rotation_list); + cpuctx->active_pmu = pmu; } got_cpu_context: @@ -5241,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - free_pmu_context(pmu->pmu_cpu_context); + free_pmu_context(pmu); } struct pmu *perf_init_event(struct perf_event *event) -- cgit v1.2.2 From c277443cfc29b1623b4923219ff0bdb48b91b589 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Dec 2010 15:29:02 +0100 Subject: perf: Stop all counters on reboot Use the reboot notifier to detach all running counters on reboot, this solves a problem with kexec where the new kernel doesn't expect running counters (rightly so). It will however decrease the coverage of the NMI watchdog. 
Making a kexec specific reboot notifier callback would be best, however that would require touching all notifier callback handlers as they are not properly structured to deal with new state. As a compromise, place the perf reboot notifier at the very last position in the list. Reported-by: Yinghai Lu Signed-off-by: Peter Zijlstra Cc: Vivek Goyal Cc: Eric W. Biederman Cc: Jason Wessel Cc: Don Zickus LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 77ad22c00b9d..f9d2645b5546 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -6429,7 +6430,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) mutex_unlock(&swhash->hlist_mutex); } -#ifdef CONFIG_HOTPLUG_CPU +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC static void perf_pmu_rotate_stop(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -6483,6 +6484,26 @@ static void perf_event_exit_cpu(int cpu) static inline void perf_event_exit_cpu(int cpu) { } #endif +static int +perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) +{ + int cpu; + + for_each_online_cpu(cpu) + perf_event_exit_cpu(cpu); + + return NOTIFY_OK; +} + +/* + * Run the perf reboot notifier at the very last possible moment so that + * the generic watchdog code runs as long as possible. 
+ */ +static struct notifier_block perf_reboot_notifier = { + .notifier_call = perf_reboot, + .priority = INT_MIN, +}; + static int __cpuinit perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -6518,6 +6539,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_task_clock); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); -- cgit v1.2.2 From ce677831a4abd0f9f957c90ac6f6a0d0472bafb4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 24 Oct 2010 21:50:42 +0200 Subject: perf: Fix off by one in perf_swevent_init() The perf_swevent_enabled[] array has PERF_COUNT_SW_MAX elements. Signed-off-by: Dan Carpenter Signed-off-by: Peter Zijlstra LKML-Reference: <20101024195041.GT5985@bicker> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7b870174c56d..2870feee81dd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4719,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event) break; } - if (event_id > PERF_COUNT_SW_MAX) + if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { -- cgit v1.2.2 From 2e80a82a49c4c7eca4e35734380f28298ba5db19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Nov 2010 23:17:36 +0100 Subject: perf: Dynamic pmu types Extend the perf_pmu_register() interface to allow for named and dynamic pmu types. Because we need to support the existing static types we cannot use dynamic types for everything, hence provide a type argument. If we want to enumerate the PMUs they need a name, provide one. 
Signed-off-by: Peter Zijlstra LKML-Reference: <20101117222056.259707703@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a3d568fbacc6..8f09bc877a30 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -4961,7 +4962,7 @@ static struct pmu perf_tracepoint = { static inline void perf_tp_register(void) { - perf_pmu_register(&perf_tracepoint); + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -5305,8 +5306,9 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } +static struct idr pmu_idr; -int perf_pmu_register(struct pmu *pmu) +int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5316,13 +5318,32 @@ int perf_pmu_register(struct pmu *pmu) if (!pmu->pmu_disable_count) goto unlock; + pmu->type = -1; + if (!name) + goto skip_type; + pmu->name = name; + + if (type < 0) { + int err = idr_pre_get(&pmu_idr, GFP_KERNEL); + if (!err) + goto free_pdc; + + err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); + if (err) { + ret = err; + goto free_pdc; + } + } + pmu->type = type; + +skip_type: pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) - goto free_pdc; + goto free_ird; for_each_possible_cpu(cpu) { struct perf_cpu_context *cpuctx; @@ -5366,6 +5387,10 @@ unlock: return ret; +free_idr: + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; @@ -5385,6 +5410,8 @@ void perf_pmu_unregister(struct 
pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); free_pmu_context(pmu); } @@ -5394,6 +5421,13 @@ struct pmu *perf_init_event(struct perf_event *event) int idx; idx = srcu_read_lock(&pmus_srcu); + + rcu_read_lock(); + pmu = idr_find(&pmu_idr, event->attr.type); + rcu_read_unlock(); + if (pmu) + goto unlock; + list_for_each_entry_rcu(pmu, &pmus, entry) { int ret = pmu->event_init(event); if (!ret) @@ -6555,11 +6589,13 @@ void __init perf_event_init(void) { int ret; + idr_init(&pmu_idr); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); - perf_pmu_register(&perf_swevent); - perf_pmu_register(&perf_cpu_clock); - perf_pmu_register(&perf_task_clock); + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); + perf_pmu_register(&perf_cpu_clock, NULL, -1); + perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); register_reboot_notifier(&perf_reboot_notifier); -- cgit v1.2.2 From abe43400579d5de0078c2d3a760e6598e183f871 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Nov 2010 23:17:37 +0100 Subject: perf: Sysfs enumeration Simple sysfs enumeration of the PMUs. Use an "event_source" bus, and add PMU devices using their name. Each PMU device has a type attribute which contains the value needed for perf_event_attr::type to identify this PMU. This is the minimal stub needed to start using this interface, we'll consider extending the sysfs usage later. 
Cc: Kay Sievers Cc: Greg KH Signed-off-by: Peter Zijlstra LKML-Reference: <20101117222056.316982569@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8f09bc877a30..11847bf1e8cc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -5308,6 +5309,58 @@ out: } static struct idr pmu_idr; +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); +} + +static struct device_attribute pmu_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_NULL, +}; + +static int pmu_bus_running; +static struct bus_type pmu_bus = { + .name = "event_source", + .dev_attrs = pmu_dev_attrs, +}; + +static void pmu_dev_release(struct device *dev) +{ + kfree(dev); +} + +static int pmu_dev_alloc(struct pmu *pmu) +{ + int ret = -ENOMEM; + + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!pmu->dev) + goto out; + + device_initialize(pmu->dev); + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + + dev_set_drvdata(pmu->dev, pmu); + pmu->dev->bus = &pmu_bus; + pmu->dev->release = pmu_dev_release; + ret = device_add(pmu->dev); + if (ret) + goto free_dev; + +out: + return ret; + +free_dev: + put_device(pmu->dev); + goto out; +} + int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5336,6 +5389,12 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type) } pmu->type = type; + if (pmu_bus_running) { + ret = pmu_dev_alloc(pmu); + if (ret) + goto free_idr; + } + skip_type: pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) @@ -5343,7 +5402,7 @@ skip_type: pmu->pmu_cpu_context 
= alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) - goto free_ird; + goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_context *cpuctx; @@ -5387,6 +5446,10 @@ unlock: return ret; +free_dev: + device_del(pmu->dev); + put_device(pmu->dev); + free_idr: if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); @@ -5412,6 +5475,8 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); + device_del(pmu->dev); + put_device(pmu->dev); free_pmu_context(pmu); } @@ -6603,3 +6668,31 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } + +static int __init perf_event_sysfs_init(void) +{ + struct pmu *pmu; + int ret; + + mutex_lock(&pmus_lock); + + ret = bus_register(&pmu_bus); + if (ret) + goto unlock; + + list_for_each_entry(pmu, &pmus, entry) { + if (!pmu->name || pmu->type < 0) + continue; + + ret = pmu_dev_alloc(pmu); + WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); + } + pmu_bus_running = 1; + ret = 0; + +unlock: + mutex_unlock(&pmus_lock); + + return ret; +} +device_initcall(perf_event_sysfs_init); -- cgit v1.2.2