From 9b33fa6ba0e2f90fdf407501db801c2511121564 Mon Sep 17 00:00:00 2001 From: "eranian@google.com" Date: Wed, 10 Mar 2010 22:26:05 -0800 Subject: perf_events: Improve task_sched_in() This patch is an optimization in perf_event_task_sched_in() to avoid scheduling the events twice in a row. Without it, the perf_disable()/perf_enable() pair is invoked twice, thereby pinned events count while the flexible events are being scheduled, and we go through hw_perf_enable() twice. By encapsulating the whole sequence in a single perf_disable()/perf_enable() pair, we ensure that hw_perf_enable() is invoked only once, thanks to the refcount protection. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1268288765-5326-1-git-send-email-eranian@google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 52c69a34d697..3853d49c7d56 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1368,6 +1368,8 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; + perf_disable(); + /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1380,6 +1382,8 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + perf_enable(); } #define MAX_INTERRUPTS (~0ULL) -- cgit v1.2.2 From 1d199b1ad606ae8b88acebd295b101c4e1cf2a57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Mar 2010 01:05:02 +0100 Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs() is exported for the overridden x86 version, but not for the generic weak version. As a general rule, weak functions should not have their symbol exported in the same file in which they are defined. So let's export it in trace_event_perf.c, as it is used by trace events only. This fixes: ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined! ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined! -v2: And also only build it if trace events are enabled. -v3: Fix changelog mistake Reported-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Paul Mackerras LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8bf61273c58b..455393e71cab 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2790,10 +2790,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +#ifdef CONFIG_EVENT_TRACING __weak void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { } +#endif /* * Output -- cgit v1.2.2 From 3326c1ceee234e63160852720d48be8a8f7a6d08 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2010 19:09:33 +0100 Subject: perf_event: Make perf fd non seekable A perf event fd does not need seeking, so prevent it in order to get rid of default_llseek, which uses the BKL.
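For illustration only, here is a minimal userspace check of the behaviour this patch introduces (a sketch, not part of the patch: it assumes a kernel with this change applied and invokes the perf_event_open() syscall directly; error handling is abbreviated):

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    int main(void)
    {
            struct perf_event_attr attr = {
                    .type   = PERF_TYPE_SOFTWARE,
                    .size   = sizeof(attr),
                    .config = PERF_COUNT_SW_CPU_CLOCK,
            };
            /* Count the current task on any CPU. */
            int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

            if (fd < 0) {
                    perror("perf_event_open");
                    return 1;
            }

            /* With .llseek = no_llseek, seeking should fail with ESPIPE. */
            if (lseek(fd, 0, SEEK_SET) < 0 && errno == ESPIPE)
                    printf("perf fd is non-seekable, as expected\n");

            close(fd);
            return 0;
    }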
Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar [drop the nonseekable_open, not needed for anon inodes] Signed-off-by: Frederic Weisbecker --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 63fbce1c80b5..4aa50ff4efc0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2645,6 +2645,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, -- cgit v1.2.2 From 76e1d9047e4edefb8ada20aa90d5762306082bd6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 5 Apr 2010 15:35:57 +0200 Subject: perf: Store active software events in a hashlist Each time a software event triggers, we need to walk through the entire list of events from the current cpu and task contexts to retrieve a running perf event that matches. We also need to check that a matching perf event is actually counting. This walk is wasteful and makes the event fast path scale poorly as the number of events running in the same contexts grows. To solve this, we store the running perf events in a hashlist, so that when they trigger we get immediate access to them, keyed by their type:event_id. v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths along the way) - Only allocate the hlist for online cpus, but keep track of the refcount on offline possible cpus too, so that we can allocate it if needed when a cpu comes online. - Drop the kref use as it's not adapted to our tricks anymore. v3: - Fix bad refcount check (address instead of value). Thanks to Eric Dumazet who spotted this. - While exiting a cpu, move the hlist release out of the IPI path so the hlist mutex can be locked sanely. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/perf_event.c | 246 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 183 insertions(+), 63 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fcf42dcd6089..9efdfe5b8d3b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/hash.h> #include #include #include @@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. 
- */ - return 1; -} - static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data); @@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event, return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + u64 hash; + struct swevent_hlist *hlist; + + hash = swevent_hash(type, event_id); + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return &hlist->heads[hash]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) @@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx) } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. 
- */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); -} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); } static const struct pmu perf_ops_generic = { @@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event, return 0; } +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist; + + if (!cpuctx->swevent_hlist) + return; + + hlist = cpuctx->swevent_hlist; + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!cpuctx->swevent_hlist && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); + swevent_hlist_put(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. 
@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return NULL; event->destroy = tp_perf_event_destroy; + err = swevent_hlist_get(event); + if (err) { + perf_trace_disable(event->attr.config); + return ERR_PTR(err); + } return &perf_ops_generic; } @@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); __perf_event_init_context(&cpuctx->ctx, NULL); } } @@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu) spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); + + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); -- cgit v1.2.2 From df8290bf7ea6b3051e2f315579a6e829309ec1ed Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2010 00:28:14 +0200 Subject: perf: Make clock software events consistent with general exclusion rules The cpu/task clock events implement their own version of exclusion on top of exclude_user and exclude_kernel. The result is that when the event triggers in the kernel but exclude_kernel is set, we try to rewind using task_pt_regs(). There are two side effects of this: - we call task_pt_regs() even on kernel threads, which doesn't give us the desired result. - if the event occurred in the kernel, we shouldn't rewind to the user context. We want to actually ignore the event. get_irq_regs() will always give us the right interrupted context, so use its result and submit it to perf_exclude_event(), which knows when an event must be ignored. 
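perf_exclude_event() itself is outside the hunk below; for reference, its logic at the time amounted to roughly the following (a reconstruction from the exclusion rules described above, not a verbatim copy of the kernel source):

    /*
     * Reject a sample whose privilege mode contradicts the event's
     * exclude bits; the hrtimer path below then drops such samples
     * outright instead of rewinding to the user context.
     */
    static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
    {
            if (regs) {
                    if (event->attr.exclude_user && user_mode(regs))
                            return 1;
                    if (event->attr.exclude_kernel && !user_mode(regs))
                            return 1;
            }
            return 0;
    }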
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/perf_event.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9efdfe5b8d3b..095101d685bc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4164,15 +4164,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; -- cgit v1.2.2 From 95476b64ab11d528de2557366ec584977c215b9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Apr 2010 23:42:18 +0200 Subject: perf: Fix hlist related build error hlist helpers need to be available for all software events, not only trace events. Pull them out of the ifdef CONFIG_EVENT_TRACING section. Fixes: kernel/perf_event.c:4573: error: implicit declaration of function 'swevent_hlist_put' kernel/perf_event.c:4614: error: implicit declaration of function 'swevent_hlist_get' kernel/perf_event.c:5534: error: implicit declaration of function 'swevent_hlist_release' Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1271281338-23491-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 095101d685bc..07b7a435bf03 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4313,36 +4313,6 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_TRACING - -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size, struct pt_regs *regs) -{ - struct perf_sample_data data; - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr); - data.raw = &raw; - - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) { struct swevent_hlist *hlist; @@ -4442,6 +4412,36 @@ static int swevent_hlist_get(struct perf_event *event) return err; } +#ifdef CONFIG_EVENT_TRACING + +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size, struct pt_regs *regs) +{ + struct perf_sample_data data; + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + /* Trace events already protected against recursion */ + 
do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); -- cgit v1.2.2 From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 19 Apr 2010 13:32:41 +0800 Subject: perf: Enhance perf to allow for guest statistic collection from host The patch below introduces perf_guest_info_callbacks and related register/unregister functions, and adds more PERF_RECORD_MISC_XXX bits to mark samples as coming from guest kernel or guest user space. Signed-off-by: Zhang Yanmin Signed-off-by: Avi Kivity --- kernel/perf_event.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 07b7a435bf03..9dbe8cdaf145 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2797,6 +2797,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski } +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + /* * Output */ @@ -3749,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ -- cgit v1.2.2 From 048c852051d2bd5da54a4488bc1f16b0fc74c695 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 1 May 2010 10:11:35 +0200 Subject: perf: Fix resource leak in failure path of perf_event_open() perf_event_open() kfrees the event after an init failure, which doesn't release all the resources allocated by perf_event_alloc(). Use free_event() instead. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: LKML-Reference: <4BDBE237.1040809@kernel.org> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2f3fbf84215a..3d1552d3c12b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4897,7 +4897,7 @@ err_fput_free_put_context: err_free_put_context: if (err < 0) - kfree(event); + free_event(event); err_put_context: if (err < 0) -- cgit v1.2.2 From 4fd38e4595e2f6c9d27732c042a0e16b2753049c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2010 17:31:38 +0200 Subject: perf: Fix exit() vs PERF_FORMAT_GROUP Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as expected if the task the counters were attached to quit before the read() call. The cause is that we unconditionally destroy the grouping when we remove counters from their context. Fix this by only destroying the grouping when we free the counter itself. 
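For context, the userspace view of a PERF_FORMAT_GROUP read is a single record covering the whole group. A sketch of the layout when no other read-format bits are set (the struct name is illustrative, not from the kernel headers):

    #include <linux/types.h>

    /* What read() on a group leader returns with PERF_FORMAT_GROUP alone. */
    struct group_read {
            __u64 nr;          /* number of events in the group */
            __u64 values[];    /* one counter value per group member */
    };

Before this fix, if the monitored task exited before the read(), the grouping had already been destroyed, so the record no longer covered all the siblings.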
Reported-by: Corey Ashford Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1273160566.5605.404.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d1552d3c12b..f13c3db765f4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -341,6 +341,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; + if (event->state > PERF_EVENT_STATE_FREE) + return; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -1856,6 +1859,8 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + event->state = PERF_EVENT_STATE_FREE; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); -- cgit v1.2.2 From a0507c84bf47dfd204299774f45fd16da33f0619 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2010 15:42:53 +0200 Subject: perf: Annotate perf_event_read_group() vs perf_event_release_kernel() Stephane reported a lockdep warning while using PERF_FORMAT_GROUP. The issue is that perf_event_read_group() takes faults while holding the ctx->mutex, while perf_event_release_kernel() can be called from munmap(). Together these make for an AB-BA deadlock. However, we can never actually establish the deadlock, because we only ever call perf_event_release_kernel() after all file descriptors are dead, so no concurrency is possible. Reported-by: Stephane Eranian Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 49d8be5a45e3..34659d4085c7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1867,7 +1867,19 @@ int perf_event_release_kernel(struct perf_event *event) event->state = PERF_EVENT_STATE_FREE; WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); perf_event_remove_from_context(event); mutex_unlock(&ctx->mutex); @@ -5305,7 +5317,7 @@ void perf_event_exit_task(struct task_struct *child) * * But since its the parent context it won't be the same instance. */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&child_ctx->mutex); again: list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, -- cgit v1.2.2 From 6bde9b6ce0127e2a56228a2071536d422be31336 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 23 Apr 2010 13:56:00 +0800 Subject: perf: Add group scheduling transactional APIs Add group scheduling transactional APIs to struct pmu. These APIs will be implemented in arch code, based on Peter's idea as below. 
> the idea behind hw_perf_group_sched_in() is to not perform > schedulability tests on each event in the group, but to add the group > as a whole and then perform one test. > > Of course, when that test fails, you'll have to roll-back the whole > group again. > > So start_txn (or a better name) would simply toggle a flag in the pmu > implementation that will make pmu::enable() not perform the > schedulability test. > > Then commit_txn() will perform the schedulability test (so note the > method has to have a !void return value). > > This will allow us to use the regular > kernel/perf_event.c::group_sched_in() and all the rollback code. > Currently each hw_perf_group_sched_in() implementation duplicates all > the rollback code (with various bugs). ->start_txn: Start group events scheduling transaction; set a flag to make pmu::enable() not perform the schedulability test, which will be performed at commit time. ->commit_txn: Commit group events scheduling transaction; perform the schedulability test on the group as a whole. ->cancel_txn: Stop group events scheduling transaction; clear the flag so pmu::enable() will perform the schedulability test. Reviewed-by: Stephane Eranian Reviewed-by: Frederic Weisbecker Signed-off-by: Lin Ming Cc: David Miller Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1272002160.5707.60.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 34659d4085c7..bb06382f98e7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -83,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - return 0; -} - void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -644,15 +636,20 @@ group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - struct perf_event *event, *partial_group; + struct perf_event *event, *partial_group = NULL; + const struct pmu *pmu = group_event->pmu; + bool txn = false; int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); - if (ret) - return ret < 0 ? ret : 0; + /* Check if group transaction availabe */ + if (pmu->start_txn) + txn = true; + + if (txn) + pmu->start_txn(pmu); if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; @@ -667,9 +664,19 @@ group_sched_in(struct perf_event *group_event, } } - return 0; + if (txn) { + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + + return 0; + } + } group_error: + if (txn) + pmu->cancel_txn(pmu); + /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: -- cgit v1.2.2 From 6e85158cf5a2385264316870256fb6ad681156a0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 8 May 2010 20:58:00 +1000 Subject: perf_event: Make software events work again Commit 6bde9b6ce0127e2a56228a2071536d422be31336 ("perf: Add group scheduling transactional APIs") added code to allow a group to be scheduled in a single transaction. 
However, it introduced a bug in handling events whose pmu does not implement transactions -- after scheduling in the events of the group, the non-transactional case now falls through to the group_error label and proceeds to unschedule all the events in the group and return failure. This fixes it by returning 0 (success) in the non-transactional case. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Lin Ming Cc: Frederic Weisbecker Cc: eranian@gmail.com LKML-Reference: <20100508105800.GB10650@brick.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bb06382f98e7..180151ff8376 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -664,13 +664,13 @@ group_sched_in(struct perf_event *group_event, } } - if (txn) { - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); + if (!txn) + return 0; - return 0; - } + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + return 0; } group_error: -- cgit v1.2.2 From e3174cfd2a1e28fff774681f00a0eef3d31da970 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 May 2010 08:31:49 +0200 Subject: Revert "perf: Fix exit() vs PERF_FORMAT_GROUP" This reverts commit 4fd38e4595e2f6c9d27732c042a0e16b2753049c. It causes various crashes and hangs when events are activated. The cause is not fully understood yet, but we need to revert it because the effects are severe. Reported-by: Stephane Eranian Reported-by: Lin Ming Cc: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 180151ff8376..a9047463fd83 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -334,9 +334,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; - if (event->state > PERF_EVENT_STATE_FREE) - return; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -1871,8 +1868,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - event->state = PERF_EVENT_STATE_FREE; - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: -- cgit v1.2.2 From 050735b08ca8a016bbace4445fa025b88fee770b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 May 2010 11:51:53 +0200 Subject: perf: Fix exit() vs PERF_FORMAT_GROUP Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as expected if the task the counters were attached to quit before the read() call. The cause is that we unconditionally destroy the grouping when we remove counters from their context. Fix this by splitting the group destruction off from the list removal, so that perf_event_remove_from_context() no longer does it, and change perf_event_release() to do it instead. 
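A quick way to see the symptom this addresses (a sketch with hypothetical helpers: perf_open() wraps the perf_event_open() syscall, and group_read is the layout sketched earlier; error handling omitted):

    /* Open a two-event group on a child task, let the child exit,
     * then read the leader: nr should still report both events. */
    attr.read_format = PERF_FORMAT_GROUP;
    int leader  = perf_open(&attr, child_pid, -1, -1);      /* group leader */
    int sibling = perf_open(&attr, child_pid, -1, leader);  /* second event */

    waitpid(child_pid, NULL, 0);                /* target task exits */

    char buf[4096];
    read(leader, buf, sizeof(buf));
    struct group_read *gr = (struct group_read *)buf;
    /* With the bug, gr->nr drops to 1 once the task dies;
     * with this fix it remains 2. */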
Reported-by: Corey Ashford Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: # .34.x LKML-Reference: <1273571513.5605.3527.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 19 ++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a9047463fd83..c97e82518403 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -308,8 +308,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *sibling, *tmp; - if (list_empty(&event->group_entry)) return; ctx->nr_events--; @@ -333,6 +331,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; +} + +static void +perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; /* * If this was a group event with sibling events then @@ -1868,6 +1872,12 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: @@ -1882,7 +1892,10 @@ int perf_event_release_kernel(struct perf_event *event) * to trigger the AB-BA case. */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - perf_event_remove_from_context(event); + raw_spin_lock_irq(&ctx->lock); + list_del_event(event, ctx); + perf_destroy_group(event, ctx); + raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); mutex_lock(&event->owner->perf_event_mutex); -- cgit v1.2.2 From 96c21a460a37880abfbc8445d5b098dbab958a29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 May 2010 16:19:10 +0200 Subject: perf: Fix exit() vs event-groups Corey reported that the time values used to scale the counts of group siblings (total_time_enabled/total_time_running) are not updated when the monitored task dies. The problem appears to be that we only update the group leader's time values; fix it by updating the whole group. Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: # .34.x LKML-Reference: <1273588935.1810.6.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c97e82518403..a4fa381db3c2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -255,6 +255,18 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +/* + * Update total_time_enabled and total_time_running for all events in a group. 
+ */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -320,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; - update_event_times(event); + update_group_times(event); /* * If event was in error state, then keep it @@ -501,18 +513,6 @@ retry: raw_spin_unlock_irq(&ctx->lock); } -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - /* * Cross CPU call to disable a performance event */ -- cgit v1.2.2