aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/perf_event.c
diff options
context:
space:
mode:
author Frederic Weisbecker <fweisbec@gmail.com> 2010-04-05 09:35:57 -0400
committer Frederic Weisbecker <fweisbec@gmail.com> 2010-04-14 12:20:33 -0400
commit 76e1d9047e4edefb8ada20aa90d5762306082bd6 (patch)
tree 60384bd206878d2c440e07b33d97b1bb00a103dc /kernel/perf_event.c
parent c05556421742eb47f80301767653a4bcb19de9de (diff)
perf: Store active software events in a hashlist
Each time a software event triggers, we need to walk through the entire list of events from the current cpu and task contexts to retrieve a running perf event that matches. We also need to check that a matching perf event is actually counting.

This walk is wasteful and makes the event fast path scale poorly as the number of events running on the same contexts grows.

To solve this, we store the running perf events in a hashlist so that we get immediate access to them, keyed by their type:event_id, when they trigger.

v2:
- Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths along the way)
- Only allocate hlist for online cpus, but keep track of the refcount on offline possible cpus too, so that we allocate it if needed when it becomes online.
- Drop the kref use as it's not adapted to our tricks anymore.

v3:
- Fix bad refcount check (address instead of value). Thanks to Eric Dumazet who spotted this.
- While exiting cpu, move the hlist release out of the IPI path to lock the hlist mutex sanely.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r-- kernel/perf_event.c 246
1 files changed, 183 insertions, 63 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fcf42dcd6089..9efdfe5b8d3b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/hash.h>
19#include <linux/sysfs.h> 20#include <linux/sysfs.h>
20#include <linux/dcache.h> 21#include <linux/dcache.h>
21#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3966 perf_swevent_overflow(event, 0, nmi, data, regs); 3967 perf_swevent_overflow(event, 0, nmi, data, regs);
3967} 3968}
3968 3969
3969static int perf_swevent_is_counting(struct perf_event *event)
3970{
3971 /*
3972 * The event is active, we're good!
3973 */
3974 if (event->state == PERF_EVENT_STATE_ACTIVE)
3975 return 1;
3976
3977 /*
3978 * The event is off/error, not counting.
3979 */
3980 if (event->state != PERF_EVENT_STATE_INACTIVE)
3981 return 0;
3982
3983 /*
3984 * The event is inactive, if the context is active
3985 * we're part of a group that didn't make it on the 'pmu',
3986 * not counting.
3987 */
3988 if (event->ctx->is_active)
3989 return 0;
3990
3991 /*
3992 * We're inactive and the context is too, this means the
3993 * task is scheduled out, we're counting events that happen
3994 * to us, like migration events.
3995 */
3996 return 1;
3997}
3998
3999static int perf_tp_event_match(struct perf_event *event, 3970static int perf_tp_event_match(struct perf_event *event,
4000 struct perf_sample_data *data); 3971 struct perf_sample_data *data);
4001 3972
@@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event,
4019 struct perf_sample_data *data, 3990 struct perf_sample_data *data,
4020 struct pt_regs *regs) 3991 struct pt_regs *regs)
4021{ 3992{
4022 if (event->cpu != -1 && event->cpu != smp_processor_id())
4023 return 0;
4024
4025 if (!perf_swevent_is_counting(event))
4026 return 0;
4027
4028 if (event->attr.type != type) 3993 if (event->attr.type != type)
4029 return 0; 3994 return 0;
4030 3995
@@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event,
4041 return 1; 4006 return 1;
4042} 4007}
4043 4008
4044static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4009static inline u64 swevent_hash(u64 type, u32 event_id)
4045 enum perf_type_id type, 4010{
4046 u32 event_id, u64 nr, int nmi, 4011 u64 val = event_id | (type << 32);
4047 struct perf_sample_data *data, 4012
4048 struct pt_regs *regs) 4013 return hash_64(val, SWEVENT_HLIST_BITS);
4014}
4015
4016static struct hlist_head *
4017find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4018{
4019 u64 hash;
4020 struct swevent_hlist *hlist;
4021
4022 hash = swevent_hash(type, event_id);
4023
4024 hlist = rcu_dereference(ctx->swevent_hlist);
4025 if (!hlist)
4026 return NULL;
4027
4028 return &hlist->heads[hash];
4029}
4030
4031static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4032 u64 nr, int nmi,
4033 struct perf_sample_data *data,
4034 struct pt_regs *regs)
4049{ 4035{
4036 struct perf_cpu_context *cpuctx;
4050 struct perf_event *event; 4037 struct perf_event *event;
4038 struct hlist_node *node;
4039 struct hlist_head *head;
4051 4040
4052 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4041 cpuctx = &__get_cpu_var(perf_cpu_context);
4042
4043 rcu_read_lock();
4044
4045 head = find_swevent_head(cpuctx, type, event_id);
4046
4047 if (!head)
4048 goto end;
4049
4050 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4053 if (perf_swevent_match(event, type, event_id, data, regs)) 4051 if (perf_swevent_match(event, type, event_id, data, regs))
4054 perf_swevent_add(event, nr, nmi, data, regs); 4052 perf_swevent_add(event, nr, nmi, data, regs);
4055 } 4053 }
4054end:
4055 rcu_read_unlock();
4056} 4056}
4057 4057
4058int perf_swevent_get_recursion_context(void) 4058int perf_swevent_get_recursion_context(void)
@@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx)
4090} 4090}
4091EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4091EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4092 4092
4093static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4094 u64 nr, int nmi,
4095 struct perf_sample_data *data,
4096 struct pt_regs *regs)
4097{
4098 struct perf_cpu_context *cpuctx;
4099 struct perf_event_context *ctx;
4100
4101 cpuctx = &__get_cpu_var(perf_cpu_context);
4102 rcu_read_lock();
4103 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4104 nr, nmi, data, regs);
4105 /*
4106 * doesn't really matter which of the child contexts the
4107 * events ends up in.
4108 */
4109 ctx = rcu_dereference(current->perf_event_ctxp);
4110 if (ctx)
4111 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4112 rcu_read_unlock();
4113}
4114 4093
4115void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4094void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4116 struct pt_regs *regs, u64 addr) 4095 struct pt_regs *regs, u64 addr)
@@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event)
4136static int perf_swevent_enable(struct perf_event *event) 4115static int perf_swevent_enable(struct perf_event *event)
4137{ 4116{
4138 struct hw_perf_event *hwc = &event->hw; 4117 struct hw_perf_event *hwc = &event->hw;
4118 struct perf_cpu_context *cpuctx;
4119 struct hlist_head *head;
4120
4121 cpuctx = &__get_cpu_var(perf_cpu_context);
4139 4122
4140 if (hwc->sample_period) { 4123 if (hwc->sample_period) {
4141 hwc->last_period = hwc->sample_period; 4124 hwc->last_period = hwc->sample_period;
4142 perf_swevent_set_period(event); 4125 perf_swevent_set_period(event);
4143 } 4126 }
4127
4128 head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
4129 if (WARN_ON_ONCE(!head))
4130 return -EINVAL;
4131
4132 hlist_add_head_rcu(&event->hlist_entry, head);
4133
4144 return 0; 4134 return 0;
4145} 4135}
4146 4136
4147static void perf_swevent_disable(struct perf_event *event) 4137static void perf_swevent_disable(struct perf_event *event)
4148{ 4138{
4139 hlist_del_rcu(&event->hlist_entry);
4149} 4140}
4150 4141
4151static const struct pmu perf_ops_generic = { 4142static const struct pmu perf_ops_generic = {
@@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event,
4359 return 0; 4350 return 0;
4360} 4351}
4361 4352
4353static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4354{
4355 struct swevent_hlist *hlist;
4356
4357 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4358 kfree(hlist);
4359}
4360
4361static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4362{
4363 struct swevent_hlist *hlist;
4364
4365 if (!cpuctx->swevent_hlist)
4366 return;
4367
4368 hlist = cpuctx->swevent_hlist;
4369 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4370 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4371}
4372
4373static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4374{
4375 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4376
4377 mutex_lock(&cpuctx->hlist_mutex);
4378
4379 if (!--cpuctx->hlist_refcount)
4380 swevent_hlist_release(cpuctx);
4381
4382 mutex_unlock(&cpuctx->hlist_mutex);
4383}
4384
4385static void swevent_hlist_put(struct perf_event *event)
4386{
4387 int cpu;
4388
4389 if (event->cpu != -1) {
4390 swevent_hlist_put_cpu(event, event->cpu);
4391 return;
4392 }
4393
4394 for_each_possible_cpu(cpu)
4395 swevent_hlist_put_cpu(event, cpu);
4396}
4397
4398static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4399{
4400 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4401 int err = 0;
4402
4403 mutex_lock(&cpuctx->hlist_mutex);
4404
4405 if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
4406 struct swevent_hlist *hlist;
4407
4408 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4409 if (!hlist) {
4410 err = -ENOMEM;
4411 goto exit;
4412 }
4413 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4414 }
4415 cpuctx->hlist_refcount++;
4416 exit:
4417 mutex_unlock(&cpuctx->hlist_mutex);
4418
4419 return err;
4420}
4421
4422static int swevent_hlist_get(struct perf_event *event)
4423{
4424 int err;
4425 int cpu, failed_cpu;
4426
4427 if (event->cpu != -1)
4428 return swevent_hlist_get_cpu(event, event->cpu);
4429
4430 get_online_cpus();
4431 for_each_possible_cpu(cpu) {
4432 err = swevent_hlist_get_cpu(event, cpu);
4433 if (err) {
4434 failed_cpu = cpu;
4435 goto fail;
4436 }
4437 }
4438 put_online_cpus();
4439
4440 return 0;
4441 fail:
4442 for_each_possible_cpu(cpu) {
4443 if (cpu == failed_cpu)
4444 break;
4445 swevent_hlist_put_cpu(event, cpu);
4446 }
4447
4448 put_online_cpus();
4449 return err;
4450}
4451
4362static void tp_perf_event_destroy(struct perf_event *event) 4452static void tp_perf_event_destroy(struct perf_event *event)
4363{ 4453{
4364 perf_trace_disable(event->attr.config); 4454 perf_trace_disable(event->attr.config);
4455 swevent_hlist_put(event);
4365} 4456}
4366 4457
4367static const struct pmu *tp_perf_event_init(struct perf_event *event) 4458static const struct pmu *tp_perf_event_init(struct perf_event *event)
4368{ 4459{
4460 int err;
4461
4369 /* 4462 /*
4370 * Raw tracepoint data is a severe data leak, only allow root to 4463 * Raw tracepoint data is a severe data leak, only allow root to
4371 * have these. 4464 * have these.
@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4379 return NULL; 4472 return NULL;
4380 4473
4381 event->destroy = tp_perf_event_destroy; 4474 event->destroy = tp_perf_event_destroy;
4475 err = swevent_hlist_get(event);
4476 if (err) {
4477 perf_trace_disable(event->attr.config);
4478 return ERR_PTR(err);
4479 }
4382 4480
4383 return &perf_ops_generic; 4481 return &perf_ops_generic;
4384} 4482}
@@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4479 WARN_ON(event->parent); 4577 WARN_ON(event->parent);
4480 4578
4481 atomic_dec(&perf_swevent_enabled[event_id]); 4579 atomic_dec(&perf_swevent_enabled[event_id]);
4580 swevent_hlist_put(event);
4482} 4581}
4483 4582
4484static const struct pmu *sw_perf_event_init(struct perf_event *event) 4583static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4517 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4616 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4518 case PERF_COUNT_SW_EMULATION_FAULTS: 4617 case PERF_COUNT_SW_EMULATION_FAULTS:
4519 if (!event->parent) { 4618 if (!event->parent) {
4619 int err;
4620
4621 err = swevent_hlist_get(event);
4622 if (err)
4623 return ERR_PTR(err);
4624
4520 atomic_inc(&perf_swevent_enabled[event_id]); 4625 atomic_inc(&perf_swevent_enabled[event_id]);
4521 event->destroy = sw_perf_event_destroy; 4626 event->destroy = sw_perf_event_destroy;
4522 } 4627 }
@@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void)
5389 5494
5390 for_each_possible_cpu(cpu) { 5495 for_each_possible_cpu(cpu) {
5391 cpuctx = &per_cpu(perf_cpu_context, cpu); 5496 cpuctx = &per_cpu(perf_cpu_context, cpu);
5497 mutex_init(&cpuctx->hlist_mutex);
5392 __perf_event_init_context(&cpuctx->ctx, NULL); 5498 __perf_event_init_context(&cpuctx->ctx, NULL);
5393 } 5499 }
5394} 5500}
@@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
5402 spin_lock(&perf_resource_lock); 5508 spin_lock(&perf_resource_lock);
5403 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5509 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5404 spin_unlock(&perf_resource_lock); 5510 spin_unlock(&perf_resource_lock);
5511
5512 mutex_lock(&cpuctx->hlist_mutex);
5513 if (cpuctx->hlist_refcount > 0) {
5514 struct swevent_hlist *hlist;
5515
5516 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5517 WARN_ON_ONCE(!hlist);
5518 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5519 }
5520 mutex_unlock(&cpuctx->hlist_mutex);
5405} 5521}
5406 5522
5407#ifdef CONFIG_HOTPLUG_CPU 5523#ifdef CONFIG_HOTPLUG_CPU
@@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu)
5421 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5537 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5422 struct perf_event_context *ctx = &cpuctx->ctx; 5538 struct perf_event_context *ctx = &cpuctx->ctx;
5423 5539
5540 mutex_lock(&cpuctx->hlist_mutex);
5541 swevent_hlist_release(cpuctx);
5542 mutex_unlock(&cpuctx->hlist_mutex);
5543
5424 mutex_lock(&ctx->mutex); 5544 mutex_lock(&ctx->mutex);
5425 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5545 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5426 mutex_unlock(&ctx->mutex); 5546 mutex_unlock(&ctx->mutex);