| author | Frederic Weisbecker <fweisbec@gmail.com> | 2010-04-05 09:35:57 -0400 |
|---|---|---|
| committer | Frederic Weisbecker <fweisbec@gmail.com> | 2010-04-14 12:20:33 -0400 |
| commit | 76e1d9047e4edefb8ada20aa90d5762306082bd6 (patch) | |
| tree | 60384bd206878d2c440e07b33d97b1bb00a103dc /kernel/perf_event.c | |
| parent | c05556421742eb47f80301767653a4bcb19de9de (diff) | |
perf: Store active software events in a hashlist
Each time a software event triggers, we need to walk through
the entire list of events from the current cpu and task contexts
to retrieve a running perf event that matches. We also need to
check that a matching perf event is actually counting.

This walk is wasteful and makes the event fast path scale poorly
as the number of events running on the same contexts grows.

To solve this, we store the running perf events in a hashlist, so
that when they trigger they can be looked up immediately by their
type:event_id.
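
The lookup key is exactly what the new swevent_hash() helper in the diff below builds: the 32-bit event_id in the low half, the event type folded above it, hashed down to a bucket index with hash_64(). A minimal, stand-alone C sketch of that keying scheme follows; SWEVENT_HLIST_BITS is assumed to be 8 here (its real definition lives in the header side of this patch, outside the file shown below), and the multiplicative hash only mimics the kernel's <linux/hash.h> helper.

```c
#include <stdint.h>
#include <stdio.h>

#define SWEVENT_HLIST_BITS	8			/* assumed value, see above */
#define SWEVENT_HLIST_SIZE	(1 << SWEVENT_HLIST_BITS)

/*
 * Stand-in for the kernel's hash_64(): multiply by a 64-bit
 * golden-ratio constant and keep the top 'bits' bits as the
 * bucket index.
 */
static uint64_t hash_64(uint64_t val, unsigned int bits)
{
	return (val * 0x9e37fffffffc0001ULL) >> (64 - bits);
}

/*
 * Same key construction as the patch: event_id in the low
 * 32 bits, the event type folded above it.
 */
static uint64_t swevent_hash(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);

	return hash_64(val, SWEVENT_HLIST_BITS);
}

int main(void)
{
	/* e.g. PERF_TYPE_SOFTWARE (1), PERF_COUNT_SW_PAGE_FAULTS (2) */
	printf("bucket %llu of %d\n",
	       (unsigned long long)swevent_hash(1, 2),
	       SWEVENT_HLIST_SIZE);
	return 0;
}
```

On the fast path, do_perf_sw_event() then walks only the single bucket returned by find_swevent_head() instead of every event on the cpu and task contexts.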
v2: - Fix the SWEVENT_HLIST_SIZE definition (and re-learn some basic
      maths along the way).
    - Only allocate the hlist for online cpus, but keep track of the
      refcount on offline possible cpus too, so that the hlist can be
      allocated when a cpu comes online (the get/put pattern is
      sketched below).
    - Drop the kref use, as it no longer fits this scheme.

v3: - Fix a bad refcount check (address tested instead of value).
      Thanks to Eric Dumazet for spotting this.
    - On cpu exit, move the hlist release out of the IPI path so that
      the hlist mutex can be taken sanely.
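
The allocation policy from the v2 note is a refcounted, lazily allocated per-cpu table: every get bumps the cpu's refcount under hlist_mutex, but the table itself is only allocated while that cpu is online, and perf_event_init_cpu() allocates it later for any cpu whose refcount is already positive. Below is a simplified, single-threaded sketch of that get/put pattern; the names loosely mirror swevent_hlist_get_cpu()/swevent_hlist_put_cpu() from the diff, while the mutex, RCU publication and hotplug hooks are left out.

```c
#include <stdlib.h>

/* Stand-in for the per-cpu bucket table; the real struct also
 * carries an rcu_head for deferred freeing. */
struct swevent_hlist {
	void *heads[256];
};

struct cpu_state {
	int online;			/* cpu_online(cpu) in the patch */
	int hlist_refcount;
	struct swevent_hlist *hlist;
};

/* Take a reference; allocate the table only if the cpu is online.
 * Offline cpus still get their refcount bumped, so the table can be
 * allocated later when the cpu comes up. */
static int hlist_get_cpu(struct cpu_state *st)
{
	if (!st->hlist && st->online) {
		st->hlist = calloc(1, sizeof(*st->hlist));
		if (!st->hlist)
			return -1;	/* -ENOMEM in the patch */
	}
	st->hlist_refcount++;
	return 0;
}

/* Drop a reference; free the table once the last user is gone. */
static void hlist_put_cpu(struct cpu_state *st)
{
	if (!--st->hlist_refcount) {
		free(st->hlist);	/* call_rcu() + kfree() in the patch */
		st->hlist = NULL;
	}
}

int main(void)
{
	struct cpu_state cpu0 = { .online = 1 };

	hlist_get_cpu(&cpu0);	/* first user allocates */
	hlist_get_cpu(&cpu0);	/* second user just counts */
	hlist_put_cpu(&cpu0);
	hlist_put_cpu(&cpu0);	/* last put frees */
	return 0;
}
```

The error path of swevent_hlist_get() in the diff rolls back with exactly this kind of put on every cpu it had already processed.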
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r-- | kernel/perf_event.c | 246 |
1 file changed, 183 insertions(+), 63 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fcf42dcd6089..9efdfe5b8d3b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 	perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
-{
-	/*
-	 * The event is active, we're good!
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		return 1;
-
-	/*
-	 * The event is off/error, not counting.
-	 */
-	if (event->state != PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	/*
-	 * The event is inactive, if the context is active
-	 * we're part of a group that didn't make it on the 'pmu',
-	 * not counting.
-	 */
-	if (event->ctx->is_active)
-		return 0;
-
-	/*
-	 * We're inactive and the context is too, this means the
-	 * task is scheduled out, we're counting events that happen
-	 * to us, like migration events.
-	 */
-	return 1;
-}
-
 static int perf_tp_event_match(struct perf_event *event,
 				struct perf_sample_data *data);
 
@@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
-		return 0;
-
-	if (!perf_swevent_is_counting(event))
-		return 0;
-
 	if (event->attr.type != type)
 		return 0;
 
@@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event,
 	return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-				enum perf_type_id type,
-				u32 event_id, u64 nr, int nmi,
-				struct perf_sample_data *data,
-				struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+	u64 val = event_id | (type << 32);
+
+	return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+	u64 hash;
+	struct swevent_hlist *hlist;
+
+	hash = swevent_hash(type, event_id);
+
+	hlist = rcu_dereference(ctx->swevent_hlist);
+	if (!hlist)
+		return NULL;
+
+	return &hlist->heads[hash];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				u64 nr, int nmi,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
 {
+	struct perf_cpu_context *cpuctx;
 	struct perf_event *event;
+	struct hlist_node *node;
+	struct hlist_head *head;
 
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+
+	head = find_swevent_head(cpuctx, type, event_id);
+
+	if (!head)
+		goto end;
+
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
+end:
+	rcu_read_unlock();
 }
 
 int perf_swevent_get_recursion_context(void)
@@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				u64 nr, int nmi,
-				struct perf_sample_data *data,
-				struct pt_regs *regs)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	rcu_read_lock();
-	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-				 nr, nmi, data, regs);
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
-	rcu_read_unlock();
-}
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 		    struct pt_regs *regs, u64 addr)
@@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct perf_cpu_context *cpuctx;
+	struct hlist_head *head;
+
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
+
+	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	if (WARN_ON_ONCE(!head))
+		return -EINVAL;
+
+	hlist_add_head_rcu(&event->hlist_entry, head);
+
 	return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+	hlist_del_rcu(&event->hlist_entry);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 0;
 }
 
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
+{
+	struct swevent_hlist *hlist;
+
+	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+	kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+	struct swevent_hlist *hlist;
+
+	if (!cpuctx->swevent_hlist)
+		return;
+
+	hlist = cpuctx->swevent_hlist;
+	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!--cpuctx->hlist_refcount)
+		swevent_hlist_release(cpuctx);
+
+	mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+	int cpu;
+
+	if (event->cpu != -1) {
+		swevent_hlist_put_cpu(event, event->cpu);
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	int err = 0;
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		if (!hlist) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	cpuctx->hlist_refcount++;
+exit:
+	mutex_unlock(&cpuctx->hlist_mutex);
+
+	return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+	int err;
+	int cpu, failed_cpu;
+
+	if (event->cpu != -1)
+		return swevent_hlist_get_cpu(event, event->cpu);
+
+	get_online_cpus();
+	for_each_possible_cpu(cpu) {
+		err = swevent_hlist_get_cpu(event, cpu);
+		if (err) {
+			failed_cpu = cpu;
+			goto fail;
+		}
+	}
+	put_online_cpus();
+
+	return 0;
+fail:
+	for_each_possible_cpu(cpu) {
+		if (cpu == failed_cpu)
+			break;
+		swevent_hlist_put_cpu(event, cpu);
+	}
+
+	put_online_cpus();
+	return err;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
 	perf_trace_disable(event->attr.config);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
+	int err;
+
 	/*
 	 * Raw tracepoint data is a severe data leak, only allow root to
 	 * have these.
@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
+	err = swevent_hlist_get(event);
+	if (err) {
+		perf_trace_disable(event->attr.config);
+		return ERR_PTR(err);
+	}
 
 	return &perf_ops_generic;
 }
@@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	WARN_ON(event->parent);
 
 	atomic_dec(&perf_swevent_enabled[event_id]);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
 	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
+			int err;
+
+			err = swevent_hlist_get(event);
+			if (err)
+				return ERR_PTR(err);
+
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
 		}
@@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void)
 
 	for_each_possible_cpu(cpu) {
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		mutex_init(&cpuctx->hlist_mutex);
 		__perf_event_init_context(&cpuctx->ctx, NULL);
 	}
 }
@@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 	spin_unlock(&perf_resource_lock);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+	if (cpuctx->hlist_refcount > 0) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		WARN_ON_ONCE(!hlist);
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	mutex_unlock(&cpuctx->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu)
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
+	mutex_lock(&cpuctx->hlist_mutex);
+	swevent_hlist_release(cpuctx);
+	mutex_unlock(&cpuctx->hlist_mutex);
+
 	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
 	mutex_unlock(&ctx->mutex);