diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 15:54:49 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 15:54:49 -0400 |
commit | 5d70f79b5ef6ea2de4f72a37b2d96e2601e40a22 (patch) | |
tree | a0d6de0930ba83ecf4629c2e2e261f5eaa2d8f33 /kernel/perf_event.c | |
parent | 888a6f77e0418b049f83d37547c209b904d30af4 (diff) | |
parent | 750ed158bf6c782d2813da1bca2c824365a0b777 (diff) |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (163 commits)
tracing: Fix compile issue for trace_sched_wakeup.c
[S390] hardirq: remove pointless header file includes
[IA64] Move local_softirq_pending() definition
perf, powerpc: Fix power_pmu_event_init to not use event->ctx
ftrace: Remove recursion between recordmcount and scripts/mod/empty
jump_label: Add COND_STMT(), reducer wrappery
perf: Optimize sw events
perf: Use jump_labels to optimize the scheduler hooks
jump_label: Add atomic_t interface
jump_label: Use more consistent naming
perf, hw_breakpoint: Fix crash in hw_breakpoint creation
perf: Find task before event alloc
perf: Fix task refcount bugs
perf: Fix group moving
irq_work: Add generic hardirq context callbacks
perf_events: Fix transaction recovery in group_sched_in()
perf_events: Fix bogus AMD64 generic TLB events
perf_events: Fix bogus context time tracking
tracing: Remove parent recording in latency tracer graph options
tracing: Use one prologue for the preempt irqs off tracer function tracers
...
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r-- | kernel/perf_event.c | 2592 |
1 files changed, 1517 insertions, 1075 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b98bed3d818..f309e8014c7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -31,24 +31,18 @@ | |||
31 | #include <linux/kernel_stat.h> | 31 | #include <linux/kernel_stat.h> |
32 | #include <linux/perf_event.h> | 32 | #include <linux/perf_event.h> |
33 | #include <linux/ftrace_event.h> | 33 | #include <linux/ftrace_event.h> |
34 | #include <linux/hw_breakpoint.h> | ||
35 | 34 | ||
36 | #include <asm/irq_regs.h> | 35 | #include <asm/irq_regs.h> |
37 | 36 | ||
38 | /* | 37 | atomic_t perf_task_events __read_mostly; |
39 | * Each CPU has a list of per CPU events: | ||
40 | */ | ||
41 | static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
42 | |||
43 | int perf_max_events __read_mostly = 1; | ||
44 | static int perf_reserved_percpu __read_mostly; | ||
45 | static int perf_overcommit __read_mostly = 1; | ||
46 | |||
47 | static atomic_t nr_events __read_mostly; | ||
48 | static atomic_t nr_mmap_events __read_mostly; | 38 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; | 39 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; | 40 | static atomic_t nr_task_events __read_mostly; |
51 | 41 | ||
42 | static LIST_HEAD(pmus); | ||
43 | static DEFINE_MUTEX(pmus_lock); | ||
44 | static struct srcu_struct pmus_srcu; | ||
45 | |||
52 | /* | 46 | /* |
53 | * perf event paranoia level: | 47 | * perf event paranoia level: |
54 | * -1 - not paranoid at all | 48 | * -1 - not paranoid at all |
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; | |||
67 | 61 | ||
68 | static atomic64_t perf_event_id; | 62 | static atomic64_t perf_event_id; |
69 | 63 | ||
70 | /* | 64 | void __weak perf_event_print_debug(void) { } |
71 | * Lock for (sysadmin-configurable) event reservations: | ||
72 | */ | ||
73 | static DEFINE_SPINLOCK(perf_resource_lock); | ||
74 | 65 | ||
75 | /* | 66 | extern __weak const char *perf_pmu_name(void) |
76 | * Architecture provided APIs - weak aliases: | ||
77 | */ | ||
78 | extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) | ||
79 | { | 67 | { |
80 | return NULL; | 68 | return "pmu"; |
81 | } | 69 | } |
82 | 70 | ||
83 | void __weak hw_perf_disable(void) { barrier(); } | 71 | void perf_pmu_disable(struct pmu *pmu) |
84 | void __weak hw_perf_enable(void) { barrier(); } | 72 | { |
85 | 73 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | |
86 | void __weak perf_event_print_debug(void) { } | 74 | if (!(*count)++) |
87 | 75 | pmu->pmu_disable(pmu); | |
88 | static DEFINE_PER_CPU(int, perf_disable_count); | 76 | } |
89 | 77 | ||
90 | void perf_disable(void) | 78 | void perf_pmu_enable(struct pmu *pmu) |
91 | { | 79 | { |
92 | if (!__get_cpu_var(perf_disable_count)++) | 80 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
93 | hw_perf_disable(); | 81 | if (!--(*count)) |
82 | pmu->pmu_enable(pmu); | ||
94 | } | 83 | } |
95 | 84 | ||
96 | void perf_enable(void) | 85 | static DEFINE_PER_CPU(struct list_head, rotation_list); |
86 | |||
87 | /* | ||
88 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
89 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
90 | * disabled, while rotate_context is called from IRQ context. | ||
91 | */ | ||
92 | static void perf_pmu_rotate_start(struct pmu *pmu) | ||
97 | { | 93 | { |
98 | if (!--__get_cpu_var(perf_disable_count)) | 94 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
99 | hw_perf_enable(); | 95 | struct list_head *head = &__get_cpu_var(rotation_list); |
96 | |||
97 | WARN_ON(!irqs_disabled()); | ||
98 | |||
99 | if (list_empty(&cpuctx->rotation_list)) | ||
100 | list_add(&cpuctx->rotation_list, head); | ||
100 | } | 101 | } |
101 | 102 | ||
102 | static void get_ctx(struct perf_event_context *ctx) | 103 | static void get_ctx(struct perf_event_context *ctx) |
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event) | |||
151 | * the context could get moved to another task. | 152 | * the context could get moved to another task. |
152 | */ | 153 | */ |
153 | static struct perf_event_context * | 154 | static struct perf_event_context * |
154 | perf_lock_task_context(struct task_struct *task, unsigned long *flags) | 155 | perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) |
155 | { | 156 | { |
156 | struct perf_event_context *ctx; | 157 | struct perf_event_context *ctx; |
157 | 158 | ||
158 | rcu_read_lock(); | 159 | rcu_read_lock(); |
159 | retry: | 160 | retry: |
160 | ctx = rcu_dereference(task->perf_event_ctxp); | 161 | ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); |
161 | if (ctx) { | 162 | if (ctx) { |
162 | /* | 163 | /* |
163 | * If this context is a clone of another, it might | 164 | * If this context is a clone of another, it might |
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
170 | * can't get swapped on us any more. | 171 | * can't get swapped on us any more. |
171 | */ | 172 | */ |
172 | raw_spin_lock_irqsave(&ctx->lock, *flags); | 173 | raw_spin_lock_irqsave(&ctx->lock, *flags); |
173 | if (ctx != rcu_dereference(task->perf_event_ctxp)) { | 174 | if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { |
174 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); | 175 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
175 | goto retry; | 176 | goto retry; |
176 | } | 177 | } |
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
189 | * can't get swapped to another task. This also increments its | 190 | * can't get swapped to another task. This also increments its |
190 | * reference count so that the context can't get freed. | 191 | * reference count so that the context can't get freed. |
191 | */ | 192 | */ |
192 | static struct perf_event_context *perf_pin_task_context(struct task_struct *task) | 193 | static struct perf_event_context * |
194 | perf_pin_task_context(struct task_struct *task, int ctxn) | ||
193 | { | 195 | { |
194 | struct perf_event_context *ctx; | 196 | struct perf_event_context *ctx; |
195 | unsigned long flags; | 197 | unsigned long flags; |
196 | 198 | ||
197 | ctx = perf_lock_task_context(task, &flags); | 199 | ctx = perf_lock_task_context(task, ctxn, &flags); |
198 | if (ctx) { | 200 | if (ctx) { |
199 | ++ctx->pin_count; | 201 | ++ctx->pin_count; |
200 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 202 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
302 | } | 304 | } |
303 | 305 | ||
304 | list_add_rcu(&event->event_entry, &ctx->event_list); | 306 | list_add_rcu(&event->event_entry, &ctx->event_list); |
307 | if (!ctx->nr_events) | ||
308 | perf_pmu_rotate_start(ctx->pmu); | ||
305 | ctx->nr_events++; | 309 | ctx->nr_events++; |
306 | if (event->attr.inherit_stat) | 310 | if (event->attr.inherit_stat) |
307 | ctx->nr_stat++; | 311 | ctx->nr_stat++; |
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) | |||
311 | { | 315 | { |
312 | struct perf_event *group_leader = event->group_leader; | 316 | struct perf_event *group_leader = event->group_leader; |
313 | 317 | ||
314 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); | 318 | /* |
319 | * We can have double attach due to group movement in perf_event_open. | ||
320 | */ | ||
321 | if (event->attach_state & PERF_ATTACH_GROUP) | ||
322 | return; | ||
323 | |||
315 | event->attach_state |= PERF_ATTACH_GROUP; | 324 | event->attach_state |= PERF_ATTACH_GROUP; |
316 | 325 | ||
317 | if (group_leader == event) | 326 | if (group_leader == event) |
@@ -408,8 +417,8 @@ event_filter_match(struct perf_event *event) | |||
408 | return event->cpu == -1 || event->cpu == smp_processor_id(); | 417 | return event->cpu == -1 || event->cpu == smp_processor_id(); |
409 | } | 418 | } |
410 | 419 | ||
411 | static void | 420 | static int |
412 | event_sched_out(struct perf_event *event, | 421 | __event_sched_out(struct perf_event *event, |
413 | struct perf_cpu_context *cpuctx, | 422 | struct perf_cpu_context *cpuctx, |
414 | struct perf_event_context *ctx) | 423 | struct perf_event_context *ctx) |
415 | { | 424 | { |
@@ -428,15 +437,14 @@ event_sched_out(struct perf_event *event, | |||
428 | } | 437 | } |
429 | 438 | ||
430 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 439 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
431 | return; | 440 | return 0; |
432 | 441 | ||
433 | event->state = PERF_EVENT_STATE_INACTIVE; | 442 | event->state = PERF_EVENT_STATE_INACTIVE; |
434 | if (event->pending_disable) { | 443 | if (event->pending_disable) { |
435 | event->pending_disable = 0; | 444 | event->pending_disable = 0; |
436 | event->state = PERF_EVENT_STATE_OFF; | 445 | event->state = PERF_EVENT_STATE_OFF; |
437 | } | 446 | } |
438 | event->tstamp_stopped = ctx->time; | 447 | event->pmu->del(event, 0); |
439 | event->pmu->disable(event); | ||
440 | event->oncpu = -1; | 448 | event->oncpu = -1; |
441 | 449 | ||
442 | if (!is_software_event(event)) | 450 | if (!is_software_event(event)) |
@@ -444,6 +452,19 @@ event_sched_out(struct perf_event *event, | |||
444 | ctx->nr_active--; | 452 | ctx->nr_active--; |
445 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 453 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
446 | cpuctx->exclusive = 0; | 454 | cpuctx->exclusive = 0; |
455 | return 1; | ||
456 | } | ||
457 | |||
458 | static void | ||
459 | event_sched_out(struct perf_event *event, | ||
460 | struct perf_cpu_context *cpuctx, | ||
461 | struct perf_event_context *ctx) | ||
462 | { | ||
463 | int ret; | ||
464 | |||
465 | ret = __event_sched_out(event, cpuctx, ctx); | ||
466 | if (ret) | ||
467 | event->tstamp_stopped = ctx->time; | ||
447 | } | 468 | } |
448 | 469 | ||
449 | static void | 470 | static void |
@@ -466,6 +487,12 @@ group_sched_out(struct perf_event *group_event, | |||
466 | cpuctx->exclusive = 0; | 487 | cpuctx->exclusive = 0; |
467 | } | 488 | } |
468 | 489 | ||
490 | static inline struct perf_cpu_context * | ||
491 | __get_cpu_context(struct perf_event_context *ctx) | ||
492 | { | ||
493 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
494 | } | ||
495 | |||
469 | /* | 496 | /* |
470 | * Cross CPU call to remove a performance event | 497 | * Cross CPU call to remove a performance event |
471 | * | 498 | * |
@@ -474,9 +501,9 @@ group_sched_out(struct perf_event *group_event, | |||
474 | */ | 501 | */ |
475 | static void __perf_event_remove_from_context(void *info) | 502 | static void __perf_event_remove_from_context(void *info) |
476 | { | 503 | { |
477 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
478 | struct perf_event *event = info; | 504 | struct perf_event *event = info; |
479 | struct perf_event_context *ctx = event->ctx; | 505 | struct perf_event_context *ctx = event->ctx; |
506 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
480 | 507 | ||
481 | /* | 508 | /* |
482 | * If this is a task context, we need to check whether it is | 509 | * If this is a task context, we need to check whether it is |
@@ -487,27 +514,11 @@ static void __perf_event_remove_from_context(void *info) | |||
487 | return; | 514 | return; |
488 | 515 | ||
489 | raw_spin_lock(&ctx->lock); | 516 | raw_spin_lock(&ctx->lock); |
490 | /* | ||
491 | * Protect the list operation against NMI by disabling the | ||
492 | * events on a global level. | ||
493 | */ | ||
494 | perf_disable(); | ||
495 | 517 | ||
496 | event_sched_out(event, cpuctx, ctx); | 518 | event_sched_out(event, cpuctx, ctx); |
497 | 519 | ||
498 | list_del_event(event, ctx); | 520 | list_del_event(event, ctx); |
499 | 521 | ||
500 | if (!ctx->task) { | ||
501 | /* | ||
502 | * Allow more per task events with respect to the | ||
503 | * reservation: | ||
504 | */ | ||
505 | cpuctx->max_pertask = | ||
506 | min(perf_max_events - ctx->nr_events, | ||
507 | perf_max_events - perf_reserved_percpu); | ||
508 | } | ||
509 | |||
510 | perf_enable(); | ||
511 | raw_spin_unlock(&ctx->lock); | 522 | raw_spin_unlock(&ctx->lock); |
512 | } | 523 | } |
513 | 524 | ||
@@ -572,8 +583,8 @@ retry: | |||
572 | static void __perf_event_disable(void *info) | 583 | static void __perf_event_disable(void *info) |
573 | { | 584 | { |
574 | struct perf_event *event = info; | 585 | struct perf_event *event = info; |
575 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
576 | struct perf_event_context *ctx = event->ctx; | 586 | struct perf_event_context *ctx = event->ctx; |
587 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
577 | 588 | ||
578 | /* | 589 | /* |
579 | * If this is a per-task event, need to check whether this | 590 | * If this is a per-task event, need to check whether this |
@@ -628,7 +639,7 @@ void perf_event_disable(struct perf_event *event) | |||
628 | return; | 639 | return; |
629 | } | 640 | } |
630 | 641 | ||
631 | retry: | 642 | retry: |
632 | task_oncpu_function_call(task, __perf_event_disable, event); | 643 | task_oncpu_function_call(task, __perf_event_disable, event); |
633 | 644 | ||
634 | raw_spin_lock_irq(&ctx->lock); | 645 | raw_spin_lock_irq(&ctx->lock); |
@@ -653,7 +664,7 @@ void perf_event_disable(struct perf_event *event) | |||
653 | } | 664 | } |
654 | 665 | ||
655 | static int | 666 | static int |
656 | event_sched_in(struct perf_event *event, | 667 | __event_sched_in(struct perf_event *event, |
657 | struct perf_cpu_context *cpuctx, | 668 | struct perf_cpu_context *cpuctx, |
658 | struct perf_event_context *ctx) | 669 | struct perf_event_context *ctx) |
659 | { | 670 | { |
@@ -667,14 +678,12 @@ event_sched_in(struct perf_event *event, | |||
667 | */ | 678 | */ |
668 | smp_wmb(); | 679 | smp_wmb(); |
669 | 680 | ||
670 | if (event->pmu->enable(event)) { | 681 | if (event->pmu->add(event, PERF_EF_START)) { |
671 | event->state = PERF_EVENT_STATE_INACTIVE; | 682 | event->state = PERF_EVENT_STATE_INACTIVE; |
672 | event->oncpu = -1; | 683 | event->oncpu = -1; |
673 | return -EAGAIN; | 684 | return -EAGAIN; |
674 | } | 685 | } |
675 | 686 | ||
676 | event->tstamp_running += ctx->time - event->tstamp_stopped; | ||
677 | |||
678 | if (!is_software_event(event)) | 687 | if (!is_software_event(event)) |
679 | cpuctx->active_oncpu++; | 688 | cpuctx->active_oncpu++; |
680 | ctx->nr_active++; | 689 | ctx->nr_active++; |
@@ -685,28 +694,56 @@ event_sched_in(struct perf_event *event, | |||
685 | return 0; | 694 | return 0; |
686 | } | 695 | } |
687 | 696 | ||
697 | static inline int | ||
698 | event_sched_in(struct perf_event *event, | ||
699 | struct perf_cpu_context *cpuctx, | ||
700 | struct perf_event_context *ctx) | ||
701 | { | ||
702 | int ret = __event_sched_in(event, cpuctx, ctx); | ||
703 | if (ret) | ||
704 | return ret; | ||
705 | event->tstamp_running += ctx->time - event->tstamp_stopped; | ||
706 | return 0; | ||
707 | } | ||
708 | |||
709 | static void | ||
710 | group_commit_event_sched_in(struct perf_event *group_event, | ||
711 | struct perf_cpu_context *cpuctx, | ||
712 | struct perf_event_context *ctx) | ||
713 | { | ||
714 | struct perf_event *event; | ||
715 | u64 now = ctx->time; | ||
716 | |||
717 | group_event->tstamp_running += now - group_event->tstamp_stopped; | ||
718 | /* | ||
719 | * Schedule in siblings as one group (if any): | ||
720 | */ | ||
721 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | ||
722 | event->tstamp_running += now - event->tstamp_stopped; | ||
723 | } | ||
724 | } | ||
725 | |||
688 | static int | 726 | static int |
689 | group_sched_in(struct perf_event *group_event, | 727 | group_sched_in(struct perf_event *group_event, |
690 | struct perf_cpu_context *cpuctx, | 728 | struct perf_cpu_context *cpuctx, |
691 | struct perf_event_context *ctx) | 729 | struct perf_event_context *ctx) |
692 | { | 730 | { |
693 | struct perf_event *event, *partial_group = NULL; | 731 | struct perf_event *event, *partial_group = NULL; |
694 | const struct pmu *pmu = group_event->pmu; | 732 | struct pmu *pmu = group_event->pmu; |
695 | bool txn = false; | ||
696 | 733 | ||
697 | if (group_event->state == PERF_EVENT_STATE_OFF) | 734 | if (group_event->state == PERF_EVENT_STATE_OFF) |
698 | return 0; | 735 | return 0; |
699 | 736 | ||
700 | /* Check if group transaction availabe */ | 737 | pmu->start_txn(pmu); |
701 | if (pmu->start_txn) | ||
702 | txn = true; | ||
703 | 738 | ||
704 | if (txn) | 739 | /* |
705 | pmu->start_txn(pmu); | 740 | * use __event_sched_in() to delay updating tstamp_running |
706 | 741 | * until the transaction is committed. In case of failure | |
707 | if (event_sched_in(group_event, cpuctx, ctx)) { | 742 | * we will keep an unmodified tstamp_running which is a |
708 | if (txn) | 743 | * requirement to get correct timing information |
709 | pmu->cancel_txn(pmu); | 744 | */ |
745 | if (__event_sched_in(group_event, cpuctx, ctx)) { | ||
746 | pmu->cancel_txn(pmu); | ||
710 | return -EAGAIN; | 747 | return -EAGAIN; |
711 | } | 748 | } |
712 | 749 | ||
@@ -714,29 +751,33 @@ group_sched_in(struct perf_event *group_event, | |||
714 | * Schedule in siblings as one group (if any): | 751 | * Schedule in siblings as one group (if any): |
715 | */ | 752 | */ |
716 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 753 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
717 | if (event_sched_in(event, cpuctx, ctx)) { | 754 | if (__event_sched_in(event, cpuctx, ctx)) { |
718 | partial_group = event; | 755 | partial_group = event; |
719 | goto group_error; | 756 | goto group_error; |
720 | } | 757 | } |
721 | } | 758 | } |
722 | 759 | ||
723 | if (!txn || !pmu->commit_txn(pmu)) | 760 | if (!pmu->commit_txn(pmu)) { |
761 | /* commit tstamp_running */ | ||
762 | group_commit_event_sched_in(group_event, cpuctx, ctx); | ||
724 | return 0; | 763 | return 0; |
725 | 764 | } | |
726 | group_error: | 765 | group_error: |
727 | /* | 766 | /* |
728 | * Groups can be scheduled in as one unit only, so undo any | 767 | * Groups can be scheduled in as one unit only, so undo any |
729 | * partial group before returning: | 768 | * partial group before returning: |
769 | * | ||
770 | * use __event_sched_out() to avoid updating tstamp_stopped | ||
771 | * because the event never actually ran | ||
730 | */ | 772 | */ |
731 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 773 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
732 | if (event == partial_group) | 774 | if (event == partial_group) |
733 | break; | 775 | break; |
734 | event_sched_out(event, cpuctx, ctx); | 776 | __event_sched_out(event, cpuctx, ctx); |
735 | } | 777 | } |
736 | event_sched_out(group_event, cpuctx, ctx); | 778 | __event_sched_out(group_event, cpuctx, ctx); |
737 | 779 | ||
738 | if (txn) | 780 | pmu->cancel_txn(pmu); |
739 | pmu->cancel_txn(pmu); | ||
740 | 781 | ||
741 | return -EAGAIN; | 782 | return -EAGAIN; |
742 | } | 783 | } |
@@ -789,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event, | |||
789 | */ | 830 | */ |
790 | static void __perf_install_in_context(void *info) | 831 | static void __perf_install_in_context(void *info) |
791 | { | 832 | { |
792 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
793 | struct perf_event *event = info; | 833 | struct perf_event *event = info; |
794 | struct perf_event_context *ctx = event->ctx; | 834 | struct perf_event_context *ctx = event->ctx; |
795 | struct perf_event *leader = event->group_leader; | 835 | struct perf_event *leader = event->group_leader; |
836 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
796 | int err; | 837 | int err; |
797 | 838 | ||
798 | /* | 839 | /* |
@@ -812,12 +853,6 @@ static void __perf_install_in_context(void *info) | |||
812 | ctx->is_active = 1; | 853 | ctx->is_active = 1; |
813 | update_context_time(ctx); | 854 | update_context_time(ctx); |
814 | 855 | ||
815 | /* | ||
816 | * Protect the list operation against NMI by disabling the | ||
817 | * events on a global level. NOP for non NMI based events. | ||
818 | */ | ||
819 | perf_disable(); | ||
820 | |||
821 | add_event_to_ctx(event, ctx); | 856 | add_event_to_ctx(event, ctx); |
822 | 857 | ||
823 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 858 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
@@ -855,12 +890,7 @@ static void __perf_install_in_context(void *info) | |||
855 | } | 890 | } |
856 | } | 891 | } |
857 | 892 | ||
858 | if (!err && !ctx->task && cpuctx->max_pertask) | 893 | unlock: |
859 | cpuctx->max_pertask--; | ||
860 | |||
861 | unlock: | ||
862 | perf_enable(); | ||
863 | |||
864 | raw_spin_unlock(&ctx->lock); | 894 | raw_spin_unlock(&ctx->lock); |
865 | } | 895 | } |
866 | 896 | ||
@@ -883,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
883 | { | 913 | { |
884 | struct task_struct *task = ctx->task; | 914 | struct task_struct *task = ctx->task; |
885 | 915 | ||
916 | event->ctx = ctx; | ||
917 | |||
886 | if (!task) { | 918 | if (!task) { |
887 | /* | 919 | /* |
888 | * Per cpu events are installed via an smp call and | 920 | * Per cpu events are installed via an smp call and |
@@ -931,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
931 | 963 | ||
932 | event->state = PERF_EVENT_STATE_INACTIVE; | 964 | event->state = PERF_EVENT_STATE_INACTIVE; |
933 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 965 | event->tstamp_enabled = ctx->time - event->total_time_enabled; |
934 | list_for_each_entry(sub, &event->sibling_list, group_entry) | 966 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
935 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) | 967 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { |
936 | sub->tstamp_enabled = | 968 | sub->tstamp_enabled = |
937 | ctx->time - sub->total_time_enabled; | 969 | ctx->time - sub->total_time_enabled; |
970 | } | ||
971 | } | ||
938 | } | 972 | } |
939 | 973 | ||
940 | /* | 974 | /* |
@@ -943,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
943 | static void __perf_event_enable(void *info) | 977 | static void __perf_event_enable(void *info) |
944 | { | 978 | { |
945 | struct perf_event *event = info; | 979 | struct perf_event *event = info; |
946 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
947 | struct perf_event_context *ctx = event->ctx; | 980 | struct perf_event_context *ctx = event->ctx; |
948 | struct perf_event *leader = event->group_leader; | 981 | struct perf_event *leader = event->group_leader; |
982 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
949 | int err; | 983 | int err; |
950 | 984 | ||
951 | /* | 985 | /* |
@@ -979,12 +1013,10 @@ static void __perf_event_enable(void *info) | |||
979 | if (!group_can_go_on(event, cpuctx, 1)) { | 1013 | if (!group_can_go_on(event, cpuctx, 1)) { |
980 | err = -EEXIST; | 1014 | err = -EEXIST; |
981 | } else { | 1015 | } else { |
982 | perf_disable(); | ||
983 | if (event == leader) | 1016 | if (event == leader) |
984 | err = group_sched_in(event, cpuctx, ctx); | 1017 | err = group_sched_in(event, cpuctx, ctx); |
985 | else | 1018 | else |
986 | err = event_sched_in(event, cpuctx, ctx); | 1019 | err = event_sched_in(event, cpuctx, ctx); |
987 | perf_enable(); | ||
988 | } | 1020 | } |
989 | 1021 | ||
990 | if (err) { | 1022 | if (err) { |
@@ -1000,7 +1032,7 @@ static void __perf_event_enable(void *info) | |||
1000 | } | 1032 | } |
1001 | } | 1033 | } |
1002 | 1034 | ||
1003 | unlock: | 1035 | unlock: |
1004 | raw_spin_unlock(&ctx->lock); | 1036 | raw_spin_unlock(&ctx->lock); |
1005 | } | 1037 | } |
1006 | 1038 | ||
@@ -1041,7 +1073,7 @@ void perf_event_enable(struct perf_event *event) | |||
1041 | if (event->state == PERF_EVENT_STATE_ERROR) | 1073 | if (event->state == PERF_EVENT_STATE_ERROR) |
1042 | event->state = PERF_EVENT_STATE_OFF; | 1074 | event->state = PERF_EVENT_STATE_OFF; |
1043 | 1075 | ||
1044 | retry: | 1076 | retry: |
1045 | raw_spin_unlock_irq(&ctx->lock); | 1077 | raw_spin_unlock_irq(&ctx->lock); |
1046 | task_oncpu_function_call(task, __perf_event_enable, event); | 1078 | task_oncpu_function_call(task, __perf_event_enable, event); |
1047 | 1079 | ||
@@ -1061,7 +1093,7 @@ void perf_event_enable(struct perf_event *event) | |||
1061 | if (event->state == PERF_EVENT_STATE_OFF) | 1093 | if (event->state == PERF_EVENT_STATE_OFF) |
1062 | __perf_event_mark_enabled(event, ctx); | 1094 | __perf_event_mark_enabled(event, ctx); |
1063 | 1095 | ||
1064 | out: | 1096 | out: |
1065 | raw_spin_unlock_irq(&ctx->lock); | 1097 | raw_spin_unlock_irq(&ctx->lock); |
1066 | } | 1098 | } |
1067 | 1099 | ||
@@ -1092,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1092 | struct perf_event *event; | 1124 | struct perf_event *event; |
1093 | 1125 | ||
1094 | raw_spin_lock(&ctx->lock); | 1126 | raw_spin_lock(&ctx->lock); |
1127 | perf_pmu_disable(ctx->pmu); | ||
1095 | ctx->is_active = 0; | 1128 | ctx->is_active = 0; |
1096 | if (likely(!ctx->nr_events)) | 1129 | if (likely(!ctx->nr_events)) |
1097 | goto out; | 1130 | goto out; |
1098 | update_context_time(ctx); | 1131 | update_context_time(ctx); |
1099 | 1132 | ||
1100 | perf_disable(); | ||
1101 | if (!ctx->nr_active) | 1133 | if (!ctx->nr_active) |
1102 | goto out_enable; | 1134 | goto out; |
1103 | 1135 | ||
1104 | if (event_type & EVENT_PINNED) | 1136 | if (event_type & EVENT_PINNED) { |
1105 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1137 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
1106 | group_sched_out(event, cpuctx, ctx); | 1138 | group_sched_out(event, cpuctx, ctx); |
1139 | } | ||
1107 | 1140 | ||
1108 | if (event_type & EVENT_FLEXIBLE) | 1141 | if (event_type & EVENT_FLEXIBLE) { |
1109 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1142 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
1110 | group_sched_out(event, cpuctx, ctx); | 1143 | group_sched_out(event, cpuctx, ctx); |
1111 | 1144 | } | |
1112 | out_enable: | 1145 | out: |
1113 | perf_enable(); | 1146 | perf_pmu_enable(ctx->pmu); |
1114 | out: | ||
1115 | raw_spin_unlock(&ctx->lock); | 1147 | raw_spin_unlock(&ctx->lock); |
1116 | } | 1148 | } |
1117 | 1149 | ||
@@ -1209,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1209 | } | 1241 | } |
1210 | } | 1242 | } |
1211 | 1243 | ||
1212 | /* | 1244 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1213 | * Called from scheduler to remove the events of the current task, | 1245 | struct task_struct *next) |
1214 | * with interrupts disabled. | ||
1215 | * | ||
1216 | * We stop each event and update the event value in event->count. | ||
1217 | * | ||
1218 | * This does not protect us against NMI, but disable() | ||
1219 | * sets the disabled bit in the control field of event _before_ | ||
1220 | * accessing the event control register. If a NMI hits, then it will | ||
1221 | * not restart the event. | ||
1222 | */ | ||
1223 | void perf_event_task_sched_out(struct task_struct *task, | ||
1224 | struct task_struct *next) | ||
1225 | { | 1246 | { |
1226 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1247 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1227 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1228 | struct perf_event_context *next_ctx; | 1248 | struct perf_event_context *next_ctx; |
1229 | struct perf_event_context *parent; | 1249 | struct perf_event_context *parent; |
1250 | struct perf_cpu_context *cpuctx; | ||
1230 | int do_switch = 1; | 1251 | int do_switch = 1; |
1231 | 1252 | ||
1232 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | 1253 | if (likely(!ctx)) |
1254 | return; | ||
1233 | 1255 | ||
1234 | if (likely(!ctx || !cpuctx->task_ctx)) | 1256 | cpuctx = __get_cpu_context(ctx); |
1257 | if (!cpuctx->task_ctx) | ||
1235 | return; | 1258 | return; |
1236 | 1259 | ||
1237 | rcu_read_lock(); | 1260 | rcu_read_lock(); |
1238 | parent = rcu_dereference(ctx->parent_ctx); | 1261 | parent = rcu_dereference(ctx->parent_ctx); |
1239 | next_ctx = next->perf_event_ctxp; | 1262 | next_ctx = next->perf_event_ctxp[ctxn]; |
1240 | if (parent && next_ctx && | 1263 | if (parent && next_ctx && |
1241 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 1264 | rcu_dereference(next_ctx->parent_ctx) == parent) { |
1242 | /* | 1265 | /* |
@@ -1255,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1255 | * XXX do we need a memory barrier of sorts | 1278 | * XXX do we need a memory barrier of sorts |
1256 | * wrt to rcu_dereference() of perf_event_ctxp | 1279 | * wrt to rcu_dereference() of perf_event_ctxp |
1257 | */ | 1280 | */ |
1258 | task->perf_event_ctxp = next_ctx; | 1281 | task->perf_event_ctxp[ctxn] = next_ctx; |
1259 | next->perf_event_ctxp = ctx; | 1282 | next->perf_event_ctxp[ctxn] = ctx; |
1260 | ctx->task = next; | 1283 | ctx->task = next; |
1261 | next_ctx->task = task; | 1284 | next_ctx->task = task; |
1262 | do_switch = 0; | 1285 | do_switch = 0; |
@@ -1274,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1274 | } | 1297 | } |
1275 | } | 1298 | } |
1276 | 1299 | ||
1300 | #define for_each_task_context_nr(ctxn) \ | ||
1301 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | ||
1302 | |||
1303 | /* | ||
1304 | * Called from scheduler to remove the events of the current task, | ||
1305 | * with interrupts disabled. | ||
1306 | * | ||
1307 | * We stop each event and update the event value in event->count. | ||
1308 | * | ||
1309 | * This does not protect us against NMI, but disable() | ||
1310 | * sets the disabled bit in the control field of event _before_ | ||
1311 | * accessing the event control register. If a NMI hits, then it will | ||
1312 | * not restart the event. | ||
1313 | */ | ||
1314 | void __perf_event_task_sched_out(struct task_struct *task, | ||
1315 | struct task_struct *next) | ||
1316 | { | ||
1317 | int ctxn; | ||
1318 | |||
1319 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | ||
1320 | |||
1321 | for_each_task_context_nr(ctxn) | ||
1322 | perf_event_context_sched_out(task, ctxn, next); | ||
1323 | } | ||
1324 | |||
1277 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1325 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
1278 | enum event_type_t event_type) | 1326 | enum event_type_t event_type) |
1279 | { | 1327 | { |
1280 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1328 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1281 | 1329 | ||
1282 | if (!cpuctx->task_ctx) | 1330 | if (!cpuctx->task_ctx) |
1283 | return; | 1331 | return; |
@@ -1292,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
1292 | /* | 1340 | /* |
1293 | * Called with IRQs disabled | 1341 | * Called with IRQs disabled |
1294 | */ | 1342 | */ |
1295 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) | ||
1296 | { | ||
1297 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
1298 | } | ||
1299 | |||
1300 | /* | ||
1301 | * Called with IRQs disabled | ||
1302 | */ | ||
1303 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 1343 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
1304 | enum event_type_t event_type) | 1344 | enum event_type_t event_type) |
1305 | { | 1345 | { |
@@ -1350,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1350 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1390 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1351 | continue; | 1391 | continue; |
1352 | 1392 | ||
1353 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 1393 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1354 | if (group_sched_in(event, cpuctx, ctx)) | 1394 | if (group_sched_in(event, cpuctx, ctx)) |
1355 | can_add_hw = 0; | 1395 | can_add_hw = 0; |
1396 | } | ||
1356 | } | 1397 | } |
1357 | } | 1398 | } |
1358 | 1399 | ||
@@ -1368,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1368 | 1409 | ||
1369 | ctx->timestamp = perf_clock(); | 1410 | ctx->timestamp = perf_clock(); |
1370 | 1411 | ||
1371 | perf_disable(); | ||
1372 | |||
1373 | /* | 1412 | /* |
1374 | * First go through the list and put on any pinned groups | 1413 | * First go through the list and put on any pinned groups |
1375 | * in order to give them the best chance of going on. | 1414 | * in order to give them the best chance of going on. |
@@ -1381,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1381 | if (event_type & EVENT_FLEXIBLE) | 1420 | if (event_type & EVENT_FLEXIBLE) |
1382 | ctx_flexible_sched_in(ctx, cpuctx); | 1421 | ctx_flexible_sched_in(ctx, cpuctx); |
1383 | 1422 | ||
1384 | perf_enable(); | 1423 | out: |
1385 | out: | ||
1386 | raw_spin_unlock(&ctx->lock); | 1424 | raw_spin_unlock(&ctx->lock); |
1387 | } | 1425 | } |
1388 | 1426 | ||
@@ -1394,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
1394 | ctx_sched_in(ctx, cpuctx, event_type); | 1432 | ctx_sched_in(ctx, cpuctx, event_type); |
1395 | } | 1433 | } |
1396 | 1434 | ||
1397 | static void task_ctx_sched_in(struct task_struct *task, | 1435 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
1398 | enum event_type_t event_type) | 1436 | enum event_type_t event_type) |
1399 | { | 1437 | { |
1400 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1438 | struct perf_cpu_context *cpuctx; |
1401 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1402 | 1439 | ||
1403 | if (likely(!ctx)) | 1440 | cpuctx = __get_cpu_context(ctx); |
1404 | return; | ||
1405 | if (cpuctx->task_ctx == ctx) | 1441 | if (cpuctx->task_ctx == ctx) |
1406 | return; | 1442 | return; |
1443 | |||
1407 | ctx_sched_in(ctx, cpuctx, event_type); | 1444 | ctx_sched_in(ctx, cpuctx, event_type); |
1408 | cpuctx->task_ctx = ctx; | 1445 | cpuctx->task_ctx = ctx; |
1409 | } | 1446 | } |
1410 | /* | ||
1411 | * Called from scheduler to add the events of the current task | ||
1412 | * with interrupts disabled. | ||
1413 | * | ||
1414 | * We restore the event value and then enable it. | ||
1415 | * | ||
1416 | * This does not protect us against NMI, but enable() | ||
1417 | * sets the enabled bit in the control field of event _before_ | ||
1418 | * accessing the event control register. If a NMI hits, then it will | ||
1419 | * keep the event running. | ||
1420 | */ | ||
1421 | void perf_event_task_sched_in(struct task_struct *task) | ||
1422 | { | ||
1423 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1424 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1425 | 1447 | ||
1426 | if (likely(!ctx)) | 1448 | void perf_event_context_sched_in(struct perf_event_context *ctx) |
1427 | return; | 1449 | { |
1450 | struct perf_cpu_context *cpuctx; | ||
1428 | 1451 | ||
1452 | cpuctx = __get_cpu_context(ctx); | ||
1429 | if (cpuctx->task_ctx == ctx) | 1453 | if (cpuctx->task_ctx == ctx) |
1430 | return; | 1454 | return; |
1431 | 1455 | ||
1432 | perf_disable(); | 1456 | perf_pmu_disable(ctx->pmu); |
1433 | |||
1434 | /* | 1457 | /* |
1435 | * We want to keep the following priority order: | 1458 | * We want to keep the following priority order: |
1436 | * cpu pinned (that don't need to move), task pinned, | 1459 | * cpu pinned (that don't need to move), task pinned, |
@@ -1444,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task) | |||
1444 | 1467 | ||
1445 | cpuctx->task_ctx = ctx; | 1468 | cpuctx->task_ctx = ctx; |
1446 | 1469 | ||
1447 | perf_enable(); | 1470 | /* |
1471 | * Since these rotations are per-cpu, we need to ensure the | ||
1472 | * cpu-context we got scheduled on is actually rotating. | ||
1473 | */ | ||
1474 | perf_pmu_rotate_start(ctx->pmu); | ||
1475 | perf_pmu_enable(ctx->pmu); | ||
1476 | } | ||
1477 | |||
1478 | /* | ||
1479 | * Called from scheduler to add the events of the current task | ||
1480 | * with interrupts disabled. | ||
1481 | * | ||
1482 | * We restore the event value and then enable it. | ||
1483 | * | ||
1484 | * This does not protect us against NMI, but enable() | ||
1485 | * sets the enabled bit in the control field of event _before_ | ||
1486 | * accessing the event control register. If a NMI hits, then it will | ||
1487 | * keep the event running. | ||
1488 | */ | ||
1489 | void __perf_event_task_sched_in(struct task_struct *task) | ||
1490 | { | ||
1491 | struct perf_event_context *ctx; | ||
1492 | int ctxn; | ||
1493 | |||
1494 | for_each_task_context_nr(ctxn) { | ||
1495 | ctx = task->perf_event_ctxp[ctxn]; | ||
1496 | if (likely(!ctx)) | ||
1497 | continue; | ||
1498 | |||
1499 | perf_event_context_sched_in(ctx); | ||
1500 | } | ||
1448 | } | 1501 | } |
1449 | 1502 | ||
1450 | #define MAX_INTERRUPTS (~0ULL) | 1503 | #define MAX_INTERRUPTS (~0ULL) |
@@ -1524,22 +1577,6 @@ do { \ | |||
1524 | return div64_u64(dividend, divisor); | 1577 | return div64_u64(dividend, divisor); |
1525 | } | 1578 | } |
1526 | 1579 | ||
1527 | static void perf_event_stop(struct perf_event *event) | ||
1528 | { | ||
1529 | if (!event->pmu->stop) | ||
1530 | return event->pmu->disable(event); | ||
1531 | |||
1532 | return event->pmu->stop(event); | ||
1533 | } | ||
1534 | |||
1535 | static int perf_event_start(struct perf_event *event) | ||
1536 | { | ||
1537 | if (!event->pmu->start) | ||
1538 | return event->pmu->enable(event); | ||
1539 | |||
1540 | return event->pmu->start(event); | ||
1541 | } | ||
1542 | |||
1543 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 1580 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
1544 | { | 1581 | { |
1545 | struct hw_perf_event *hwc = &event->hw; | 1582 | struct hw_perf_event *hwc = &event->hw; |
@@ -1559,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
1559 | hwc->sample_period = sample_period; | 1596 | hwc->sample_period = sample_period; |
1560 | 1597 | ||
1561 | if (local64_read(&hwc->period_left) > 8*sample_period) { | 1598 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
1562 | perf_disable(); | 1599 | event->pmu->stop(event, PERF_EF_UPDATE); |
1563 | perf_event_stop(event); | ||
1564 | local64_set(&hwc->period_left, 0); | 1600 | local64_set(&hwc->period_left, 0); |
1565 | perf_event_start(event); | 1601 | event->pmu->start(event, PERF_EF_RELOAD); |
1566 | perf_enable(); | ||
1567 | } | 1602 | } |
1568 | } | 1603 | } |
1569 | 1604 | ||
1570 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | 1605 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) |
1571 | { | 1606 | { |
1572 | struct perf_event *event; | 1607 | struct perf_event *event; |
1573 | struct hw_perf_event *hwc; | 1608 | struct hw_perf_event *hwc; |
@@ -1592,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1592 | */ | 1627 | */ |
1593 | if (interrupts == MAX_INTERRUPTS) { | 1628 | if (interrupts == MAX_INTERRUPTS) { |
1594 | perf_log_throttle(event, 1); | 1629 | perf_log_throttle(event, 1); |
1595 | perf_disable(); | 1630 | event->pmu->start(event, 0); |
1596 | event->pmu->unthrottle(event); | ||
1597 | perf_enable(); | ||
1598 | } | 1631 | } |
1599 | 1632 | ||
1600 | if (!event->attr.freq || !event->attr.sample_freq) | 1633 | if (!event->attr.freq || !event->attr.sample_freq) |
1601 | continue; | 1634 | continue; |
1602 | 1635 | ||
1603 | perf_disable(); | ||
1604 | event->pmu->read(event); | 1636 | event->pmu->read(event); |
1605 | now = local64_read(&event->count); | 1637 | now = local64_read(&event->count); |
1606 | delta = now - hwc->freq_count_stamp; | 1638 | delta = now - hwc->freq_count_stamp; |
1607 | hwc->freq_count_stamp = now; | 1639 | hwc->freq_count_stamp = now; |
1608 | 1640 | ||
1609 | if (delta > 0) | 1641 | if (delta > 0) |
1610 | perf_adjust_period(event, TICK_NSEC, delta); | 1642 | perf_adjust_period(event, period, delta); |
1611 | perf_enable(); | ||
1612 | } | 1643 | } |
1613 | raw_spin_unlock(&ctx->lock); | 1644 | raw_spin_unlock(&ctx->lock); |
1614 | } | 1645 | } |
@@ -1626,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
1626 | raw_spin_unlock(&ctx->lock); | 1657 | raw_spin_unlock(&ctx->lock); |
1627 | } | 1658 | } |
1628 | 1659 | ||
1629 | void perf_event_task_tick(struct task_struct *curr) | 1660 | /* |
1661 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
1662 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
1663 | * disabled, while rotate_context is called from IRQ context. | ||
1664 | */ | ||
1665 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
1630 | { | 1666 | { |
1631 | struct perf_cpu_context *cpuctx; | 1667 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
1632 | struct perf_event_context *ctx; | 1668 | struct perf_event_context *ctx = NULL; |
1633 | int rotate = 0; | 1669 | int rotate = 0, remove = 1; |
1634 | |||
1635 | if (!atomic_read(&nr_events)) | ||
1636 | return; | ||
1637 | 1670 | ||
1638 | cpuctx = &__get_cpu_var(perf_cpu_context); | 1671 | if (cpuctx->ctx.nr_events) { |
1639 | if (cpuctx->ctx.nr_events && | 1672 | remove = 0; |
1640 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 1673 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
1641 | rotate = 1; | 1674 | rotate = 1; |
1675 | } | ||
1642 | 1676 | ||
1643 | ctx = curr->perf_event_ctxp; | 1677 | ctx = cpuctx->task_ctx; |
1644 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) | 1678 | if (ctx && ctx->nr_events) { |
1645 | rotate = 1; | 1679 | remove = 0; |
1680 | if (ctx->nr_events != ctx->nr_active) | ||
1681 | rotate = 1; | ||
1682 | } | ||
1646 | 1683 | ||
1647 | perf_ctx_adjust_freq(&cpuctx->ctx); | 1684 | perf_pmu_disable(cpuctx->ctx.pmu); |
1685 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
1648 | if (ctx) | 1686 | if (ctx) |
1649 | perf_ctx_adjust_freq(ctx); | 1687 | perf_ctx_adjust_freq(ctx, interval); |
1650 | 1688 | ||
1651 | if (!rotate) | 1689 | if (!rotate) |
1652 | return; | 1690 | goto done; |
1653 | 1691 | ||
1654 | perf_disable(); | ||
1655 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 1692 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1656 | if (ctx) | 1693 | if (ctx) |
1657 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 1694 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
@@ -1662,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr) | |||
1662 | 1699 | ||
1663 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 1700 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
1664 | if (ctx) | 1701 | if (ctx) |
1665 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); | 1702 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1666 | perf_enable(); | 1703 | |
1704 | done: | ||
1705 | if (remove) | ||
1706 | list_del_init(&cpuctx->rotation_list); | ||
1707 | |||
1708 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
1709 | } | ||
1710 | |||
1711 | void perf_event_task_tick(void) | ||
1712 | { | ||
1713 | struct list_head *head = &__get_cpu_var(rotation_list); | ||
1714 | struct perf_cpu_context *cpuctx, *tmp; | ||
1715 | |||
1716 | WARN_ON(!irqs_disabled()); | ||
1717 | |||
1718 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | ||
1719 | if (cpuctx->jiffies_interval == 1 || | ||
1720 | !(jiffies % cpuctx->jiffies_interval)) | ||
1721 | perf_rotate_context(cpuctx); | ||
1722 | } | ||
1667 | } | 1723 | } |
1668 | 1724 | ||
1669 | static int event_enable_on_exec(struct perf_event *event, | 1725 | static int event_enable_on_exec(struct perf_event *event, |
@@ -1685,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event, | |||
1685 | * Enable all of a task's events that have been marked enable-on-exec. | 1741 | * Enable all of a task's events that have been marked enable-on-exec. |
1686 | * This expects task == current. | 1742 | * This expects task == current. |
1687 | */ | 1743 | */ |
1688 | static void perf_event_enable_on_exec(struct task_struct *task) | 1744 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
1689 | { | 1745 | { |
1690 | struct perf_event_context *ctx; | ||
1691 | struct perf_event *event; | 1746 | struct perf_event *event; |
1692 | unsigned long flags; | 1747 | unsigned long flags; |
1693 | int enabled = 0; | 1748 | int enabled = 0; |
1694 | int ret; | 1749 | int ret; |
1695 | 1750 | ||
1696 | local_irq_save(flags); | 1751 | local_irq_save(flags); |
1697 | ctx = task->perf_event_ctxp; | ||
1698 | if (!ctx || !ctx->nr_events) | 1752 | if (!ctx || !ctx->nr_events) |
1699 | goto out; | 1753 | goto out; |
1700 | 1754 | ||
1701 | __perf_event_task_sched_out(ctx); | 1755 | task_ctx_sched_out(ctx, EVENT_ALL); |
1702 | 1756 | ||
1703 | raw_spin_lock(&ctx->lock); | 1757 | raw_spin_lock(&ctx->lock); |
1704 | 1758 | ||
@@ -1722,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1722 | 1776 | ||
1723 | raw_spin_unlock(&ctx->lock); | 1777 | raw_spin_unlock(&ctx->lock); |
1724 | 1778 | ||
1725 | perf_event_task_sched_in(task); | 1779 | perf_event_context_sched_in(ctx); |
1726 | out: | 1780 | out: |
1727 | local_irq_restore(flags); | 1781 | local_irq_restore(flags); |
1728 | } | 1782 | } |
1729 | 1783 | ||
@@ -1732,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1732 | */ | 1786 | */ |
1733 | static void __perf_event_read(void *info) | 1787 | static void __perf_event_read(void *info) |
1734 | { | 1788 | { |
1735 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1736 | struct perf_event *event = info; | 1789 | struct perf_event *event = info; |
1737 | struct perf_event_context *ctx = event->ctx; | 1790 | struct perf_event_context *ctx = event->ctx; |
1791 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
1738 | 1792 | ||
1739 | /* | 1793 | /* |
1740 | * If this is a task context, we need to check whether it is | 1794 | * If this is a task context, we need to check whether it is |
@@ -1773,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event) | |||
1773 | unsigned long flags; | 1827 | unsigned long flags; |
1774 | 1828 | ||
1775 | raw_spin_lock_irqsave(&ctx->lock, flags); | 1829 | raw_spin_lock_irqsave(&ctx->lock, flags); |
1776 | update_context_time(ctx); | 1830 | /* |
1831 | * may read while context is not active | ||
1832 | * (e.g., thread is blocked), in that case | ||
1833 | * we cannot update context time | ||
1834 | */ | ||
1835 | if (ctx->is_active) | ||
1836 | update_context_time(ctx); | ||
1777 | update_event_times(event); | 1837 | update_event_times(event); |
1778 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1838 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1779 | } | 1839 | } |
@@ -1782,11 +1842,219 @@ static u64 perf_event_read(struct perf_event *event) | |||
1782 | } | 1842 | } |
1783 | 1843 | ||
1784 | /* | 1844 | /* |
1785 | * Initialize the perf_event context in a task_struct: | 1845 | * Callchain support |
1786 | */ | 1846 | */ |
1847 | |||
1848 | struct callchain_cpus_entries { | ||
1849 | struct rcu_head rcu_head; | ||
1850 | struct perf_callchain_entry *cpu_entries[0]; | ||
1851 | }; | ||
1852 | |||
1853 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
1854 | static atomic_t nr_callchain_events; | ||
1855 | static DEFINE_MUTEX(callchain_mutex); | ||
1856 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
1857 | |||
1858 | |||
1859 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
1860 | struct pt_regs *regs) | ||
1861 | { | ||
1862 | } | ||
1863 | |||
1864 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
1865 | struct pt_regs *regs) | ||
1866 | { | ||
1867 | } | ||
1868 | |||
1869 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
1870 | { | ||
1871 | struct callchain_cpus_entries *entries; | ||
1872 | int cpu; | ||
1873 | |||
1874 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
1875 | |||
1876 | for_each_possible_cpu(cpu) | ||
1877 | kfree(entries->cpu_entries[cpu]); | ||
1878 | |||
1879 | kfree(entries); | ||
1880 | } | ||
1881 | |||
1882 | static void release_callchain_buffers(void) | ||
1883 | { | ||
1884 | struct callchain_cpus_entries *entries; | ||
1885 | |||
1886 | entries = callchain_cpus_entries; | ||
1887 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
1888 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
1889 | } | ||
1890 | |||
1891 | static int alloc_callchain_buffers(void) | ||
1892 | { | ||
1893 | int cpu; | ||
1894 | int size; | ||
1895 | struct callchain_cpus_entries *entries; | ||
1896 | |||
1897 | /* | ||
1898 | * We can't use the percpu allocation API for data that can be | ||
1899 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
1900 | * until that gets sorted out. | ||
1901 | */ | ||
1902 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | ||
1903 | num_possible_cpus(); | ||
1904 | |||
1905 | entries = kzalloc(size, GFP_KERNEL); | ||
1906 | if (!entries) | ||
1907 | return -ENOMEM; | ||
1908 | |||
1909 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
1910 | |||
1911 | for_each_possible_cpu(cpu) { | ||
1912 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
1913 | cpu_to_node(cpu)); | ||
1914 | if (!entries->cpu_entries[cpu]) | ||
1915 | goto fail; | ||
1916 | } | ||
1917 | |||
1918 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
1919 | |||
1920 | return 0; | ||
1921 | |||
1922 | fail: | ||
1923 | for_each_possible_cpu(cpu) | ||
1924 | kfree(entries->cpu_entries[cpu]); | ||
1925 | kfree(entries); | ||
1926 | |||
1927 | return -ENOMEM; | ||
1928 | } | ||
1929 | |||
1930 | static int get_callchain_buffers(void) | ||
1931 | { | ||
1932 | int err = 0; | ||
1933 | int count; | ||
1934 | |||
1935 | mutex_lock(&callchain_mutex); | ||
1936 | |||
1937 | count = atomic_inc_return(&nr_callchain_events); | ||
1938 | if (WARN_ON_ONCE(count < 1)) { | ||
1939 | err = -EINVAL; | ||
1940 | goto exit; | ||
1941 | } | ||
1942 | |||
1943 | if (count > 1) { | ||
1944 | /* If the allocation failed, give up */ | ||
1945 | if (!callchain_cpus_entries) | ||
1946 | err = -ENOMEM; | ||
1947 | goto exit; | ||
1948 | } | ||
1949 | |||
1950 | err = alloc_callchain_buffers(); | ||
1951 | if (err) | ||
1952 | release_callchain_buffers(); | ||
1953 | exit: | ||
1954 | mutex_unlock(&callchain_mutex); | ||
1955 | |||
1956 | return err; | ||
1957 | } | ||
1958 | |||
1959 | static void put_callchain_buffers(void) | ||
1960 | { | ||
1961 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
1962 | release_callchain_buffers(); | ||
1963 | mutex_unlock(&callchain_mutex); | ||
1964 | } | ||
1965 | } | ||
1966 | |||
1967 | static int get_recursion_context(int *recursion) | ||
1968 | { | ||
1969 | int rctx; | ||
1970 | |||
1971 | if (in_nmi()) | ||
1972 | rctx = 3; | ||
1973 | else if (in_irq()) | ||
1974 | rctx = 2; | ||
1975 | else if (in_softirq()) | ||
1976 | rctx = 1; | ||
1977 | else | ||
1978 | rctx = 0; | ||
1979 | |||
1980 | if (recursion[rctx]) | ||
1981 | return -1; | ||
1982 | |||
1983 | recursion[rctx]++; | ||
1984 | barrier(); | ||
1985 | |||
1986 | return rctx; | ||
1987 | } | ||
1988 | |||
1989 | static inline void put_recursion_context(int *recursion, int rctx) | ||
1990 | { | ||
1991 | barrier(); | ||
1992 | recursion[rctx]--; | ||
1993 | } | ||
1994 | |||
1995 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
1996 | { | ||
1997 | int cpu; | ||
1998 | struct callchain_cpus_entries *entries; | ||
1999 | |||
2000 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
2001 | if (*rctx == -1) | ||
2002 | return NULL; | ||
2003 | |||
2004 | entries = rcu_dereference(callchain_cpus_entries); | ||
2005 | if (!entries) | ||
2006 | return NULL; | ||
2007 | |||
2008 | cpu = smp_processor_id(); | ||
2009 | |||
2010 | return &entries->cpu_entries[cpu][*rctx]; | ||
2011 | } | ||
2012 | |||
1787 | static void | 2013 | static void |
1788 | __perf_event_init_context(struct perf_event_context *ctx, | 2014 | put_callchain_entry(int rctx) |
1789 | struct task_struct *task) | 2015 | { |
2016 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
2017 | } | ||
2018 | |||
2019 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2020 | { | ||
2021 | int rctx; | ||
2022 | struct perf_callchain_entry *entry; | ||
2023 | |||
2024 | |||
2025 | entry = get_callchain_entry(&rctx); | ||
2026 | if (rctx == -1) | ||
2027 | return NULL; | ||
2028 | |||
2029 | if (!entry) | ||
2030 | goto exit_put; | ||
2031 | |||
2032 | entry->nr = 0; | ||
2033 | |||
2034 | if (!user_mode(regs)) { | ||
2035 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2036 | perf_callchain_kernel(entry, regs); | ||
2037 | if (current->mm) | ||
2038 | regs = task_pt_regs(current); | ||
2039 | else | ||
2040 | regs = NULL; | ||
2041 | } | ||
2042 | |||
2043 | if (regs) { | ||
2044 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2045 | perf_callchain_user(entry, regs); | ||
2046 | } | ||
2047 | |||
2048 | exit_put: | ||
2049 | put_callchain_entry(rctx); | ||
2050 | |||
2051 | return entry; | ||
2052 | } | ||
2053 | |||
2054 | /* | ||
2055 | * Initialize the perf_event context in a task_struct: | ||
2056 | */ | ||
2057 | static void __perf_event_init_context(struct perf_event_context *ctx) | ||
1790 | { | 2058 | { |
1791 | raw_spin_lock_init(&ctx->lock); | 2059 | raw_spin_lock_init(&ctx->lock); |
1792 | mutex_init(&ctx->mutex); | 2060 | mutex_init(&ctx->mutex); |
@@ -1794,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx, | |||
1794 | INIT_LIST_HEAD(&ctx->flexible_groups); | 2062 | INIT_LIST_HEAD(&ctx->flexible_groups); |
1795 | INIT_LIST_HEAD(&ctx->event_list); | 2063 | INIT_LIST_HEAD(&ctx->event_list); |
1796 | atomic_set(&ctx->refcount, 1); | 2064 | atomic_set(&ctx->refcount, 1); |
1797 | ctx->task = task; | ||
1798 | } | 2065 | } |
1799 | 2066 | ||
1800 | static struct perf_event_context *find_get_context(pid_t pid, int cpu) | 2067 | static struct perf_event_context * |
2068 | alloc_perf_context(struct pmu *pmu, struct task_struct *task) | ||
1801 | { | 2069 | { |
1802 | struct perf_event_context *ctx; | 2070 | struct perf_event_context *ctx; |
1803 | struct perf_cpu_context *cpuctx; | ||
1804 | struct task_struct *task; | ||
1805 | unsigned long flags; | ||
1806 | int err; | ||
1807 | |||
1808 | if (pid == -1 && cpu != -1) { | ||
1809 | /* Must be root to operate on a CPU event: */ | ||
1810 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
1811 | return ERR_PTR(-EACCES); | ||
1812 | 2071 | ||
1813 | if (cpu < 0 || cpu >= nr_cpumask_bits) | 2072 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); |
1814 | return ERR_PTR(-EINVAL); | 2073 | if (!ctx) |
2074 | return NULL; | ||
1815 | 2075 | ||
1816 | /* | 2076 | __perf_event_init_context(ctx); |
1817 | * We could be clever and allow to attach a event to an | 2077 | if (task) { |
1818 | * offline CPU and activate it when the CPU comes up, but | 2078 | ctx->task = task; |
1819 | * that's for later. | 2079 | get_task_struct(task); |
1820 | */ | 2080 | } |
1821 | if (!cpu_online(cpu)) | 2081 | ctx->pmu = pmu; |
1822 | return ERR_PTR(-ENODEV); | ||
1823 | 2082 | ||
1824 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 2083 | return ctx; |
1825 | ctx = &cpuctx->ctx; | 2084 | } |
1826 | get_ctx(ctx); | ||
1827 | 2085 | ||
1828 | return ctx; | 2086 | static struct task_struct * |
1829 | } | 2087 | find_lively_task_by_vpid(pid_t vpid) |
2088 | { | ||
2089 | struct task_struct *task; | ||
2090 | int err; | ||
1830 | 2091 | ||
1831 | rcu_read_lock(); | 2092 | rcu_read_lock(); |
1832 | if (!pid) | 2093 | if (!vpid) |
1833 | task = current; | 2094 | task = current; |
1834 | else | 2095 | else |
1835 | task = find_task_by_vpid(pid); | 2096 | task = find_task_by_vpid(vpid); |
1836 | if (task) | 2097 | if (task) |
1837 | get_task_struct(task); | 2098 | get_task_struct(task); |
1838 | rcu_read_unlock(); | 2099 | rcu_read_unlock(); |
@@ -1852,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) | |||
1852 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2113 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
1853 | goto errout; | 2114 | goto errout; |
1854 | 2115 | ||
1855 | retry: | 2116 | return task; |
1856 | ctx = perf_lock_task_context(task, &flags); | 2117 | errout: |
2118 | put_task_struct(task); | ||
2119 | return ERR_PTR(err); | ||
2120 | |||
2121 | } | ||
2122 | |||
2123 | static struct perf_event_context * | ||
2124 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | ||
2125 | { | ||
2126 | struct perf_event_context *ctx; | ||
2127 | struct perf_cpu_context *cpuctx; | ||
2128 | unsigned long flags; | ||
2129 | int ctxn, err; | ||
2130 | |||
2131 | if (!task && cpu != -1) { | ||
2132 | /* Must be root to operate on a CPU event: */ | ||
2133 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
2134 | return ERR_PTR(-EACCES); | ||
2135 | |||
2136 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
2137 | return ERR_PTR(-EINVAL); | ||
2138 | |||
2139 | /* | ||
2140 | * We could be clever and allow to attach a event to an | ||
2141 | * offline CPU and activate it when the CPU comes up, but | ||
2142 | * that's for later. | ||
2143 | */ | ||
2144 | if (!cpu_online(cpu)) | ||
2145 | return ERR_PTR(-ENODEV); | ||
2146 | |||
2147 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
2148 | ctx = &cpuctx->ctx; | ||
2149 | get_ctx(ctx); | ||
2150 | |||
2151 | return ctx; | ||
2152 | } | ||
2153 | |||
2154 | err = -EINVAL; | ||
2155 | ctxn = pmu->task_ctx_nr; | ||
2156 | if (ctxn < 0) | ||
2157 | goto errout; | ||
2158 | |||
2159 | retry: | ||
2160 | ctx = perf_lock_task_context(task, ctxn, &flags); | ||
1857 | if (ctx) { | 2161 | if (ctx) { |
1858 | unclone_ctx(ctx); | 2162 | unclone_ctx(ctx); |
1859 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2163 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1860 | } | 2164 | } |
1861 | 2165 | ||
1862 | if (!ctx) { | 2166 | if (!ctx) { |
1863 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); | 2167 | ctx = alloc_perf_context(pmu, task); |
1864 | err = -ENOMEM; | 2168 | err = -ENOMEM; |
1865 | if (!ctx) | 2169 | if (!ctx) |
1866 | goto errout; | 2170 | goto errout; |
1867 | __perf_event_init_context(ctx, task); | 2171 | |
1868 | get_ctx(ctx); | 2172 | get_ctx(ctx); |
1869 | if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { | 2173 | |
2174 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | ||
1870 | /* | 2175 | /* |
1871 | * We raced with some other task; use | 2176 | * We raced with some other task; use |
1872 | * the context they set. | 2177 | * the context they set. |
1873 | */ | 2178 | */ |
2179 | put_task_struct(task); | ||
1874 | kfree(ctx); | 2180 | kfree(ctx); |
1875 | goto retry; | 2181 | goto retry; |
1876 | } | 2182 | } |
1877 | get_task_struct(task); | ||
1878 | } | 2183 | } |
1879 | 2184 | ||
1880 | put_task_struct(task); | ||
1881 | return ctx; | 2185 | return ctx; |
1882 | 2186 | ||
1883 | errout: | 2187 | errout: |
1884 | put_task_struct(task); | ||
1885 | return ERR_PTR(err); | 2188 | return ERR_PTR(err); |
1886 | } | 2189 | } |
1887 | 2190 | ||
@@ -1898,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head) | |||
1898 | kfree(event); | 2201 | kfree(event); |
1899 | } | 2202 | } |
1900 | 2203 | ||
1901 | static void perf_pending_sync(struct perf_event *event); | ||
1902 | static void perf_buffer_put(struct perf_buffer *buffer); | 2204 | static void perf_buffer_put(struct perf_buffer *buffer); |
1903 | 2205 | ||
1904 | static void free_event(struct perf_event *event) | 2206 | static void free_event(struct perf_event *event) |
1905 | { | 2207 | { |
1906 | perf_pending_sync(event); | 2208 | irq_work_sync(&event->pending); |
1907 | 2209 | ||
1908 | if (!event->parent) { | 2210 | if (!event->parent) { |
1909 | atomic_dec(&nr_events); | 2211 | if (event->attach_state & PERF_ATTACH_TASK) |
2212 | jump_label_dec(&perf_task_events); | ||
1910 | if (event->attr.mmap || event->attr.mmap_data) | 2213 | if (event->attr.mmap || event->attr.mmap_data) |
1911 | atomic_dec(&nr_mmap_events); | 2214 | atomic_dec(&nr_mmap_events); |
1912 | if (event->attr.comm) | 2215 | if (event->attr.comm) |
1913 | atomic_dec(&nr_comm_events); | 2216 | atomic_dec(&nr_comm_events); |
1914 | if (event->attr.task) | 2217 | if (event->attr.task) |
1915 | atomic_dec(&nr_task_events); | 2218 | atomic_dec(&nr_task_events); |
2219 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
2220 | put_callchain_buffers(); | ||
1916 | } | 2221 | } |
1917 | 2222 | ||
1918 | if (event->buffer) { | 2223 | if (event->buffer) { |
@@ -1923,7 +2228,9 @@ static void free_event(struct perf_event *event) | |||
1923 | if (event->destroy) | 2228 | if (event->destroy) |
1924 | event->destroy(event); | 2229 | event->destroy(event); |
1925 | 2230 | ||
1926 | put_ctx(event->ctx); | 2231 | if (event->ctx) |
2232 | put_ctx(event->ctx); | ||
2233 | |||
1927 | call_rcu(&event->rcu_head, free_event_rcu); | 2234 | call_rcu(&event->rcu_head, free_event_rcu); |
1928 | } | 2235 | } |
1929 | 2236 | ||
@@ -2342,6 +2649,9 @@ int perf_event_task_disable(void) | |||
2342 | 2649 | ||
2343 | static int perf_event_index(struct perf_event *event) | 2650 | static int perf_event_index(struct perf_event *event) |
2344 | { | 2651 | { |
2652 | if (event->hw.state & PERF_HES_STOPPED) | ||
2653 | return 0; | ||
2654 | |||
2345 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2655 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2346 | return 0; | 2656 | return 0; |
2347 | 2657 | ||
@@ -2845,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event) | |||
2845 | } | 3155 | } |
2846 | } | 3156 | } |
2847 | 3157 | ||
2848 | /* | 3158 | static void perf_pending_event(struct irq_work *entry) |
2849 | * Pending wakeups | ||
2850 | * | ||
2851 | * Handle the case where we need to wakeup up from NMI (or rq->lock) context. | ||
2852 | * | ||
2853 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a | ||
2854 | * single linked list and use cmpxchg() to add entries lockless. | ||
2855 | */ | ||
2856 | |||
2857 | static void perf_pending_event(struct perf_pending_entry *entry) | ||
2858 | { | 3159 | { |
2859 | struct perf_event *event = container_of(entry, | 3160 | struct perf_event *event = container_of(entry, |
2860 | struct perf_event, pending); | 3161 | struct perf_event, pending); |
@@ -2870,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) | |||
2870 | } | 3171 | } |
2871 | } | 3172 | } |
2872 | 3173 | ||
2873 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) | ||
2874 | |||
2875 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { | ||
2876 | PENDING_TAIL, | ||
2877 | }; | ||
2878 | |||
2879 | static void perf_pending_queue(struct perf_pending_entry *entry, | ||
2880 | void (*func)(struct perf_pending_entry *)) | ||
2881 | { | ||
2882 | struct perf_pending_entry **head; | ||
2883 | |||
2884 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) | ||
2885 | return; | ||
2886 | |||
2887 | entry->func = func; | ||
2888 | |||
2889 | head = &get_cpu_var(perf_pending_head); | ||
2890 | |||
2891 | do { | ||
2892 | entry->next = *head; | ||
2893 | } while (cmpxchg(head, entry->next, entry) != entry->next); | ||
2894 | |||
2895 | set_perf_event_pending(); | ||
2896 | |||
2897 | put_cpu_var(perf_pending_head); | ||
2898 | } | ||
2899 | |||
2900 | static int __perf_pending_run(void) | ||
2901 | { | ||
2902 | struct perf_pending_entry *list; | ||
2903 | int nr = 0; | ||
2904 | |||
2905 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); | ||
2906 | while (list != PENDING_TAIL) { | ||
2907 | void (*func)(struct perf_pending_entry *); | ||
2908 | struct perf_pending_entry *entry = list; | ||
2909 | |||
2910 | list = list->next; | ||
2911 | |||
2912 | func = entry->func; | ||
2913 | entry->next = NULL; | ||
2914 | /* | ||
2915 | * Ensure we observe the unqueue before we issue the wakeup, | ||
2916 | * so that we won't be waiting forever. | ||
2917 | * -- see perf_not_pending(). | ||
2918 | */ | ||
2919 | smp_wmb(); | ||
2920 | |||
2921 | func(entry); | ||
2922 | nr++; | ||
2923 | } | ||
2924 | |||
2925 | return nr; | ||
2926 | } | ||
2927 | |||
2928 | static inline int perf_not_pending(struct perf_event *event) | ||
2929 | { | ||
2930 | /* | ||
2931 | * If we flush on whatever cpu we run, there is a chance we don't | ||
2932 | * need to wait. | ||
2933 | */ | ||
2934 | get_cpu(); | ||
2935 | __perf_pending_run(); | ||
2936 | put_cpu(); | ||
2937 | |||
2938 | /* | ||
2939 | * Ensure we see the proper queue state before going to sleep | ||
2940 | * so that we do not miss the wakeup. -- see perf_pending_handle() | ||
2941 | */ | ||
2942 | smp_rmb(); | ||
2943 | return event->pending.next == NULL; | ||
2944 | } | ||
2945 | |||
2946 | static void perf_pending_sync(struct perf_event *event) | ||
2947 | { | ||
2948 | wait_event(event->waitq, perf_not_pending(event)); | ||
2949 | } | ||
2950 | |||
2951 | void perf_event_do_pending(void) | ||
2952 | { | ||
2953 | __perf_pending_run(); | ||
2954 | } | ||
2955 | |||
2956 | /* | ||
2957 | * Callchain support -- arch specific | ||
2958 | */ | ||
2959 | |||
2960 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2961 | { | ||
2962 | return NULL; | ||
2963 | } | ||
2964 | |||
2965 | |||
2966 | /* | 3174 | /* |
2967 | * We assume there is only KVM supporting the callbacks. | 3175 | * We assume there is only KVM supporting the callbacks. |
2968 | * Later on, we might change it to a list if there is | 3176 | * Later on, we might change it to a list if there is |
@@ -3012,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
3012 | 3220 | ||
3013 | if (handle->nmi) { | 3221 | if (handle->nmi) { |
3014 | handle->event->pending_wakeup = 1; | 3222 | handle->event->pending_wakeup = 1; |
3015 | perf_pending_queue(&handle->event->pending, | 3223 | irq_work_queue(&handle->event->pending); |
3016 | perf_pending_event); | ||
3017 | } else | 3224 | } else |
3018 | perf_event_wakeup(handle->event); | 3225 | perf_event_wakeup(handle->event); |
3019 | } | 3226 | } |
@@ -3069,7 +3276,7 @@ again: | |||
3069 | if (handle->wakeup != local_read(&buffer->wakeup)) | 3276 | if (handle->wakeup != local_read(&buffer->wakeup)) |
3070 | perf_output_wakeup(handle); | 3277 | perf_output_wakeup(handle); |
3071 | 3278 | ||
3072 | out: | 3279 | out: |
3073 | preempt_enable(); | 3280 | preempt_enable(); |
3074 | } | 3281 | } |
3075 | 3282 | ||
@@ -3457,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
3457 | struct perf_output_handle handle; | 3664 | struct perf_output_handle handle; |
3458 | struct perf_event_header header; | 3665 | struct perf_event_header header; |
3459 | 3666 | ||
3667 | /* protect the callchain buffers */ | ||
3668 | rcu_read_lock(); | ||
3669 | |||
3460 | perf_prepare_sample(&header, data, event, regs); | 3670 | perf_prepare_sample(&header, data, event, regs); |
3461 | 3671 | ||
3462 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 3672 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) |
3463 | return; | 3673 | goto exit; |
3464 | 3674 | ||
3465 | perf_output_sample(&handle, &header, data, event); | 3675 | perf_output_sample(&handle, &header, data, event); |
3466 | 3676 | ||
3467 | perf_output_end(&handle); | 3677 | perf_output_end(&handle); |
3678 | |||
3679 | exit: | ||
3680 | rcu_read_unlock(); | ||
3468 | } | 3681 | } |
3469 | 3682 | ||
3470 | /* | 3683 | /* |
@@ -3578,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, | |||
3578 | static void perf_event_task_event(struct perf_task_event *task_event) | 3791 | static void perf_event_task_event(struct perf_task_event *task_event) |
3579 | { | 3792 | { |
3580 | struct perf_cpu_context *cpuctx; | 3793 | struct perf_cpu_context *cpuctx; |
3581 | struct perf_event_context *ctx = task_event->task_ctx; | 3794 | struct perf_event_context *ctx; |
3795 | struct pmu *pmu; | ||
3796 | int ctxn; | ||
3582 | 3797 | ||
3583 | rcu_read_lock(); | 3798 | rcu_read_lock(); |
3584 | cpuctx = &get_cpu_var(perf_cpu_context); | 3799 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3585 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3800 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3586 | if (!ctx) | 3801 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3587 | ctx = rcu_dereference(current->perf_event_ctxp); | 3802 | |
3588 | if (ctx) | 3803 | ctx = task_event->task_ctx; |
3589 | perf_event_task_ctx(ctx, task_event); | 3804 | if (!ctx) { |
3590 | put_cpu_var(perf_cpu_context); | 3805 | ctxn = pmu->task_ctx_nr; |
3806 | if (ctxn < 0) | ||
3807 | goto next; | ||
3808 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3809 | } | ||
3810 | if (ctx) | ||
3811 | perf_event_task_ctx(ctx, task_event); | ||
3812 | next: | ||
3813 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3814 | } | ||
3591 | rcu_read_unlock(); | 3815 | rcu_read_unlock(); |
3592 | } | 3816 | } |
3593 | 3817 | ||
@@ -3692,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3692 | { | 3916 | { |
3693 | struct perf_cpu_context *cpuctx; | 3917 | struct perf_cpu_context *cpuctx; |
3694 | struct perf_event_context *ctx; | 3918 | struct perf_event_context *ctx; |
3695 | unsigned int size; | ||
3696 | char comm[TASK_COMM_LEN]; | 3919 | char comm[TASK_COMM_LEN]; |
3920 | unsigned int size; | ||
3921 | struct pmu *pmu; | ||
3922 | int ctxn; | ||
3697 | 3923 | ||
3698 | memset(comm, 0, sizeof(comm)); | 3924 | memset(comm, 0, sizeof(comm)); |
3699 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); | 3925 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
@@ -3705,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3705 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 3931 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3706 | 3932 | ||
3707 | rcu_read_lock(); | 3933 | rcu_read_lock(); |
3708 | cpuctx = &get_cpu_var(perf_cpu_context); | 3934 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3709 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 3935 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3710 | ctx = rcu_dereference(current->perf_event_ctxp); | 3936 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
3711 | if (ctx) | 3937 | |
3712 | perf_event_comm_ctx(ctx, comm_event); | 3938 | ctxn = pmu->task_ctx_nr; |
3713 | put_cpu_var(perf_cpu_context); | 3939 | if (ctxn < 0) |
3940 | goto next; | ||
3941 | |||
3942 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3943 | if (ctx) | ||
3944 | perf_event_comm_ctx(ctx, comm_event); | ||
3945 | next: | ||
3946 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3947 | } | ||
3714 | rcu_read_unlock(); | 3948 | rcu_read_unlock(); |
3715 | } | 3949 | } |
3716 | 3950 | ||
3717 | void perf_event_comm(struct task_struct *task) | 3951 | void perf_event_comm(struct task_struct *task) |
3718 | { | 3952 | { |
3719 | struct perf_comm_event comm_event; | 3953 | struct perf_comm_event comm_event; |
3954 | struct perf_event_context *ctx; | ||
3955 | int ctxn; | ||
3956 | |||
3957 | for_each_task_context_nr(ctxn) { | ||
3958 | ctx = task->perf_event_ctxp[ctxn]; | ||
3959 | if (!ctx) | ||
3960 | continue; | ||
3720 | 3961 | ||
3721 | if (task->perf_event_ctxp) | 3962 | perf_event_enable_on_exec(ctx); |
3722 | perf_event_enable_on_exec(task); | 3963 | } |
3723 | 3964 | ||
3724 | if (!atomic_read(&nr_comm_events)) | 3965 | if (!atomic_read(&nr_comm_events)) |
3725 | return; | 3966 | return; |
@@ -3821,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
3821 | char tmp[16]; | 4062 | char tmp[16]; |
3822 | char *buf = NULL; | 4063 | char *buf = NULL; |
3823 | const char *name; | 4064 | const char *name; |
4065 | struct pmu *pmu; | ||
4066 | int ctxn; | ||
3824 | 4067 | ||
3825 | memset(tmp, 0, sizeof(tmp)); | 4068 | memset(tmp, 0, sizeof(tmp)); |
3826 | 4069 | ||
@@ -3873,12 +4116,23 @@ got_name: | |||
3873 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 4116 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
3874 | 4117 | ||
3875 | rcu_read_lock(); | 4118 | rcu_read_lock(); |
3876 | cpuctx = &get_cpu_var(perf_cpu_context); | 4119 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3877 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4120 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3878 | ctx = rcu_dereference(current->perf_event_ctxp); | 4121 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
3879 | if (ctx) | 4122 | vma->vm_flags & VM_EXEC); |
3880 | perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4123 | |
3881 | put_cpu_var(perf_cpu_context); | 4124 | ctxn = pmu->task_ctx_nr; |
4125 | if (ctxn < 0) | ||
4126 | goto next; | ||
4127 | |||
4128 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4129 | if (ctx) { | ||
4130 | perf_event_mmap_ctx(ctx, mmap_event, | ||
4131 | vma->vm_flags & VM_EXEC); | ||
4132 | } | ||
4133 | next: | ||
4134 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4135 | } | ||
3882 | rcu_read_unlock(); | 4136 | rcu_read_unlock(); |
3883 | 4137 | ||
3884 | kfree(buf); | 4138 | kfree(buf); |
@@ -3960,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
3960 | struct hw_perf_event *hwc = &event->hw; | 4214 | struct hw_perf_event *hwc = &event->hw; |
3961 | int ret = 0; | 4215 | int ret = 0; |
3962 | 4216 | ||
3963 | throttle = (throttle && event->pmu->unthrottle != NULL); | ||
3964 | |||
3965 | if (!throttle) { | 4217 | if (!throttle) { |
3966 | hwc->interrupts++; | 4218 | hwc->interrupts++; |
3967 | } else { | 4219 | } else { |
@@ -4004,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4004 | event->pending_kill = POLL_HUP; | 4256 | event->pending_kill = POLL_HUP; |
4005 | if (nmi) { | 4257 | if (nmi) { |
4006 | event->pending_disable = 1; | 4258 | event->pending_disable = 1; |
4007 | perf_pending_queue(&event->pending, | 4259 | irq_work_queue(&event->pending); |
4008 | perf_pending_event); | ||
4009 | } else | 4260 | } else |
4010 | perf_event_disable(event); | 4261 | perf_event_disable(event); |
4011 | } | 4262 | } |
@@ -4029,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, | |||
4029 | * Generic software event infrastructure | 4280 | * Generic software event infrastructure |
4030 | */ | 4281 | */ |
4031 | 4282 | ||
4283 | struct swevent_htable { | ||
4284 | struct swevent_hlist *swevent_hlist; | ||
4285 | struct mutex hlist_mutex; | ||
4286 | int hlist_refcount; | ||
4287 | |||
4288 | /* Recursion avoidance in each contexts */ | ||
4289 | int recursion[PERF_NR_CONTEXTS]; | ||
4290 | }; | ||
4291 | |||
4292 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | ||
4293 | |||
4032 | /* | 4294 | /* |
4033 | * We directly increment event->count and keep a second value in | 4295 | * We directly increment event->count and keep a second value in |
4034 | * event->hw.period_left to count intervals. This period event | 4296 | * event->hw.period_left to count intervals. This period event |
@@ -4086,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4086 | } | 4348 | } |
4087 | } | 4349 | } |
4088 | 4350 | ||
4089 | static void perf_swevent_add(struct perf_event *event, u64 nr, | 4351 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
4090 | int nmi, struct perf_sample_data *data, | 4352 | int nmi, struct perf_sample_data *data, |
4091 | struct pt_regs *regs) | 4353 | struct pt_regs *regs) |
4092 | { | 4354 | { |
@@ -4112,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4112 | static int perf_exclude_event(struct perf_event *event, | 4374 | static int perf_exclude_event(struct perf_event *event, |
4113 | struct pt_regs *regs) | 4375 | struct pt_regs *regs) |
4114 | { | 4376 | { |
4377 | if (event->hw.state & PERF_HES_STOPPED) | ||
4378 | return 0; | ||
4379 | |||
4115 | if (regs) { | 4380 | if (regs) { |
4116 | if (event->attr.exclude_user && user_mode(regs)) | 4381 | if (event->attr.exclude_user && user_mode(regs)) |
4117 | return 1; | 4382 | return 1; |
@@ -4158,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) | |||
4158 | 4423 | ||
4159 | /* For the read side: events when they trigger */ | 4424 | /* For the read side: events when they trigger */ |
4160 | static inline struct hlist_head * | 4425 | static inline struct hlist_head * |
4161 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | 4426 | find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) |
4162 | { | 4427 | { |
4163 | struct swevent_hlist *hlist; | 4428 | struct swevent_hlist *hlist; |
4164 | 4429 | ||
4165 | hlist = rcu_dereference(ctx->swevent_hlist); | 4430 | hlist = rcu_dereference(swhash->swevent_hlist); |
4166 | if (!hlist) | 4431 | if (!hlist) |
4167 | return NULL; | 4432 | return NULL; |
4168 | 4433 | ||
@@ -4171,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | |||
4171 | 4436 | ||
4172 | /* For the event head insertion and removal in the hlist */ | 4437 | /* For the event head insertion and removal in the hlist */ |
4173 | static inline struct hlist_head * | 4438 | static inline struct hlist_head * |
4174 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | 4439 | find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) |
4175 | { | 4440 | { |
4176 | struct swevent_hlist *hlist; | 4441 | struct swevent_hlist *hlist; |
4177 | u32 event_id = event->attr.config; | 4442 | u32 event_id = event->attr.config; |
@@ -4182,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | |||
4182 | * and release. Which makes the protected version suitable here. | 4447 | * and release. Which makes the protected version suitable here. |
4183 | * The context lock guarantees that. | 4448 | * The context lock guarantees that. |
4184 | */ | 4449 | */ |
4185 | hlist = rcu_dereference_protected(ctx->swevent_hlist, | 4450 | hlist = rcu_dereference_protected(swhash->swevent_hlist, |
4186 | lockdep_is_held(&event->ctx->lock)); | 4451 | lockdep_is_held(&event->ctx->lock)); |
4187 | if (!hlist) | 4452 | if (!hlist) |
4188 | return NULL; | 4453 | return NULL; |
@@ -4195,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
4195 | struct perf_sample_data *data, | 4460 | struct perf_sample_data *data, |
4196 | struct pt_regs *regs) | 4461 | struct pt_regs *regs) |
4197 | { | 4462 | { |
4198 | struct perf_cpu_context *cpuctx; | 4463 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4199 | struct perf_event *event; | 4464 | struct perf_event *event; |
4200 | struct hlist_node *node; | 4465 | struct hlist_node *node; |
4201 | struct hlist_head *head; | 4466 | struct hlist_head *head; |
4202 | 4467 | ||
4203 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4204 | |||
4205 | rcu_read_lock(); | 4468 | rcu_read_lock(); |
4206 | 4469 | head = find_swevent_head_rcu(swhash, type, event_id); | |
4207 | head = find_swevent_head_rcu(cpuctx, type, event_id); | ||
4208 | |||
4209 | if (!head) | 4470 | if (!head) |
4210 | goto end; | 4471 | goto end; |
4211 | 4472 | ||
4212 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4473 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4213 | if (perf_swevent_match(event, type, event_id, data, regs)) | 4474 | if (perf_swevent_match(event, type, event_id, data, regs)) |
4214 | perf_swevent_add(event, nr, nmi, data, regs); | 4475 | perf_swevent_event(event, nr, nmi, data, regs); |
4215 | } | 4476 | } |
4216 | end: | 4477 | end: |
4217 | rcu_read_unlock(); | 4478 | rcu_read_unlock(); |
@@ -4219,33 +4480,17 @@ end: | |||
4219 | 4480 | ||
4220 | int perf_swevent_get_recursion_context(void) | 4481 | int perf_swevent_get_recursion_context(void) |
4221 | { | 4482 | { |
4222 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4483 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4223 | int rctx; | ||
4224 | |||
4225 | if (in_nmi()) | ||
4226 | rctx = 3; | ||
4227 | else if (in_irq()) | ||
4228 | rctx = 2; | ||
4229 | else if (in_softirq()) | ||
4230 | rctx = 1; | ||
4231 | else | ||
4232 | rctx = 0; | ||
4233 | |||
4234 | if (cpuctx->recursion[rctx]) | ||
4235 | return -1; | ||
4236 | 4484 | ||
4237 | cpuctx->recursion[rctx]++; | 4485 | return get_recursion_context(swhash->recursion); |
4238 | barrier(); | ||
4239 | |||
4240 | return rctx; | ||
4241 | } | 4486 | } |
4242 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4487 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4243 | 4488 | ||
4244 | void inline perf_swevent_put_recursion_context(int rctx) | 4489 | void inline perf_swevent_put_recursion_context(int rctx) |
4245 | { | 4490 | { |
4246 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4491 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4247 | barrier(); | 4492 | |
4248 | cpuctx->recursion[rctx]--; | 4493 | put_recursion_context(swhash->recursion, rctx); |
4249 | } | 4494 | } |
4250 | 4495 | ||
4251 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4496 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
@@ -4271,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event) | |||
4271 | { | 4516 | { |
4272 | } | 4517 | } |
4273 | 4518 | ||
4274 | static int perf_swevent_enable(struct perf_event *event) | 4519 | static int perf_swevent_add(struct perf_event *event, int flags) |
4275 | { | 4520 | { |
4521 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | ||
4276 | struct hw_perf_event *hwc = &event->hw; | 4522 | struct hw_perf_event *hwc = &event->hw; |
4277 | struct perf_cpu_context *cpuctx; | ||
4278 | struct hlist_head *head; | 4523 | struct hlist_head *head; |
4279 | 4524 | ||
4280 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4281 | |||
4282 | if (hwc->sample_period) { | 4525 | if (hwc->sample_period) { |
4283 | hwc->last_period = hwc->sample_period; | 4526 | hwc->last_period = hwc->sample_period; |
4284 | perf_swevent_set_period(event); | 4527 | perf_swevent_set_period(event); |
4285 | } | 4528 | } |
4286 | 4529 | ||
4287 | head = find_swevent_head(cpuctx, event); | 4530 | hwc->state = !(flags & PERF_EF_START); |
4531 | |||
4532 | head = find_swevent_head(swhash, event); | ||
4288 | if (WARN_ON_ONCE(!head)) | 4533 | if (WARN_ON_ONCE(!head)) |
4289 | return -EINVAL; | 4534 | return -EINVAL; |
4290 | 4535 | ||
@@ -4293,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event) | |||
4293 | return 0; | 4538 | return 0; |
4294 | } | 4539 | } |
4295 | 4540 | ||
4296 | static void perf_swevent_disable(struct perf_event *event) | 4541 | static void perf_swevent_del(struct perf_event *event, int flags) |
4297 | { | 4542 | { |
4298 | hlist_del_rcu(&event->hlist_entry); | 4543 | hlist_del_rcu(&event->hlist_entry); |
4299 | } | 4544 | } |
4300 | 4545 | ||
4301 | static void perf_swevent_void(struct perf_event *event) | 4546 | static void perf_swevent_start(struct perf_event *event, int flags) |
4302 | { | ||
4303 | } | ||
4304 | |||
4305 | static int perf_swevent_int(struct perf_event *event) | ||
4306 | { | ||
4307 | return 0; | ||
4308 | } | ||
4309 | |||
4310 | static const struct pmu perf_ops_generic = { | ||
4311 | .enable = perf_swevent_enable, | ||
4312 | .disable = perf_swevent_disable, | ||
4313 | .start = perf_swevent_int, | ||
4314 | .stop = perf_swevent_void, | ||
4315 | .read = perf_swevent_read, | ||
4316 | .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ | ||
4317 | }; | ||
4318 | |||
4319 | /* | ||
4320 | * hrtimer based swevent callback | ||
4321 | */ | ||
4322 | |||
4323 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4324 | { | 4547 | { |
4325 | enum hrtimer_restart ret = HRTIMER_RESTART; | 4548 | event->hw.state = 0; |
4326 | struct perf_sample_data data; | ||
4327 | struct pt_regs *regs; | ||
4328 | struct perf_event *event; | ||
4329 | u64 period; | ||
4330 | |||
4331 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | ||
4332 | event->pmu->read(event); | ||
4333 | |||
4334 | perf_sample_data_init(&data, 0); | ||
4335 | data.period = event->hw.last_period; | ||
4336 | regs = get_irq_regs(); | ||
4337 | |||
4338 | if (regs && !perf_exclude_event(event, regs)) { | ||
4339 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4340 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4341 | ret = HRTIMER_NORESTART; | ||
4342 | } | ||
4343 | |||
4344 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4345 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4346 | |||
4347 | return ret; | ||
4348 | } | 4549 | } |
4349 | 4550 | ||
4350 | static void perf_swevent_start_hrtimer(struct perf_event *event) | 4551 | static void perf_swevent_stop(struct perf_event *event, int flags) |
4351 | { | 4552 | { |
4352 | struct hw_perf_event *hwc = &event->hw; | 4553 | event->hw.state = PERF_HES_STOPPED; |
4353 | |||
4354 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
4355 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4356 | if (hwc->sample_period) { | ||
4357 | u64 period; | ||
4358 | |||
4359 | if (hwc->remaining) { | ||
4360 | if (hwc->remaining < 0) | ||
4361 | period = 10000; | ||
4362 | else | ||
4363 | period = hwc->remaining; | ||
4364 | hwc->remaining = 0; | ||
4365 | } else { | ||
4366 | period = max_t(u64, 10000, hwc->sample_period); | ||
4367 | } | ||
4368 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4369 | ns_to_ktime(period), 0, | ||
4370 | HRTIMER_MODE_REL, 0); | ||
4371 | } | ||
4372 | } | ||
4373 | |||
4374 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | ||
4375 | { | ||
4376 | struct hw_perf_event *hwc = &event->hw; | ||
4377 | |||
4378 | if (hwc->sample_period) { | ||
4379 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4380 | hwc->remaining = ktime_to_ns(remaining); | ||
4381 | |||
4382 | hrtimer_cancel(&hwc->hrtimer); | ||
4383 | } | ||
4384 | } | ||
4385 | |||
4386 | /* | ||
4387 | * Software event: cpu wall time clock | ||
4388 | */ | ||
4389 | |||
4390 | static void cpu_clock_perf_event_update(struct perf_event *event) | ||
4391 | { | ||
4392 | int cpu = raw_smp_processor_id(); | ||
4393 | s64 prev; | ||
4394 | u64 now; | ||
4395 | |||
4396 | now = cpu_clock(cpu); | ||
4397 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4398 | local64_add(now - prev, &event->count); | ||
4399 | } | ||
4400 | |||
4401 | static int cpu_clock_perf_event_enable(struct perf_event *event) | ||
4402 | { | ||
4403 | struct hw_perf_event *hwc = &event->hw; | ||
4404 | int cpu = raw_smp_processor_id(); | ||
4405 | |||
4406 | local64_set(&hwc->prev_count, cpu_clock(cpu)); | ||
4407 | perf_swevent_start_hrtimer(event); | ||
4408 | |||
4409 | return 0; | ||
4410 | } | ||
4411 | |||
4412 | static void cpu_clock_perf_event_disable(struct perf_event *event) | ||
4413 | { | ||
4414 | perf_swevent_cancel_hrtimer(event); | ||
4415 | cpu_clock_perf_event_update(event); | ||
4416 | } | ||
4417 | |||
4418 | static void cpu_clock_perf_event_read(struct perf_event *event) | ||
4419 | { | ||
4420 | cpu_clock_perf_event_update(event); | ||
4421 | } | ||
4422 | |||
4423 | static const struct pmu perf_ops_cpu_clock = { | ||
4424 | .enable = cpu_clock_perf_event_enable, | ||
4425 | .disable = cpu_clock_perf_event_disable, | ||
4426 | .read = cpu_clock_perf_event_read, | ||
4427 | }; | ||
4428 | |||
4429 | /* | ||
4430 | * Software event: task time clock | ||
4431 | */ | ||
4432 | |||
4433 | static void task_clock_perf_event_update(struct perf_event *event, u64 now) | ||
4434 | { | ||
4435 | u64 prev; | ||
4436 | s64 delta; | ||
4437 | |||
4438 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4439 | delta = now - prev; | ||
4440 | local64_add(delta, &event->count); | ||
4441 | } | ||
4442 | |||
4443 | static int task_clock_perf_event_enable(struct perf_event *event) | ||
4444 | { | ||
4445 | struct hw_perf_event *hwc = &event->hw; | ||
4446 | u64 now; | ||
4447 | |||
4448 | now = event->ctx->time; | ||
4449 | |||
4450 | local64_set(&hwc->prev_count, now); | ||
4451 | |||
4452 | perf_swevent_start_hrtimer(event); | ||
4453 | |||
4454 | return 0; | ||
4455 | } | ||
4456 | |||
4457 | static void task_clock_perf_event_disable(struct perf_event *event) | ||
4458 | { | ||
4459 | perf_swevent_cancel_hrtimer(event); | ||
4460 | task_clock_perf_event_update(event, event->ctx->time); | ||
4461 | |||
4462 | } | ||
4463 | |||
4464 | static void task_clock_perf_event_read(struct perf_event *event) | ||
4465 | { | ||
4466 | u64 time; | ||
4467 | |||
4468 | if (!in_nmi()) { | ||
4469 | update_context_time(event->ctx); | ||
4470 | time = event->ctx->time; | ||
4471 | } else { | ||
4472 | u64 now = perf_clock(); | ||
4473 | u64 delta = now - event->ctx->timestamp; | ||
4474 | time = event->ctx->time + delta; | ||
4475 | } | ||
4476 | |||
4477 | task_clock_perf_event_update(event, time); | ||
4478 | } | 4554 | } |
4479 | 4555 | ||
4480 | static const struct pmu perf_ops_task_clock = { | ||
4481 | .enable = task_clock_perf_event_enable, | ||
4482 | .disable = task_clock_perf_event_disable, | ||
4483 | .read = task_clock_perf_event_read, | ||
4484 | }; | ||
4485 | |||
4486 | /* Deref the hlist from the update side */ | 4556 | /* Deref the hlist from the update side */ |
4487 | static inline struct swevent_hlist * | 4557 | static inline struct swevent_hlist * |
4488 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) | 4558 | swevent_hlist_deref(struct swevent_htable *swhash) |
4489 | { | 4559 | { |
4490 | return rcu_dereference_protected(cpuctx->swevent_hlist, | 4560 | return rcu_dereference_protected(swhash->swevent_hlist, |
4491 | lockdep_is_held(&cpuctx->hlist_mutex)); | 4561 | lockdep_is_held(&swhash->hlist_mutex)); |
4492 | } | 4562 | } |
4493 | 4563 | ||
4494 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | 4564 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) |
@@ -4499,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | |||
4499 | kfree(hlist); | 4569 | kfree(hlist); |
4500 | } | 4570 | } |
4501 | 4571 | ||
4502 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) | 4572 | static void swevent_hlist_release(struct swevent_htable *swhash) |
4503 | { | 4573 | { |
4504 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); | 4574 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
4505 | 4575 | ||
4506 | if (!hlist) | 4576 | if (!hlist) |
4507 | return; | 4577 | return; |
4508 | 4578 | ||
4509 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); | 4579 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
4510 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 4580 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); |
4511 | } | 4581 | } |
4512 | 4582 | ||
4513 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 4583 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
4514 | { | 4584 | { |
4515 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4585 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4516 | 4586 | ||
4517 | mutex_lock(&cpuctx->hlist_mutex); | 4587 | mutex_lock(&swhash->hlist_mutex); |
4518 | 4588 | ||
4519 | if (!--cpuctx->hlist_refcount) | 4589 | if (!--swhash->hlist_refcount) |
4520 | swevent_hlist_release(cpuctx); | 4590 | swevent_hlist_release(swhash); |
4521 | 4591 | ||
4522 | mutex_unlock(&cpuctx->hlist_mutex); | 4592 | mutex_unlock(&swhash->hlist_mutex); |
4523 | } | 4593 | } |
4524 | 4594 | ||
4525 | static void swevent_hlist_put(struct perf_event *event) | 4595 | static void swevent_hlist_put(struct perf_event *event) |
@@ -4537,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event) | |||
4537 | 4607 | ||
4538 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | 4608 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) |
4539 | { | 4609 | { |
4540 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4610 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4541 | int err = 0; | 4611 | int err = 0; |
4542 | 4612 | ||
4543 | mutex_lock(&cpuctx->hlist_mutex); | 4613 | mutex_lock(&swhash->hlist_mutex); |
4544 | 4614 | ||
4545 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { | 4615 | if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { |
4546 | struct swevent_hlist *hlist; | 4616 | struct swevent_hlist *hlist; |
4547 | 4617 | ||
4548 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 4618 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
@@ -4550,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | |||
4550 | err = -ENOMEM; | 4620 | err = -ENOMEM; |
4551 | goto exit; | 4621 | goto exit; |
4552 | } | 4622 | } |
4553 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 4623 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
4554 | } | 4624 | } |
4555 | cpuctx->hlist_refcount++; | 4625 | swhash->hlist_refcount++; |
4556 | exit: | 4626 | exit: |
4557 | mutex_unlock(&cpuctx->hlist_mutex); | 4627 | mutex_unlock(&swhash->hlist_mutex); |
4558 | 4628 | ||
4559 | return err; | 4629 | return err; |
4560 | } | 4630 | } |
@@ -4578,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4578 | put_online_cpus(); | 4648 | put_online_cpus(); |
4579 | 4649 | ||
4580 | return 0; | 4650 | return 0; |
4581 | fail: | 4651 | fail: |
4582 | for_each_possible_cpu(cpu) { | 4652 | for_each_possible_cpu(cpu) { |
4583 | if (cpu == failed_cpu) | 4653 | if (cpu == failed_cpu) |
4584 | break; | 4654 | break; |
@@ -4589,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4589 | return err; | 4659 | return err; |
4590 | } | 4660 | } |
4591 | 4661 | ||
4592 | #ifdef CONFIG_EVENT_TRACING | 4662 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
4663 | |||
4664 | static void sw_perf_event_destroy(struct perf_event *event) | ||
4665 | { | ||
4666 | u64 event_id = event->attr.config; | ||
4667 | |||
4668 | WARN_ON(event->parent); | ||
4669 | |||
4670 | jump_label_dec(&perf_swevent_enabled[event_id]); | ||
4671 | swevent_hlist_put(event); | ||
4672 | } | ||
4673 | |||
4674 | static int perf_swevent_init(struct perf_event *event) | ||
4675 | { | ||
4676 | int event_id = event->attr.config; | ||
4677 | |||
4678 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
4679 | return -ENOENT; | ||
4680 | |||
4681 | switch (event_id) { | ||
4682 | case PERF_COUNT_SW_CPU_CLOCK: | ||
4683 | case PERF_COUNT_SW_TASK_CLOCK: | ||
4684 | return -ENOENT; | ||
4593 | 4685 | ||
4594 | static const struct pmu perf_ops_tracepoint = { | 4686 | default: |
4595 | .enable = perf_trace_enable, | 4687 | break; |
4596 | .disable = perf_trace_disable, | 4688 | } |
4597 | .start = perf_swevent_int, | 4689 | |
4598 | .stop = perf_swevent_void, | 4690 | if (event_id > PERF_COUNT_SW_MAX) |
4691 | return -ENOENT; | ||
4692 | |||
4693 | if (!event->parent) { | ||
4694 | int err; | ||
4695 | |||
4696 | err = swevent_hlist_get(event); | ||
4697 | if (err) | ||
4698 | return err; | ||
4699 | |||
4700 | jump_label_inc(&perf_swevent_enabled[event_id]); | ||
4701 | event->destroy = sw_perf_event_destroy; | ||
4702 | } | ||
4703 | |||
4704 | return 0; | ||
4705 | } | ||
4706 | |||
4707 | static struct pmu perf_swevent = { | ||
4708 | .task_ctx_nr = perf_sw_context, | ||
4709 | |||
4710 | .event_init = perf_swevent_init, | ||
4711 | .add = perf_swevent_add, | ||
4712 | .del = perf_swevent_del, | ||
4713 | .start = perf_swevent_start, | ||
4714 | .stop = perf_swevent_stop, | ||
4599 | .read = perf_swevent_read, | 4715 | .read = perf_swevent_read, |
4600 | .unthrottle = perf_swevent_void, | ||
4601 | }; | 4716 | }; |
4602 | 4717 | ||
4718 | #ifdef CONFIG_EVENT_TRACING | ||
4719 | |||
4603 | static int perf_tp_filter_match(struct perf_event *event, | 4720 | static int perf_tp_filter_match(struct perf_event *event, |
4604 | struct perf_sample_data *data) | 4721 | struct perf_sample_data *data) |
4605 | { | 4722 | { |
@@ -4643,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
4643 | 4760 | ||
4644 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4761 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4645 | if (perf_tp_event_match(event, &data, regs)) | 4762 | if (perf_tp_event_match(event, &data, regs)) |
4646 | perf_swevent_add(event, count, 1, &data, regs); | 4763 | perf_swevent_event(event, count, 1, &data, regs); |
4647 | } | 4764 | } |
4648 | 4765 | ||
4649 | perf_swevent_put_recursion_context(rctx); | 4766 | perf_swevent_put_recursion_context(rctx); |
@@ -4655,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event) | |||
4655 | perf_trace_destroy(event); | 4772 | perf_trace_destroy(event); |
4656 | } | 4773 | } |
4657 | 4774 | ||
4658 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4775 | static int perf_tp_event_init(struct perf_event *event) |
4659 | { | 4776 | { |
4660 | int err; | 4777 | int err; |
4661 | 4778 | ||
4779 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
4780 | return -ENOENT; | ||
4781 | |||
4662 | /* | 4782 | /* |
4663 | * Raw tracepoint data is a severe data leak, only allow root to | 4783 | * Raw tracepoint data is a severe data leak, only allow root to |
4664 | * have these. | 4784 | * have these. |
@@ -4666,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) | |||
4666 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | 4786 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && |
4667 | perf_paranoid_tracepoint_raw() && | 4787 | perf_paranoid_tracepoint_raw() && |
4668 | !capable(CAP_SYS_ADMIN)) | 4788 | !capable(CAP_SYS_ADMIN)) |
4669 | return ERR_PTR(-EPERM); | 4789 | return -EPERM; |
4670 | 4790 | ||
4671 | err = perf_trace_init(event); | 4791 | err = perf_trace_init(event); |
4672 | if (err) | 4792 | if (err) |
4673 | return NULL; | 4793 | return err; |
4674 | 4794 | ||
4675 | event->destroy = tp_perf_event_destroy; | 4795 | event->destroy = tp_perf_event_destroy; |
4676 | 4796 | ||
4677 | return &perf_ops_tracepoint; | 4797 | return 0; |
4798 | } | ||
4799 | |||
4800 | static struct pmu perf_tracepoint = { | ||
4801 | .task_ctx_nr = perf_sw_context, | ||
4802 | |||
4803 | .event_init = perf_tp_event_init, | ||
4804 | .add = perf_trace_add, | ||
4805 | .del = perf_trace_del, | ||
4806 | .start = perf_swevent_start, | ||
4807 | .stop = perf_swevent_stop, | ||
4808 | .read = perf_swevent_read, | ||
4809 | }; | ||
4810 | |||
4811 | static inline void perf_tp_register(void) | ||
4812 | { | ||
4813 | perf_pmu_register(&perf_tracepoint); | ||
4678 | } | 4814 | } |
4679 | 4815 | ||
4680 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4816 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4702,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4702 | 4838 | ||
4703 | #else | 4839 | #else |
4704 | 4840 | ||
4705 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4841 | static inline void perf_tp_register(void) |
4706 | { | 4842 | { |
4707 | return NULL; | ||
4708 | } | 4843 | } |
4709 | 4844 | ||
4710 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4845 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4719,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4719 | #endif /* CONFIG_EVENT_TRACING */ | 4854 | #endif /* CONFIG_EVENT_TRACING */ |
4720 | 4855 | ||
4721 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 4856 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
4722 | static void bp_perf_event_destroy(struct perf_event *event) | 4857 | void perf_bp_event(struct perf_event *bp, void *data) |
4723 | { | 4858 | { |
4724 | release_bp_slot(event); | 4859 | struct perf_sample_data sample; |
4860 | struct pt_regs *regs = data; | ||
4861 | |||
4862 | perf_sample_data_init(&sample, bp->attr.bp_addr); | ||
4863 | |||
4864 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | ||
4865 | perf_swevent_event(bp, 1, 1, &sample, regs); | ||
4725 | } | 4866 | } |
4867 | #endif | ||
4726 | 4868 | ||
4727 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4869 | /* |
4870 | * hrtimer based swevent callback | ||
4871 | */ | ||
4872 | |||
4873 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4728 | { | 4874 | { |
4729 | int err; | 4875 | enum hrtimer_restart ret = HRTIMER_RESTART; |
4876 | struct perf_sample_data data; | ||
4877 | struct pt_regs *regs; | ||
4878 | struct perf_event *event; | ||
4879 | u64 period; | ||
4730 | 4880 | ||
4731 | err = register_perf_hw_breakpoint(bp); | 4881 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
4732 | if (err) | 4882 | event->pmu->read(event); |
4733 | return ERR_PTR(err); | 4883 | |
4884 | perf_sample_data_init(&data, 0); | ||
4885 | data.period = event->hw.last_period; | ||
4886 | regs = get_irq_regs(); | ||
4887 | |||
4888 | if (regs && !perf_exclude_event(event, regs)) { | ||
4889 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4890 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4891 | ret = HRTIMER_NORESTART; | ||
4892 | } | ||
4893 | |||
4894 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4895 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4734 | 4896 | ||
4735 | bp->destroy = bp_perf_event_destroy; | 4897 | return ret; |
4898 | } | ||
4736 | 4899 | ||
4737 | return &perf_ops_bp; | 4900 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4901 | { | ||
4902 | struct hw_perf_event *hwc = &event->hw; | ||
4903 | |||
4904 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
4905 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4906 | if (hwc->sample_period) { | ||
4907 | s64 period = local64_read(&hwc->period_left); | ||
4908 | |||
4909 | if (period) { | ||
4910 | if (period < 0) | ||
4911 | period = 10000; | ||
4912 | |||
4913 | local64_set(&hwc->period_left, 0); | ||
4914 | } else { | ||
4915 | period = max_t(u64, 10000, hwc->sample_period); | ||
4916 | } | ||
4917 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4918 | ns_to_ktime(period), 0, | ||
4919 | HRTIMER_MODE_REL_PINNED, 0); | ||
4920 | } | ||
4738 | } | 4921 | } |
4739 | 4922 | ||
4740 | void perf_bp_event(struct perf_event *bp, void *data) | 4923 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
4741 | { | 4924 | { |
4742 | struct perf_sample_data sample; | 4925 | struct hw_perf_event *hwc = &event->hw; |
4743 | struct pt_regs *regs = data; | ||
4744 | 4926 | ||
4745 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 4927 | if (hwc->sample_period) { |
4928 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4929 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | ||
4746 | 4930 | ||
4747 | if (!perf_exclude_event(bp, regs)) | 4931 | hrtimer_cancel(&hwc->hrtimer); |
4748 | perf_swevent_add(bp, 1, 1, &sample, regs); | 4932 | } |
4749 | } | 4933 | } |
4750 | #else | 4934 | |
4751 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4935 | /* |
4936 | * Software event: cpu wall time clock | ||
4937 | */ | ||
4938 | |||
4939 | static void cpu_clock_event_update(struct perf_event *event) | ||
4752 | { | 4940 | { |
4753 | return NULL; | 4941 | s64 prev; |
4942 | u64 now; | ||
4943 | |||
4944 | now = local_clock(); | ||
4945 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4946 | local64_add(now - prev, &event->count); | ||
4754 | } | 4947 | } |
4755 | 4948 | ||
4756 | void perf_bp_event(struct perf_event *bp, void *regs) | 4949 | static void cpu_clock_event_start(struct perf_event *event, int flags) |
4757 | { | 4950 | { |
4951 | local64_set(&event->hw.prev_count, local_clock()); | ||
4952 | perf_swevent_start_hrtimer(event); | ||
4758 | } | 4953 | } |
4759 | #endif | ||
4760 | 4954 | ||
4761 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 4955 | static void cpu_clock_event_stop(struct perf_event *event, int flags) |
4956 | { | ||
4957 | perf_swevent_cancel_hrtimer(event); | ||
4958 | cpu_clock_event_update(event); | ||
4959 | } | ||
4762 | 4960 | ||
4763 | static void sw_perf_event_destroy(struct perf_event *event) | 4961 | static int cpu_clock_event_add(struct perf_event *event, int flags) |
4764 | { | 4962 | { |
4765 | u64 event_id = event->attr.config; | 4963 | if (flags & PERF_EF_START) |
4964 | cpu_clock_event_start(event, flags); | ||
4766 | 4965 | ||
4767 | WARN_ON(event->parent); | 4966 | return 0; |
4967 | } | ||
4768 | 4968 | ||
4769 | atomic_dec(&perf_swevent_enabled[event_id]); | 4969 | static void cpu_clock_event_del(struct perf_event *event, int flags) |
4770 | swevent_hlist_put(event); | 4970 | { |
4971 | cpu_clock_event_stop(event, flags); | ||
4771 | } | 4972 | } |
4772 | 4973 | ||
4773 | static const struct pmu *sw_perf_event_init(struct perf_event *event) | 4974 | static void cpu_clock_event_read(struct perf_event *event) |
4774 | { | 4975 | { |
4775 | const struct pmu *pmu = NULL; | 4976 | cpu_clock_event_update(event); |
4776 | u64 event_id = event->attr.config; | 4977 | } |
4978 | |||
4979 | static int cpu_clock_event_init(struct perf_event *event) | ||
4980 | { | ||
4981 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
4982 | return -ENOENT; | ||
4983 | |||
4984 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | ||
4985 | return -ENOENT; | ||
4986 | |||
4987 | return 0; | ||
4988 | } | ||
4777 | 4989 | ||
4990 | static struct pmu perf_cpu_clock = { | ||
4991 | .task_ctx_nr = perf_sw_context, | ||
4992 | |||
4993 | .event_init = cpu_clock_event_init, | ||
4994 | .add = cpu_clock_event_add, | ||
4995 | .del = cpu_clock_event_del, | ||
4996 | .start = cpu_clock_event_start, | ||
4997 | .stop = cpu_clock_event_stop, | ||
4998 | .read = cpu_clock_event_read, | ||
4999 | }; | ||
5000 | |||
5001 | /* | ||
5002 | * Software event: task time clock | ||
5003 | */ | ||
5004 | |||
5005 | static void task_clock_event_update(struct perf_event *event, u64 now) | ||
5006 | { | ||
5007 | u64 prev; | ||
5008 | s64 delta; | ||
5009 | |||
5010 | prev = local64_xchg(&event->hw.prev_count, now); | ||
5011 | delta = now - prev; | ||
5012 | local64_add(delta, &event->count); | ||
5013 | } | ||
5014 | |||
5015 | static void task_clock_event_start(struct perf_event *event, int flags) | ||
5016 | { | ||
5017 | local64_set(&event->hw.prev_count, event->ctx->time); | ||
5018 | perf_swevent_start_hrtimer(event); | ||
5019 | } | ||
5020 | |||
5021 | static void task_clock_event_stop(struct perf_event *event, int flags) | ||
5022 | { | ||
5023 | perf_swevent_cancel_hrtimer(event); | ||
5024 | task_clock_event_update(event, event->ctx->time); | ||
5025 | } | ||
5026 | |||
5027 | static int task_clock_event_add(struct perf_event *event, int flags) | ||
5028 | { | ||
5029 | if (flags & PERF_EF_START) | ||
5030 | task_clock_event_start(event, flags); | ||
5031 | |||
5032 | return 0; | ||
5033 | } | ||
5034 | |||
5035 | static void task_clock_event_del(struct perf_event *event, int flags) | ||
5036 | { | ||
5037 | task_clock_event_stop(event, PERF_EF_UPDATE); | ||
5038 | } | ||
5039 | |||
5040 | static void task_clock_event_read(struct perf_event *event) | ||
5041 | { | ||
5042 | u64 time; | ||
5043 | |||
5044 | if (!in_nmi()) { | ||
5045 | update_context_time(event->ctx); | ||
5046 | time = event->ctx->time; | ||
5047 | } else { | ||
5048 | u64 now = perf_clock(); | ||
5049 | u64 delta = now - event->ctx->timestamp; | ||
5050 | time = event->ctx->time + delta; | ||
5051 | } | ||
5052 | |||
5053 | task_clock_event_update(event, time); | ||
5054 | } | ||
5055 | |||
5056 | static int task_clock_event_init(struct perf_event *event) | ||
5057 | { | ||
5058 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5059 | return -ENOENT; | ||
5060 | |||
5061 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | ||
5062 | return -ENOENT; | ||
5063 | |||
5064 | return 0; | ||
5065 | } | ||
5066 | |||
5067 | static struct pmu perf_task_clock = { | ||
5068 | .task_ctx_nr = perf_sw_context, | ||
5069 | |||
5070 | .event_init = task_clock_event_init, | ||
5071 | .add = task_clock_event_add, | ||
5072 | .del = task_clock_event_del, | ||
5073 | .start = task_clock_event_start, | ||
5074 | .stop = task_clock_event_stop, | ||
5075 | .read = task_clock_event_read, | ||
5076 | }; | ||
5077 | |||
5078 | static void perf_pmu_nop_void(struct pmu *pmu) | ||
5079 | { | ||
5080 | } | ||
5081 | |||
5082 | static int perf_pmu_nop_int(struct pmu *pmu) | ||
5083 | { | ||
5084 | return 0; | ||
5085 | } | ||
5086 | |||
5087 | static void perf_pmu_start_txn(struct pmu *pmu) | ||
5088 | { | ||
5089 | perf_pmu_disable(pmu); | ||
5090 | } | ||
5091 | |||
5092 | static int perf_pmu_commit_txn(struct pmu *pmu) | ||
5093 | { | ||
5094 | perf_pmu_enable(pmu); | ||
5095 | return 0; | ||
5096 | } | ||
5097 | |||
5098 | static void perf_pmu_cancel_txn(struct pmu *pmu) | ||
5099 | { | ||
5100 | perf_pmu_enable(pmu); | ||
5101 | } | ||
5102 | |||
5103 | /* | ||
5104 | * Ensures all contexts with the same task_ctx_nr have the same | ||
5105 | * pmu_cpu_context too. | ||
5106 | */ | ||
5107 | static void *find_pmu_context(int ctxn) | ||
5108 | { | ||
5109 | struct pmu *pmu; | ||
5110 | |||
5111 | if (ctxn < 0) | ||
5112 | return NULL; | ||
5113 | |||
5114 | list_for_each_entry(pmu, &pmus, entry) { | ||
5115 | if (pmu->task_ctx_nr == ctxn) | ||
5116 | return pmu->pmu_cpu_context; | ||
5117 | } | ||
5118 | |||
5119 | return NULL; | ||
5120 | } | ||
5121 | |||
5122 | static void free_pmu_context(void * __percpu cpu_context) | ||
5123 | { | ||
5124 | struct pmu *pmu; | ||
5125 | |||
5126 | mutex_lock(&pmus_lock); | ||
4778 | /* | 5127 | /* |
4779 | * Software events (currently) can't in general distinguish | 5128 | * Like a real lame refcount. |
4780 | * between user, kernel and hypervisor events. | ||
4781 | * However, context switches and cpu migrations are considered | ||
4782 | * to be kernel events, and page faults are never hypervisor | ||
4783 | * events. | ||
4784 | */ | 5129 | */ |
4785 | switch (event_id) { | 5130 | list_for_each_entry(pmu, &pmus, entry) { |
4786 | case PERF_COUNT_SW_CPU_CLOCK: | 5131 | if (pmu->pmu_cpu_context == cpu_context) |
4787 | pmu = &perf_ops_cpu_clock; | 5132 | goto out; |
5133 | } | ||
4788 | 5134 | ||
4789 | break; | 5135 | free_percpu(cpu_context); |
4790 | case PERF_COUNT_SW_TASK_CLOCK: | 5136 | out: |
4791 | /* | 5137 | mutex_unlock(&pmus_lock); |
4792 | * If the user instantiates this as a per-cpu event, | 5138 | } |
4793 | * use the cpu_clock event instead. | ||
4794 | */ | ||
4795 | if (event->ctx->task) | ||
4796 | pmu = &perf_ops_task_clock; | ||
4797 | else | ||
4798 | pmu = &perf_ops_cpu_clock; | ||
4799 | 5139 | ||
4800 | break; | 5140 | int perf_pmu_register(struct pmu *pmu) |
4801 | case PERF_COUNT_SW_PAGE_FAULTS: | 5141 | { |
4802 | case PERF_COUNT_SW_PAGE_FAULTS_MIN: | 5142 | int cpu, ret; |
4803 | case PERF_COUNT_SW_PAGE_FAULTS_MAJ: | 5143 | |
4804 | case PERF_COUNT_SW_CONTEXT_SWITCHES: | 5144 | mutex_lock(&pmus_lock); |
4805 | case PERF_COUNT_SW_CPU_MIGRATIONS: | 5145 | ret = -ENOMEM; |
4806 | case PERF_COUNT_SW_ALIGNMENT_FAULTS: | 5146 | pmu->pmu_disable_count = alloc_percpu(int); |
4807 | case PERF_COUNT_SW_EMULATION_FAULTS: | 5147 | if (!pmu->pmu_disable_count) |
4808 | if (!event->parent) { | 5148 | goto unlock; |
4809 | int err; | ||
4810 | |||
4811 | err = swevent_hlist_get(event); | ||
4812 | if (err) | ||
4813 | return ERR_PTR(err); | ||
4814 | 5149 | ||
4815 | atomic_inc(&perf_swevent_enabled[event_id]); | 5150 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); |
4816 | event->destroy = sw_perf_event_destroy; | 5151 | if (pmu->pmu_cpu_context) |
5152 | goto got_cpu_context; | ||
5153 | |||
5154 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | ||
5155 | if (!pmu->pmu_cpu_context) | ||
5156 | goto free_pdc; | ||
5157 | |||
5158 | for_each_possible_cpu(cpu) { | ||
5159 | struct perf_cpu_context *cpuctx; | ||
5160 | |||
5161 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
5162 | __perf_event_init_context(&cpuctx->ctx); | ||
5163 | cpuctx->ctx.type = cpu_context; | ||
5164 | cpuctx->ctx.pmu = pmu; | ||
5165 | cpuctx->jiffies_interval = 1; | ||
5166 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
5167 | } | ||
5168 | |||
5169 | got_cpu_context: | ||
5170 | if (!pmu->start_txn) { | ||
5171 | if (pmu->pmu_enable) { | ||
5172 | /* | ||
5173 | * If we have pmu_enable/pmu_disable calls, install | ||
5174 | * transaction stubs that use that to try and batch | ||
5175 | * hardware accesses. | ||
5176 | */ | ||
5177 | pmu->start_txn = perf_pmu_start_txn; | ||
5178 | pmu->commit_txn = perf_pmu_commit_txn; | ||
5179 | pmu->cancel_txn = perf_pmu_cancel_txn; | ||
5180 | } else { | ||
5181 | pmu->start_txn = perf_pmu_nop_void; | ||
5182 | pmu->commit_txn = perf_pmu_nop_int; | ||
5183 | pmu->cancel_txn = perf_pmu_nop_void; | ||
5184 | } | ||
5185 | } | ||
5186 | |||
5187 | if (!pmu->pmu_enable) { | ||
5188 | pmu->pmu_enable = perf_pmu_nop_void; | ||
5189 | pmu->pmu_disable = perf_pmu_nop_void; | ||
5190 | } | ||
5191 | |||
5192 | list_add_rcu(&pmu->entry, &pmus); | ||
5193 | ret = 0; | ||
5194 | unlock: | ||
5195 | mutex_unlock(&pmus_lock); | ||
5196 | |||
5197 | return ret; | ||
5198 | |||
5199 | free_pdc: | ||
5200 | free_percpu(pmu->pmu_disable_count); | ||
5201 | goto unlock; | ||
5202 | } | ||
5203 | |||
5204 | void perf_pmu_unregister(struct pmu *pmu) | ||
5205 | { | ||
5206 | mutex_lock(&pmus_lock); | ||
5207 | list_del_rcu(&pmu->entry); | ||
5208 | mutex_unlock(&pmus_lock); | ||
5209 | |||
5210 | /* | ||
5211 | * We dereference the pmu list under both SRCU and regular RCU, so | ||
5212 | * synchronize against both of those. | ||
5213 | */ | ||
5214 | synchronize_srcu(&pmus_srcu); | ||
5215 | synchronize_rcu(); | ||
5216 | |||
5217 | free_percpu(pmu->pmu_disable_count); | ||
5218 | free_pmu_context(pmu->pmu_cpu_context); | ||
5219 | } | ||
5220 | |||
5221 | struct pmu *perf_init_event(struct perf_event *event) | ||
5222 | { | ||
5223 | struct pmu *pmu = NULL; | ||
5224 | int idx; | ||
5225 | |||
5226 | idx = srcu_read_lock(&pmus_srcu); | ||
5227 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
5228 | int ret = pmu->event_init(event); | ||
5229 | if (!ret) | ||
5230 | goto unlock; | ||
5231 | |||
5232 | if (ret != -ENOENT) { | ||
5233 | pmu = ERR_PTR(ret); | ||
5234 | goto unlock; | ||
4817 | } | 5235 | } |
4818 | pmu = &perf_ops_generic; | ||
4819 | break; | ||
4820 | } | 5236 | } |
5237 | pmu = ERR_PTR(-ENOENT); | ||
5238 | unlock: | ||
5239 | srcu_read_unlock(&pmus_srcu, idx); | ||
4821 | 5240 | ||
4822 | return pmu; | 5241 | return pmu; |
4823 | } | 5242 | } |
@@ -4826,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) | |||
4826 | * Allocate and initialize a event structure | 5245 | * Allocate and initialize a event structure |
4827 | */ | 5246 | */ |
4828 | static struct perf_event * | 5247 | static struct perf_event * |
4829 | perf_event_alloc(struct perf_event_attr *attr, | 5248 | perf_event_alloc(struct perf_event_attr *attr, int cpu, |
4830 | int cpu, | 5249 | struct task_struct *task, |
4831 | struct perf_event_context *ctx, | 5250 | struct perf_event *group_leader, |
4832 | struct perf_event *group_leader, | 5251 | struct perf_event *parent_event, |
4833 | struct perf_event *parent_event, | 5252 | perf_overflow_handler_t overflow_handler) |
4834 | perf_overflow_handler_t overflow_handler, | 5253 | { |
4835 | gfp_t gfpflags) | 5254 | struct pmu *pmu; |
4836 | { | ||
4837 | const struct pmu *pmu; | ||
4838 | struct perf_event *event; | 5255 | struct perf_event *event; |
4839 | struct hw_perf_event *hwc; | 5256 | struct hw_perf_event *hwc; |
4840 | long err; | 5257 | long err; |
4841 | 5258 | ||
4842 | event = kzalloc(sizeof(*event), gfpflags); | 5259 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
4843 | if (!event) | 5260 | if (!event) |
4844 | return ERR_PTR(-ENOMEM); | 5261 | return ERR_PTR(-ENOMEM); |
4845 | 5262 | ||
@@ -4857,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4857 | INIT_LIST_HEAD(&event->event_entry); | 5274 | INIT_LIST_HEAD(&event->event_entry); |
4858 | INIT_LIST_HEAD(&event->sibling_list); | 5275 | INIT_LIST_HEAD(&event->sibling_list); |
4859 | init_waitqueue_head(&event->waitq); | 5276 | init_waitqueue_head(&event->waitq); |
5277 | init_irq_work(&event->pending, perf_pending_event); | ||
4860 | 5278 | ||
4861 | mutex_init(&event->mmap_mutex); | 5279 | mutex_init(&event->mmap_mutex); |
4862 | 5280 | ||
@@ -4864,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4864 | event->attr = *attr; | 5282 | event->attr = *attr; |
4865 | event->group_leader = group_leader; | 5283 | event->group_leader = group_leader; |
4866 | event->pmu = NULL; | 5284 | event->pmu = NULL; |
4867 | event->ctx = ctx; | ||
4868 | event->oncpu = -1; | 5285 | event->oncpu = -1; |
4869 | 5286 | ||
4870 | event->parent = parent_event; | 5287 | event->parent = parent_event; |
@@ -4874,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4874 | 5291 | ||
4875 | event->state = PERF_EVENT_STATE_INACTIVE; | 5292 | event->state = PERF_EVENT_STATE_INACTIVE; |
4876 | 5293 | ||
5294 | if (task) { | ||
5295 | event->attach_state = PERF_ATTACH_TASK; | ||
5296 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
5297 | /* | ||
5298 | * hw_breakpoint is a bit difficult here.. | ||
5299 | */ | ||
5300 | if (attr->type == PERF_TYPE_BREAKPOINT) | ||
5301 | event->hw.bp_target = task; | ||
5302 | #endif | ||
5303 | } | ||
5304 | |||
4877 | if (!overflow_handler && parent_event) | 5305 | if (!overflow_handler && parent_event) |
4878 | overflow_handler = parent_event->overflow_handler; | 5306 | overflow_handler = parent_event->overflow_handler; |
4879 | 5307 | ||
@@ -4898,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4898 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 5326 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
4899 | goto done; | 5327 | goto done; |
4900 | 5328 | ||
4901 | switch (attr->type) { | 5329 | pmu = perf_init_event(event); |
4902 | case PERF_TYPE_RAW: | ||
4903 | case PERF_TYPE_HARDWARE: | ||
4904 | case PERF_TYPE_HW_CACHE: | ||
4905 | pmu = hw_perf_event_init(event); | ||
4906 | break; | ||
4907 | |||
4908 | case PERF_TYPE_SOFTWARE: | ||
4909 | pmu = sw_perf_event_init(event); | ||
4910 | break; | ||
4911 | |||
4912 | case PERF_TYPE_TRACEPOINT: | ||
4913 | pmu = tp_perf_event_init(event); | ||
4914 | break; | ||
4915 | 5330 | ||
4916 | case PERF_TYPE_BREAKPOINT: | ||
4917 | pmu = bp_perf_event_init(event); | ||
4918 | break; | ||
4919 | |||
4920 | |||
4921 | default: | ||
4922 | break; | ||
4923 | } | ||
4924 | done: | 5331 | done: |
4925 | err = 0; | 5332 | err = 0; |
4926 | if (!pmu) | 5333 | if (!pmu) |
@@ -4938,13 +5345,21 @@ done: | |||
4938 | event->pmu = pmu; | 5345 | event->pmu = pmu; |
4939 | 5346 | ||
4940 | if (!event->parent) { | 5347 | if (!event->parent) { |
4941 | atomic_inc(&nr_events); | 5348 | if (event->attach_state & PERF_ATTACH_TASK) |
5349 | jump_label_inc(&perf_task_events); | ||
4942 | if (event->attr.mmap || event->attr.mmap_data) | 5350 | if (event->attr.mmap || event->attr.mmap_data) |
4943 | atomic_inc(&nr_mmap_events); | 5351 | atomic_inc(&nr_mmap_events); |
4944 | if (event->attr.comm) | 5352 | if (event->attr.comm) |
4945 | atomic_inc(&nr_comm_events); | 5353 | atomic_inc(&nr_comm_events); |
4946 | if (event->attr.task) | 5354 | if (event->attr.task) |
4947 | atomic_inc(&nr_task_events); | 5355 | atomic_inc(&nr_task_events); |
5356 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | ||
5357 | err = get_callchain_buffers(); | ||
5358 | if (err) { | ||
5359 | free_event(event); | ||
5360 | return ERR_PTR(err); | ||
5361 | } | ||
5362 | } | ||
4948 | } | 5363 | } |
4949 | 5364 | ||
4950 | return event; | 5365 | return event; |
@@ -5092,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5092 | struct perf_event_attr __user *, attr_uptr, | 5507 | struct perf_event_attr __user *, attr_uptr, |
5093 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | 5508 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
5094 | { | 5509 | { |
5095 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; | 5510 | struct perf_event *group_leader = NULL, *output_event = NULL; |
5511 | struct perf_event *event, *sibling; | ||
5096 | struct perf_event_attr attr; | 5512 | struct perf_event_attr attr; |
5097 | struct perf_event_context *ctx; | 5513 | struct perf_event_context *ctx; |
5098 | struct file *event_file = NULL; | 5514 | struct file *event_file = NULL; |
5099 | struct file *group_file = NULL; | 5515 | struct file *group_file = NULL; |
5516 | struct task_struct *task = NULL; | ||
5517 | struct pmu *pmu; | ||
5100 | int event_fd; | 5518 | int event_fd; |
5519 | int move_group = 0; | ||
5101 | int fput_needed = 0; | 5520 | int fput_needed = 0; |
5102 | int err; | 5521 | int err; |
5103 | 5522 | ||
@@ -5123,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5123 | if (event_fd < 0) | 5542 | if (event_fd < 0) |
5124 | return event_fd; | 5543 | return event_fd; |
5125 | 5544 | ||
5126 | /* | ||
5127 | * Get the target context (task or percpu): | ||
5128 | */ | ||
5129 | ctx = find_get_context(pid, cpu); | ||
5130 | if (IS_ERR(ctx)) { | ||
5131 | err = PTR_ERR(ctx); | ||
5132 | goto err_fd; | ||
5133 | } | ||
5134 | |||
5135 | if (group_fd != -1) { | 5545 | if (group_fd != -1) { |
5136 | group_leader = perf_fget_light(group_fd, &fput_needed); | 5546 | group_leader = perf_fget_light(group_fd, &fput_needed); |
5137 | if (IS_ERR(group_leader)) { | 5547 | if (IS_ERR(group_leader)) { |
5138 | err = PTR_ERR(group_leader); | 5548 | err = PTR_ERR(group_leader); |
5139 | goto err_put_context; | 5549 | goto err_fd; |
5140 | } | 5550 | } |
5141 | group_file = group_leader->filp; | 5551 | group_file = group_leader->filp; |
5142 | if (flags & PERF_FLAG_FD_OUTPUT) | 5552 | if (flags & PERF_FLAG_FD_OUTPUT) |
@@ -5145,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5145 | group_leader = NULL; | 5555 | group_leader = NULL; |
5146 | } | 5556 | } |
5147 | 5557 | ||
5558 | if (pid != -1) { | ||
5559 | task = find_lively_task_by_vpid(pid); | ||
5560 | if (IS_ERR(task)) { | ||
5561 | err = PTR_ERR(task); | ||
5562 | goto err_group_fd; | ||
5563 | } | ||
5564 | } | ||
5565 | |||
5566 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | ||
5567 | if (IS_ERR(event)) { | ||
5568 | err = PTR_ERR(event); | ||
5569 | goto err_task; | ||
5570 | } | ||
5571 | |||
5572 | /* | ||
5573 | * Special case software events and allow them to be part of | ||
5574 | * any hardware group. | ||
5575 | */ | ||
5576 | pmu = event->pmu; | ||
5577 | |||
5578 | if (group_leader && | ||
5579 | (is_software_event(event) != is_software_event(group_leader))) { | ||
5580 | if (is_software_event(event)) { | ||
5581 | /* | ||
5582 | * If event and group_leader are not both a software | ||
5583 | * event, and event is, then group leader is not. | ||
5584 | * | ||
5585 | * Allow the addition of software events to !software | ||
5586 | * groups, this is safe because software events never | ||
5587 | * fail to schedule. | ||
5588 | */ | ||
5589 | pmu = group_leader->pmu; | ||
5590 | } else if (is_software_event(group_leader) && | ||
5591 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | ||
5592 | /* | ||
5593 | * In case the group is a pure software group, and we | ||
5594 | * try to add a hardware event, move the whole group to | ||
5595 | * the hardware context. | ||
5596 | */ | ||
5597 | move_group = 1; | ||
5598 | } | ||
5599 | } | ||
5600 | |||
5601 | /* | ||
5602 | * Get the target context (task or percpu): | ||
5603 | */ | ||
5604 | ctx = find_get_context(pmu, task, cpu); | ||
5605 | if (IS_ERR(ctx)) { | ||
5606 | err = PTR_ERR(ctx); | ||
5607 | goto err_alloc; | ||
5608 | } | ||
5609 | |||
5148 | /* | 5610 | /* |
5149 | * Look up the group leader (we will attach this event to it): | 5611 | * Look up the group leader (we will attach this event to it): |
5150 | */ | 5612 | */ |
@@ -5156,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5156 | * becoming part of another group-sibling): | 5618 | * becoming part of another group-sibling): |
5157 | */ | 5619 | */ |
5158 | if (group_leader->group_leader != group_leader) | 5620 | if (group_leader->group_leader != group_leader) |
5159 | goto err_put_context; | 5621 | goto err_context; |
5160 | /* | 5622 | /* |
5161 | * Do not allow to attach to a group in a different | 5623 | * Do not allow to attach to a group in a different |
5162 | * task or CPU context: | 5624 | * task or CPU context: |
5163 | */ | 5625 | */ |
5164 | if (group_leader->ctx != ctx) | 5626 | if (move_group) { |
5165 | goto err_put_context; | 5627 | if (group_leader->ctx->type != ctx->type) |
5628 | goto err_context; | ||
5629 | } else { | ||
5630 | if (group_leader->ctx != ctx) | ||
5631 | goto err_context; | ||
5632 | } | ||
5633 | |||
5166 | /* | 5634 | /* |
5167 | * Only a group leader can be exclusive or pinned | 5635 | * Only a group leader can be exclusive or pinned |
5168 | */ | 5636 | */ |
5169 | if (attr.exclusive || attr.pinned) | 5637 | if (attr.exclusive || attr.pinned) |
5170 | goto err_put_context; | 5638 | goto err_context; |
5171 | } | ||
5172 | |||
5173 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, | ||
5174 | NULL, NULL, GFP_KERNEL); | ||
5175 | if (IS_ERR(event)) { | ||
5176 | err = PTR_ERR(event); | ||
5177 | goto err_put_context; | ||
5178 | } | 5639 | } |
5179 | 5640 | ||
5180 | if (output_event) { | 5641 | if (output_event) { |
5181 | err = perf_event_set_output(event, output_event); | 5642 | err = perf_event_set_output(event, output_event); |
5182 | if (err) | 5643 | if (err) |
5183 | goto err_free_put_context; | 5644 | goto err_context; |
5184 | } | 5645 | } |
5185 | 5646 | ||
5186 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 5647 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
5187 | if (IS_ERR(event_file)) { | 5648 | if (IS_ERR(event_file)) { |
5188 | err = PTR_ERR(event_file); | 5649 | err = PTR_ERR(event_file); |
5189 | goto err_free_put_context; | 5650 | goto err_context; |
5651 | } | ||
5652 | |||
5653 | if (move_group) { | ||
5654 | struct perf_event_context *gctx = group_leader->ctx; | ||
5655 | |||
5656 | mutex_lock(&gctx->mutex); | ||
5657 | perf_event_remove_from_context(group_leader); | ||
5658 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5659 | group_entry) { | ||
5660 | perf_event_remove_from_context(sibling); | ||
5661 | put_ctx(gctx); | ||
5662 | } | ||
5663 | mutex_unlock(&gctx->mutex); | ||
5664 | put_ctx(gctx); | ||
5190 | } | 5665 | } |
5191 | 5666 | ||
5192 | event->filp = event_file; | 5667 | event->filp = event_file; |
5193 | WARN_ON_ONCE(ctx->parent_ctx); | 5668 | WARN_ON_ONCE(ctx->parent_ctx); |
5194 | mutex_lock(&ctx->mutex); | 5669 | mutex_lock(&ctx->mutex); |
5670 | |||
5671 | if (move_group) { | ||
5672 | perf_install_in_context(ctx, group_leader, cpu); | ||
5673 | get_ctx(ctx); | ||
5674 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5675 | group_entry) { | ||
5676 | perf_install_in_context(ctx, sibling, cpu); | ||
5677 | get_ctx(ctx); | ||
5678 | } | ||
5679 | } | ||
5680 | |||
5195 | perf_install_in_context(ctx, event, cpu); | 5681 | perf_install_in_context(ctx, event, cpu); |
5196 | ++ctx->generation; | 5682 | ++ctx->generation; |
5197 | mutex_unlock(&ctx->mutex); | 5683 | mutex_unlock(&ctx->mutex); |
@@ -5212,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5212 | fd_install(event_fd, event_file); | 5698 | fd_install(event_fd, event_file); |
5213 | return event_fd; | 5699 | return event_fd; |
5214 | 5700 | ||
5215 | err_free_put_context: | 5701 | err_context: |
5702 | put_ctx(ctx); | ||
5703 | err_alloc: | ||
5216 | free_event(event); | 5704 | free_event(event); |
5217 | err_put_context: | 5705 | err_task: |
5706 | if (task) | ||
5707 | put_task_struct(task); | ||
5708 | err_group_fd: | ||
5218 | fput_light(group_file, fput_needed); | 5709 | fput_light(group_file, fput_needed); |
5219 | put_ctx(ctx); | ||
5220 | err_fd: | 5710 | err_fd: |
5221 | put_unused_fd(event_fd); | 5711 | put_unused_fd(event_fd); |
5222 | return err; | 5712 | return err; |
@@ -5227,32 +5717,31 @@ err_fd: | |||
5227 | * | 5717 | * |
5228 | * @attr: attributes of the counter to create | 5718 | * @attr: attributes of the counter to create |
5229 | * @cpu: cpu in which the counter is bound | 5719 | * @cpu: cpu in which the counter is bound |
5230 | * @pid: task to profile | 5720 | * @task: task to profile (NULL for percpu) |
5231 | */ | 5721 | */ |
5232 | struct perf_event * | 5722 | struct perf_event * |
5233 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 5723 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
5234 | pid_t pid, | 5724 | struct task_struct *task, |
5235 | perf_overflow_handler_t overflow_handler) | 5725 | perf_overflow_handler_t overflow_handler) |
5236 | { | 5726 | { |
5237 | struct perf_event *event; | ||
5238 | struct perf_event_context *ctx; | 5727 | struct perf_event_context *ctx; |
5728 | struct perf_event *event; | ||
5239 | int err; | 5729 | int err; |
5240 | 5730 | ||
5241 | /* | 5731 | /* |
5242 | * Get the target context (task or percpu): | 5732 | * Get the target context (task or percpu): |
5243 | */ | 5733 | */ |
5244 | 5734 | ||
5245 | ctx = find_get_context(pid, cpu); | 5735 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); |
5246 | if (IS_ERR(ctx)) { | ||
5247 | err = PTR_ERR(ctx); | ||
5248 | goto err_exit; | ||
5249 | } | ||
5250 | |||
5251 | event = perf_event_alloc(attr, cpu, ctx, NULL, | ||
5252 | NULL, overflow_handler, GFP_KERNEL); | ||
5253 | if (IS_ERR(event)) { | 5736 | if (IS_ERR(event)) { |
5254 | err = PTR_ERR(event); | 5737 | err = PTR_ERR(event); |
5255 | goto err_put_context; | 5738 | goto err; |
5739 | } | ||
5740 | |||
5741 | ctx = find_get_context(event->pmu, task, cpu); | ||
5742 | if (IS_ERR(ctx)) { | ||
5743 | err = PTR_ERR(ctx); | ||
5744 | goto err_free; | ||
5256 | } | 5745 | } |
5257 | 5746 | ||
5258 | event->filp = NULL; | 5747 | event->filp = NULL; |
@@ -5270,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
5270 | 5759 | ||
5271 | return event; | 5760 | return event; |
5272 | 5761 | ||
5273 | err_put_context: | 5762 | err_free: |
5274 | put_ctx(ctx); | 5763 | free_event(event); |
5275 | err_exit: | 5764 | err: |
5276 | return ERR_PTR(err); | 5765 | return ERR_PTR(err); |
5277 | } | 5766 | } |
5278 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 5767 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
5279 | 5768 | ||
5280 | /* | ||
5281 | * inherit a event from parent task to child task: | ||
5282 | */ | ||
5283 | static struct perf_event * | ||
5284 | inherit_event(struct perf_event *parent_event, | ||
5285 | struct task_struct *parent, | ||
5286 | struct perf_event_context *parent_ctx, | ||
5287 | struct task_struct *child, | ||
5288 | struct perf_event *group_leader, | ||
5289 | struct perf_event_context *child_ctx) | ||
5290 | { | ||
5291 | struct perf_event *child_event; | ||
5292 | |||
5293 | /* | ||
5294 | * Instead of creating recursive hierarchies of events, | ||
5295 | * we link inherited events back to the original parent, | ||
5296 | * which has a filp for sure, which we use as the reference | ||
5297 | * count: | ||
5298 | */ | ||
5299 | if (parent_event->parent) | ||
5300 | parent_event = parent_event->parent; | ||
5301 | |||
5302 | child_event = perf_event_alloc(&parent_event->attr, | ||
5303 | parent_event->cpu, child_ctx, | ||
5304 | group_leader, parent_event, | ||
5305 | NULL, GFP_KERNEL); | ||
5306 | if (IS_ERR(child_event)) | ||
5307 | return child_event; | ||
5308 | get_ctx(child_ctx); | ||
5309 | |||
5310 | /* | ||
5311 | * Make the child state follow the state of the parent event, | ||
5312 | * not its attr.disabled bit. We hold the parent's mutex, | ||
5313 | * so we won't race with perf_event_{en, dis}able_family. | ||
5314 | */ | ||
5315 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
5316 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
5317 | else | ||
5318 | child_event->state = PERF_EVENT_STATE_OFF; | ||
5319 | |||
5320 | if (parent_event->attr.freq) { | ||
5321 | u64 sample_period = parent_event->hw.sample_period; | ||
5322 | struct hw_perf_event *hwc = &child_event->hw; | ||
5323 | |||
5324 | hwc->sample_period = sample_period; | ||
5325 | hwc->last_period = sample_period; | ||
5326 | |||
5327 | local64_set(&hwc->period_left, sample_period); | ||
5328 | } | ||
5329 | |||
5330 | child_event->overflow_handler = parent_event->overflow_handler; | ||
5331 | |||
5332 | /* | ||
5333 | * Link it up in the child's context: | ||
5334 | */ | ||
5335 | add_event_to_ctx(child_event, child_ctx); | ||
5336 | |||
5337 | /* | ||
5338 | * Get a reference to the parent filp - we will fput it | ||
5339 | * when the child event exits. This is safe to do because | ||
5340 | * we are in the parent and we know that the filp still | ||
5341 | * exists and has a nonzero count: | ||
5342 | */ | ||
5343 | atomic_long_inc(&parent_event->filp->f_count); | ||
5344 | |||
5345 | /* | ||
5346 | * Link this into the parent event's child list | ||
5347 | */ | ||
5348 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
5349 | mutex_lock(&parent_event->child_mutex); | ||
5350 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
5351 | mutex_unlock(&parent_event->child_mutex); | ||
5352 | |||
5353 | return child_event; | ||
5354 | } | ||
5355 | |||
5356 | static int inherit_group(struct perf_event *parent_event, | ||
5357 | struct task_struct *parent, | ||
5358 | struct perf_event_context *parent_ctx, | ||
5359 | struct task_struct *child, | ||
5360 | struct perf_event_context *child_ctx) | ||
5361 | { | ||
5362 | struct perf_event *leader; | ||
5363 | struct perf_event *sub; | ||
5364 | struct perf_event *child_ctr; | ||
5365 | |||
5366 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
5367 | child, NULL, child_ctx); | ||
5368 | if (IS_ERR(leader)) | ||
5369 | return PTR_ERR(leader); | ||
5370 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
5371 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
5372 | child, leader, child_ctx); | ||
5373 | if (IS_ERR(child_ctr)) | ||
5374 | return PTR_ERR(child_ctr); | ||
5375 | } | ||
5376 | return 0; | ||
5377 | } | ||
5378 | |||
5379 | static void sync_child_event(struct perf_event *child_event, | 5769 | static void sync_child_event(struct perf_event *child_event, |
5380 | struct task_struct *child) | 5770 | struct task_struct *child) |
5381 | { | 5771 | { |
@@ -5432,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
5432 | } | 5822 | } |
5433 | } | 5823 | } |
5434 | 5824 | ||
5435 | /* | 5825 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
5436 | * When a child task exits, feed back event values to parent events. | ||
5437 | */ | ||
5438 | void perf_event_exit_task(struct task_struct *child) | ||
5439 | { | 5826 | { |
5440 | struct perf_event *child_event, *tmp; | 5827 | struct perf_event *child_event, *tmp; |
5441 | struct perf_event_context *child_ctx; | 5828 | struct perf_event_context *child_ctx; |
5442 | unsigned long flags; | 5829 | unsigned long flags; |
5443 | 5830 | ||
5444 | if (likely(!child->perf_event_ctxp)) { | 5831 | if (likely(!child->perf_event_ctxp[ctxn])) { |
5445 | perf_event_task(child, NULL, 0); | 5832 | perf_event_task(child, NULL, 0); |
5446 | return; | 5833 | return; |
5447 | } | 5834 | } |
@@ -5453,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child) | |||
5453 | * scheduled, so we are now safe from rescheduling changing | 5840 | * scheduled, so we are now safe from rescheduling changing |
5454 | * our context. | 5841 | * our context. |
5455 | */ | 5842 | */ |
5456 | child_ctx = child->perf_event_ctxp; | 5843 | child_ctx = child->perf_event_ctxp[ctxn]; |
5457 | __perf_event_task_sched_out(child_ctx); | 5844 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
5458 | 5845 | ||
5459 | /* | 5846 | /* |
5460 | * Take the context lock here so that if find_get_context is | 5847 | * Take the context lock here so that if find_get_context is |
@@ -5462,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child) | |||
5462 | * incremented the context's refcount before we do put_ctx below. | 5849 | * incremented the context's refcount before we do put_ctx below. |
5463 | */ | 5850 | */ |
5464 | raw_spin_lock(&child_ctx->lock); | 5851 | raw_spin_lock(&child_ctx->lock); |
5465 | child->perf_event_ctxp = NULL; | 5852 | child->perf_event_ctxp[ctxn] = NULL; |
5466 | /* | 5853 | /* |
5467 | * If this context is a clone; unclone it so it can't get | 5854 | * If this context is a clone; unclone it so it can't get |
5468 | * swapped to another process while we're removing all | 5855 | * swapped to another process while we're removing all |
@@ -5515,6 +5902,17 @@ again: | |||
5515 | put_ctx(child_ctx); | 5902 | put_ctx(child_ctx); |
5516 | } | 5903 | } |
5517 | 5904 | ||
5905 | /* | ||
5906 | * When a child task exits, feed back event values to parent events. | ||
5907 | */ | ||
5908 | void perf_event_exit_task(struct task_struct *child) | ||
5909 | { | ||
5910 | int ctxn; | ||
5911 | |||
5912 | for_each_task_context_nr(ctxn) | ||
5913 | perf_event_exit_task_context(child, ctxn); | ||
5914 | } | ||
5915 | |||
5518 | static void perf_free_event(struct perf_event *event, | 5916 | static void perf_free_event(struct perf_event *event, |
5519 | struct perf_event_context *ctx) | 5917 | struct perf_event_context *ctx) |
5520 | { | 5918 | { |
@@ -5536,48 +5934,166 @@ static void perf_free_event(struct perf_event *event, | |||
5536 | 5934 | ||
5537 | /* | 5935 | /* |
5538 | * free an unexposed, unused context as created by inheritance by | 5936 | * free an unexposed, unused context as created by inheritance by |
5539 | * init_task below, used by fork() in case of fail. | 5937 | * perf_event_init_task below, used by fork() in case of fail. |
5540 | */ | 5938 | */ |
5541 | void perf_event_free_task(struct task_struct *task) | 5939 | void perf_event_free_task(struct task_struct *task) |
5542 | { | 5940 | { |
5543 | struct perf_event_context *ctx = task->perf_event_ctxp; | 5941 | struct perf_event_context *ctx; |
5544 | struct perf_event *event, *tmp; | 5942 | struct perf_event *event, *tmp; |
5943 | int ctxn; | ||
5545 | 5944 | ||
5546 | if (!ctx) | 5945 | for_each_task_context_nr(ctxn) { |
5547 | return; | 5946 | ctx = task->perf_event_ctxp[ctxn]; |
5947 | if (!ctx) | ||
5948 | continue; | ||
5548 | 5949 | ||
5549 | mutex_lock(&ctx->mutex); | 5950 | mutex_lock(&ctx->mutex); |
5550 | again: | 5951 | again: |
5551 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 5952 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, |
5552 | perf_free_event(event, ctx); | 5953 | group_entry) |
5954 | perf_free_event(event, ctx); | ||
5553 | 5955 | ||
5554 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, | 5956 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
5555 | group_entry) | 5957 | group_entry) |
5556 | perf_free_event(event, ctx); | 5958 | perf_free_event(event, ctx); |
5557 | 5959 | ||
5558 | if (!list_empty(&ctx->pinned_groups) || | 5960 | if (!list_empty(&ctx->pinned_groups) || |
5559 | !list_empty(&ctx->flexible_groups)) | 5961 | !list_empty(&ctx->flexible_groups)) |
5560 | goto again; | 5962 | goto again; |
5561 | 5963 | ||
5562 | mutex_unlock(&ctx->mutex); | 5964 | mutex_unlock(&ctx->mutex); |
5563 | 5965 | ||
5564 | put_ctx(ctx); | 5966 | put_ctx(ctx); |
5967 | } | ||
5968 | } | ||
5969 | |||
5970 | void perf_event_delayed_put(struct task_struct *task) | ||
5971 | { | ||
5972 | int ctxn; | ||
5973 | |||
5974 | for_each_task_context_nr(ctxn) | ||
5975 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | ||
5976 | } | ||
5977 | |||
5978 | /* | ||
5979 | * inherit a event from parent task to child task: | ||
5980 | */ | ||
5981 | static struct perf_event * | ||
5982 | inherit_event(struct perf_event *parent_event, | ||
5983 | struct task_struct *parent, | ||
5984 | struct perf_event_context *parent_ctx, | ||
5985 | struct task_struct *child, | ||
5986 | struct perf_event *group_leader, | ||
5987 | struct perf_event_context *child_ctx) | ||
5988 | { | ||
5989 | struct perf_event *child_event; | ||
5990 | unsigned long flags; | ||
5991 | |||
5992 | /* | ||
5993 | * Instead of creating recursive hierarchies of events, | ||
5994 | * we link inherited events back to the original parent, | ||
5995 | * which has a filp for sure, which we use as the reference | ||
5996 | * count: | ||
5997 | */ | ||
5998 | if (parent_event->parent) | ||
5999 | parent_event = parent_event->parent; | ||
6000 | |||
6001 | child_event = perf_event_alloc(&parent_event->attr, | ||
6002 | parent_event->cpu, | ||
6003 | child, | ||
6004 | group_leader, parent_event, | ||
6005 | NULL); | ||
6006 | if (IS_ERR(child_event)) | ||
6007 | return child_event; | ||
6008 | get_ctx(child_ctx); | ||
6009 | |||
6010 | /* | ||
6011 | * Make the child state follow the state of the parent event, | ||
6012 | * not its attr.disabled bit. We hold the parent's mutex, | ||
6013 | * so we won't race with perf_event_{en, dis}able_family. | ||
6014 | */ | ||
6015 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
6016 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
6017 | else | ||
6018 | child_event->state = PERF_EVENT_STATE_OFF; | ||
6019 | |||
6020 | if (parent_event->attr.freq) { | ||
6021 | u64 sample_period = parent_event->hw.sample_period; | ||
6022 | struct hw_perf_event *hwc = &child_event->hw; | ||
6023 | |||
6024 | hwc->sample_period = sample_period; | ||
6025 | hwc->last_period = sample_period; | ||
6026 | |||
6027 | local64_set(&hwc->period_left, sample_period); | ||
6028 | } | ||
6029 | |||
6030 | child_event->ctx = child_ctx; | ||
6031 | child_event->overflow_handler = parent_event->overflow_handler; | ||
6032 | |||
6033 | /* | ||
6034 | * Link it up in the child's context: | ||
6035 | */ | ||
6036 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | ||
6037 | add_event_to_ctx(child_event, child_ctx); | ||
6038 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | ||
6039 | |||
6040 | /* | ||
6041 | * Get a reference to the parent filp - we will fput it | ||
6042 | * when the child event exits. This is safe to do because | ||
6043 | * we are in the parent and we know that the filp still | ||
6044 | * exists and has a nonzero count: | ||
6045 | */ | ||
6046 | atomic_long_inc(&parent_event->filp->f_count); | ||
6047 | |||
6048 | /* | ||
6049 | * Link this into the parent event's child list | ||
6050 | */ | ||
6051 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
6052 | mutex_lock(&parent_event->child_mutex); | ||
6053 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
6054 | mutex_unlock(&parent_event->child_mutex); | ||
6055 | |||
6056 | return child_event; | ||
6057 | } | ||
6058 | |||
6059 | static int inherit_group(struct perf_event *parent_event, | ||
6060 | struct task_struct *parent, | ||
6061 | struct perf_event_context *parent_ctx, | ||
6062 | struct task_struct *child, | ||
6063 | struct perf_event_context *child_ctx) | ||
6064 | { | ||
6065 | struct perf_event *leader; | ||
6066 | struct perf_event *sub; | ||
6067 | struct perf_event *child_ctr; | ||
6068 | |||
6069 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
6070 | child, NULL, child_ctx); | ||
6071 | if (IS_ERR(leader)) | ||
6072 | return PTR_ERR(leader); | ||
6073 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
6074 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
6075 | child, leader, child_ctx); | ||
6076 | if (IS_ERR(child_ctr)) | ||
6077 | return PTR_ERR(child_ctr); | ||
6078 | } | ||
6079 | return 0; | ||
5565 | } | 6080 | } |
5566 | 6081 | ||
5567 | static int | 6082 | static int |
5568 | inherit_task_group(struct perf_event *event, struct task_struct *parent, | 6083 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
5569 | struct perf_event_context *parent_ctx, | 6084 | struct perf_event_context *parent_ctx, |
5570 | struct task_struct *child, | 6085 | struct task_struct *child, int ctxn, |
5571 | int *inherited_all) | 6086 | int *inherited_all) |
5572 | { | 6087 | { |
5573 | int ret; | 6088 | int ret; |
5574 | struct perf_event_context *child_ctx = child->perf_event_ctxp; | 6089 | struct perf_event_context *child_ctx; |
5575 | 6090 | ||
5576 | if (!event->attr.inherit) { | 6091 | if (!event->attr.inherit) { |
5577 | *inherited_all = 0; | 6092 | *inherited_all = 0; |
5578 | return 0; | 6093 | return 0; |
5579 | } | 6094 | } |
5580 | 6095 | ||
6096 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
5581 | if (!child_ctx) { | 6097 | if (!child_ctx) { |
5582 | /* | 6098 | /* |
5583 | * This is executed from the parent task context, so | 6099 | * This is executed from the parent task context, so |
@@ -5586,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5586 | * child. | 6102 | * child. |
5587 | */ | 6103 | */ |
5588 | 6104 | ||
5589 | child_ctx = kzalloc(sizeof(struct perf_event_context), | 6105 | child_ctx = alloc_perf_context(event->pmu, child); |
5590 | GFP_KERNEL); | ||
5591 | if (!child_ctx) | 6106 | if (!child_ctx) |
5592 | return -ENOMEM; | 6107 | return -ENOMEM; |
5593 | 6108 | ||
5594 | __perf_event_init_context(child_ctx, child); | 6109 | child->perf_event_ctxp[ctxn] = child_ctx; |
5595 | child->perf_event_ctxp = child_ctx; | ||
5596 | get_task_struct(child); | ||
5597 | } | 6110 | } |
5598 | 6111 | ||
5599 | ret = inherit_group(event, parent, parent_ctx, | 6112 | ret = inherit_group(event, parent, parent_ctx, |
@@ -5605,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5605 | return ret; | 6118 | return ret; |
5606 | } | 6119 | } |
5607 | 6120 | ||
5608 | |||
5609 | /* | 6121 | /* |
5610 | * Initialize the perf_event context in task_struct | 6122 | * Initialize the perf_event context in task_struct |
5611 | */ | 6123 | */ |
5612 | int perf_event_init_task(struct task_struct *child) | 6124 | int perf_event_init_context(struct task_struct *child, int ctxn) |
5613 | { | 6125 | { |
5614 | struct perf_event_context *child_ctx, *parent_ctx; | 6126 | struct perf_event_context *child_ctx, *parent_ctx; |
5615 | struct perf_event_context *cloned_ctx; | 6127 | struct perf_event_context *cloned_ctx; |
@@ -5618,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child) | |||
5618 | int inherited_all = 1; | 6130 | int inherited_all = 1; |
5619 | int ret = 0; | 6131 | int ret = 0; |
5620 | 6132 | ||
5621 | child->perf_event_ctxp = NULL; | 6133 | child->perf_event_ctxp[ctxn] = NULL; |
5622 | 6134 | ||
5623 | mutex_init(&child->perf_event_mutex); | 6135 | mutex_init(&child->perf_event_mutex); |
5624 | INIT_LIST_HEAD(&child->perf_event_list); | 6136 | INIT_LIST_HEAD(&child->perf_event_list); |
5625 | 6137 | ||
5626 | if (likely(!parent->perf_event_ctxp)) | 6138 | if (likely(!parent->perf_event_ctxp[ctxn])) |
5627 | return 0; | 6139 | return 0; |
5628 | 6140 | ||
5629 | /* | 6141 | /* |
5630 | * If the parent's context is a clone, pin it so it won't get | 6142 | * If the parent's context is a clone, pin it so it won't get |
5631 | * swapped under us. | 6143 | * swapped under us. |
5632 | */ | 6144 | */ |
5633 | parent_ctx = perf_pin_task_context(parent); | 6145 | parent_ctx = perf_pin_task_context(parent, ctxn); |
5634 | 6146 | ||
5635 | /* | 6147 | /* |
5636 | * No need to check if parent_ctx != NULL here; since we saw | 6148 | * No need to check if parent_ctx != NULL here; since we saw |
@@ -5650,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child) | |||
5650 | * the list, not manipulating it: | 6162 | * the list, not manipulating it: |
5651 | */ | 6163 | */ |
5652 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 6164 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
5653 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6165 | ret = inherit_task_group(event, parent, parent_ctx, |
5654 | &inherited_all); | 6166 | child, ctxn, &inherited_all); |
5655 | if (ret) | 6167 | if (ret) |
5656 | break; | 6168 | break; |
5657 | } | 6169 | } |
5658 | 6170 | ||
5659 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6171 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
5660 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6172 | ret = inherit_task_group(event, parent, parent_ctx, |
5661 | &inherited_all); | 6173 | child, ctxn, &inherited_all); |
5662 | if (ret) | 6174 | if (ret) |
5663 | break; | 6175 | break; |
5664 | } | 6176 | } |
5665 | 6177 | ||
5666 | child_ctx = child->perf_event_ctxp; | 6178 | child_ctx = child->perf_event_ctxp[ctxn]; |
5667 | 6179 | ||
5668 | if (child_ctx && inherited_all) { | 6180 | if (child_ctx && inherited_all) { |
5669 | /* | 6181 | /* |
@@ -5692,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child) | |||
5692 | return ret; | 6204 | return ret; |
5693 | } | 6205 | } |
5694 | 6206 | ||
6207 | /* | ||
6208 | * Initialize the perf_event context in task_struct | ||
6209 | */ | ||
6210 | int perf_event_init_task(struct task_struct *child) | ||
6211 | { | ||
6212 | int ctxn, ret; | ||
6213 | |||
6214 | for_each_task_context_nr(ctxn) { | ||
6215 | ret = perf_event_init_context(child, ctxn); | ||
6216 | if (ret) | ||
6217 | return ret; | ||
6218 | } | ||
6219 | |||
6220 | return 0; | ||
6221 | } | ||
6222 | |||
5695 | static void __init perf_event_init_all_cpus(void) | 6223 | static void __init perf_event_init_all_cpus(void) |
5696 | { | 6224 | { |
6225 | struct swevent_htable *swhash; | ||
5697 | int cpu; | 6226 | int cpu; |
5698 | struct perf_cpu_context *cpuctx; | ||
5699 | 6227 | ||
5700 | for_each_possible_cpu(cpu) { | 6228 | for_each_possible_cpu(cpu) { |
5701 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 6229 | swhash = &per_cpu(swevent_htable, cpu); |
5702 | mutex_init(&cpuctx->hlist_mutex); | 6230 | mutex_init(&swhash->hlist_mutex); |
5703 | __perf_event_init_context(&cpuctx->ctx, NULL); | 6231 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); |
5704 | } | 6232 | } |
5705 | } | 6233 | } |
5706 | 6234 | ||
5707 | static void __cpuinit perf_event_init_cpu(int cpu) | 6235 | static void __cpuinit perf_event_init_cpu(int cpu) |
5708 | { | 6236 | { |
5709 | struct perf_cpu_context *cpuctx; | 6237 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5710 | |||
5711 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5712 | 6238 | ||
5713 | spin_lock(&perf_resource_lock); | 6239 | mutex_lock(&swhash->hlist_mutex); |
5714 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; | 6240 | if (swhash->hlist_refcount > 0) { |
5715 | spin_unlock(&perf_resource_lock); | ||
5716 | |||
5717 | mutex_lock(&cpuctx->hlist_mutex); | ||
5718 | if (cpuctx->hlist_refcount > 0) { | ||
5719 | struct swevent_hlist *hlist; | 6241 | struct swevent_hlist *hlist; |
5720 | 6242 | ||
5721 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 6243 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); |
5722 | WARN_ON_ONCE(!hlist); | 6244 | WARN_ON(!hlist); |
5723 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 6245 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
5724 | } | 6246 | } |
5725 | mutex_unlock(&cpuctx->hlist_mutex); | 6247 | mutex_unlock(&swhash->hlist_mutex); |
5726 | } | 6248 | } |
5727 | 6249 | ||
5728 | #ifdef CONFIG_HOTPLUG_CPU | 6250 | #ifdef CONFIG_HOTPLUG_CPU |
5729 | static void __perf_event_exit_cpu(void *info) | 6251 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
5730 | { | 6252 | { |
5731 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 6253 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
5732 | struct perf_event_context *ctx = &cpuctx->ctx; | 6254 | |
6255 | WARN_ON(!irqs_disabled()); | ||
6256 | |||
6257 | list_del_init(&cpuctx->rotation_list); | ||
6258 | } | ||
6259 | |||
6260 | static void __perf_event_exit_context(void *__info) | ||
6261 | { | ||
6262 | struct perf_event_context *ctx = __info; | ||
5733 | struct perf_event *event, *tmp; | 6263 | struct perf_event *event, *tmp; |
5734 | 6264 | ||
6265 | perf_pmu_rotate_stop(ctx->pmu); | ||
6266 | |||
5735 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 6267 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
5736 | __perf_event_remove_from_context(event); | 6268 | __perf_event_remove_from_context(event); |
5737 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 6269 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
5738 | __perf_event_remove_from_context(event); | 6270 | __perf_event_remove_from_context(event); |
5739 | } | 6271 | } |
6272 | |||
6273 | static void perf_event_exit_cpu_context(int cpu) | ||
6274 | { | ||
6275 | struct perf_event_context *ctx; | ||
6276 | struct pmu *pmu; | ||
6277 | int idx; | ||
6278 | |||
6279 | idx = srcu_read_lock(&pmus_srcu); | ||
6280 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
6281 | ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; | ||
6282 | |||
6283 | mutex_lock(&ctx->mutex); | ||
6284 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); | ||
6285 | mutex_unlock(&ctx->mutex); | ||
6286 | } | ||
6287 | srcu_read_unlock(&pmus_srcu, idx); | ||
6288 | } | ||
6289 | |||
5740 | static void perf_event_exit_cpu(int cpu) | 6290 | static void perf_event_exit_cpu(int cpu) |
5741 | { | 6291 | { |
5742 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 6292 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5743 | struct perf_event_context *ctx = &cpuctx->ctx; | ||
5744 | 6293 | ||
5745 | mutex_lock(&cpuctx->hlist_mutex); | 6294 | mutex_lock(&swhash->hlist_mutex); |
5746 | swevent_hlist_release(cpuctx); | 6295 | swevent_hlist_release(swhash); |
5747 | mutex_unlock(&cpuctx->hlist_mutex); | 6296 | mutex_unlock(&swhash->hlist_mutex); |
5748 | 6297 | ||
5749 | mutex_lock(&ctx->mutex); | 6298 | perf_event_exit_cpu_context(cpu); |
5750 | smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); | ||
5751 | mutex_unlock(&ctx->mutex); | ||
5752 | } | 6299 | } |
5753 | #else | 6300 | #else |
5754 | static inline void perf_event_exit_cpu(int cpu) { } | 6301 | static inline void perf_event_exit_cpu(int cpu) { } |
@@ -5778,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5778 | return NOTIFY_OK; | 6325 | return NOTIFY_OK; |
5779 | } | 6326 | } |
5780 | 6327 | ||
5781 | /* | ||
5782 | * This has to have a higher priority than migration_notifier in sched.c. | ||
5783 | */ | ||
5784 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
5785 | .notifier_call = perf_cpu_notify, | ||
5786 | .priority = 20, | ||
5787 | }; | ||
5788 | |||
5789 | void __init perf_event_init(void) | 6328 | void __init perf_event_init(void) |
5790 | { | 6329 | { |
5791 | perf_event_init_all_cpus(); | 6330 | perf_event_init_all_cpus(); |
5792 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | 6331 | init_srcu_struct(&pmus_srcu); |
5793 | (void *)(long)smp_processor_id()); | 6332 | perf_pmu_register(&perf_swevent); |
5794 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, | 6333 | perf_pmu_register(&perf_cpu_clock); |
5795 | (void *)(long)smp_processor_id()); | 6334 | perf_pmu_register(&perf_task_clock); |
5796 | register_cpu_notifier(&perf_cpu_nb); | 6335 | perf_tp_register(); |
5797 | } | 6336 | perf_cpu_notifier(perf_cpu_notify); |
5798 | |||
5799 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, | ||
5800 | struct sysdev_class_attribute *attr, | ||
5801 | char *buf) | ||
5802 | { | ||
5803 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
5804 | } | ||
5805 | |||
5806 | static ssize_t | ||
5807 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
5808 | struct sysdev_class_attribute *attr, | ||
5809 | const char *buf, | ||
5810 | size_t count) | ||
5811 | { | ||
5812 | struct perf_cpu_context *cpuctx; | ||
5813 | unsigned long val; | ||
5814 | int err, cpu, mpt; | ||
5815 | |||
5816 | err = strict_strtoul(buf, 10, &val); | ||
5817 | if (err) | ||
5818 | return err; | ||
5819 | if (val > perf_max_events) | ||
5820 | return -EINVAL; | ||
5821 | |||
5822 | spin_lock(&perf_resource_lock); | ||
5823 | perf_reserved_percpu = val; | ||
5824 | for_each_online_cpu(cpu) { | ||
5825 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5826 | raw_spin_lock_irq(&cpuctx->ctx.lock); | ||
5827 | mpt = min(perf_max_events - cpuctx->ctx.nr_events, | ||
5828 | perf_max_events - perf_reserved_percpu); | ||
5829 | cpuctx->max_pertask = mpt; | ||
5830 | raw_spin_unlock_irq(&cpuctx->ctx.lock); | ||
5831 | } | ||
5832 | spin_unlock(&perf_resource_lock); | ||
5833 | |||
5834 | return count; | ||
5835 | } | ||
5836 | |||
5837 | static ssize_t perf_show_overcommit(struct sysdev_class *class, | ||
5838 | struct sysdev_class_attribute *attr, | ||
5839 | char *buf) | ||
5840 | { | ||
5841 | return sprintf(buf, "%d\n", perf_overcommit); | ||
5842 | } | ||
5843 | |||
5844 | static ssize_t | ||
5845 | perf_set_overcommit(struct sysdev_class *class, | ||
5846 | struct sysdev_class_attribute *attr, | ||
5847 | const char *buf, size_t count) | ||
5848 | { | ||
5849 | unsigned long val; | ||
5850 | int err; | ||
5851 | |||
5852 | err = strict_strtoul(buf, 10, &val); | ||
5853 | if (err) | ||
5854 | return err; | ||
5855 | if (val > 1) | ||
5856 | return -EINVAL; | ||
5857 | |||
5858 | spin_lock(&perf_resource_lock); | ||
5859 | perf_overcommit = val; | ||
5860 | spin_unlock(&perf_resource_lock); | ||
5861 | |||
5862 | return count; | ||
5863 | } | ||
5864 | |||
5865 | static SYSDEV_CLASS_ATTR( | ||
5866 | reserve_percpu, | ||
5867 | 0644, | ||
5868 | perf_show_reserve_percpu, | ||
5869 | perf_set_reserve_percpu | ||
5870 | ); | ||
5871 | |||
5872 | static SYSDEV_CLASS_ATTR( | ||
5873 | overcommit, | ||
5874 | 0644, | ||
5875 | perf_show_overcommit, | ||
5876 | perf_set_overcommit | ||
5877 | ); | ||
5878 | |||
5879 | static struct attribute *perfclass_attrs[] = { | ||
5880 | &attr_reserve_percpu.attr, | ||
5881 | &attr_overcommit.attr, | ||
5882 | NULL | ||
5883 | }; | ||
5884 | |||
5885 | static struct attribute_group perfclass_attr_group = { | ||
5886 | .attrs = perfclass_attrs, | ||
5887 | .name = "perf_events", | ||
5888 | }; | ||
5889 | |||
5890 | static int __init perf_event_sysfs_init(void) | ||
5891 | { | ||
5892 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
5893 | &perfclass_attr_group); | ||
5894 | } | 6337 | } |
5895 | device_initcall(perf_event_sysfs_init); | ||