Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r-- | kernel/perf_event.c | 2724 |
1 file changed, 1630 insertions, 1094 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 403d1804b198..2870feee81dd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -35,20 +35,15 @@ | |||
35 | 35 | ||
36 | #include <asm/irq_regs.h> | 36 | #include <asm/irq_regs.h> |
37 | 37 | ||
38 | /* | 38 | atomic_t perf_task_events __read_mostly; |
39 | * Each CPU has a list of per CPU events: | ||
40 | */ | ||
41 | static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
42 | |||
43 | int perf_max_events __read_mostly = 1; | ||
44 | static int perf_reserved_percpu __read_mostly; | ||
45 | static int perf_overcommit __read_mostly = 1; | ||
46 | |||
47 | static atomic_t nr_events __read_mostly; | ||
48 | static atomic_t nr_mmap_events __read_mostly; | 39 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; | 40 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; | 41 | static atomic_t nr_task_events __read_mostly; |
51 | 42 | ||
43 | static LIST_HEAD(pmus); | ||
44 | static DEFINE_MUTEX(pmus_lock); | ||
45 | static struct srcu_struct pmus_srcu; | ||
46 | |||
52 | /* | 47 | /* |
53 | * perf event paranoia level: | 48 | * perf event paranoia level: |
54 | * -1 - not paranoid at all | 49 | * -1 - not paranoid at all |
@@ -67,36 +62,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; | |||
67 | 62 | ||
68 | static atomic64_t perf_event_id; | 63 | static atomic64_t perf_event_id; |
69 | 64 | ||
70 | /* | 65 | void __weak perf_event_print_debug(void) { } |
71 | * Lock for (sysadmin-configurable) event reservations: | ||
72 | */ | ||
73 | static DEFINE_SPINLOCK(perf_resource_lock); | ||
74 | 66 | ||
75 | /* | 67 | extern __weak const char *perf_pmu_name(void) |
76 | * Architecture provided APIs - weak aliases: | ||
77 | */ | ||
78 | extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) | ||
79 | { | 68 | { |
80 | return NULL; | 69 | return "pmu"; |
81 | } | 70 | } |
82 | 71 | ||
83 | void __weak hw_perf_disable(void) { barrier(); } | 72 | void perf_pmu_disable(struct pmu *pmu) |
84 | void __weak hw_perf_enable(void) { barrier(); } | 73 | { |
85 | 74 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | |
86 | void __weak perf_event_print_debug(void) { } | 75 | if (!(*count)++) |
87 | 76 | pmu->pmu_disable(pmu); | |
88 | static DEFINE_PER_CPU(int, perf_disable_count); | 77 | } |
89 | 78 | ||
90 | void perf_disable(void) | 79 | void perf_pmu_enable(struct pmu *pmu) |
91 | { | 80 | { |
92 | if (!__get_cpu_var(perf_disable_count)++) | 81 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
93 | hw_perf_disable(); | 82 | if (!--(*count)) |
83 | pmu->pmu_enable(pmu); | ||
94 | } | 84 | } |
95 | 85 | ||
96 | void perf_enable(void) | 86 | static DEFINE_PER_CPU(struct list_head, rotation_list); |
87 | |||
88 | /* | ||
89 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
90 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
91 | * disabled, while rotate_context is called from IRQ context. | ||
92 | */ | ||
93 | static void perf_pmu_rotate_start(struct pmu *pmu) | ||
97 | { | 94 | { |
98 | if (!--__get_cpu_var(perf_disable_count)) | 95 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
99 | hw_perf_enable(); | 96 | struct list_head *head = &__get_cpu_var(rotation_list); |
97 | |||
98 | WARN_ON(!irqs_disabled()); | ||
99 | |||
100 | if (list_empty(&cpuctx->rotation_list)) | ||
101 | list_add(&cpuctx->rotation_list, head); | ||
100 | } | 102 | } |
101 | 103 | ||
102 | static void get_ctx(struct perf_event_context *ctx) | 104 | static void get_ctx(struct perf_event_context *ctx) |
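The perf_pmu_disable()/perf_pmu_enable() pair added above replaces the old global hw_perf_disable()/hw_perf_enable() with a per-PMU nesting count, so only the outermost disable/enable actually invokes the PMU callbacks. A minimal userspace sketch of that nesting pattern, assuming stand-in types (the struct pmu and callbacks below are invented for illustration; the kernel keeps the count per CPU via pmu_disable_count):

#include <stdio.h>

/* Stand-in for the kernel's struct pmu; only what the sketch needs. */
struct pmu {
	int  disable_count;          /* per-CPU in the kernel; single int here */
	void (*pmu_disable)(struct pmu *);
	void (*pmu_enable)(struct pmu *);
};

/* Only the first disable and the matching last enable reach the callbacks. */
static void pmu_disable(struct pmu *pmu)
{
	if (!pmu->disable_count++)
		pmu->pmu_disable(pmu);
}

static void pmu_enable(struct pmu *pmu)
{
	if (!--pmu->disable_count)
		pmu->pmu_enable(pmu);
}

static void hw_off(struct pmu *p) { printf("hardware disabled\n"); }
static void hw_on(struct pmu *p)  { printf("hardware enabled\n");  }

int main(void)
{
	struct pmu pmu = { 0, hw_off, hw_on };

	pmu_disable(&pmu);   /* prints: hardware disabled */
	pmu_disable(&pmu);   /* nested: no output */
	pmu_enable(&pmu);    /* still nested: no output */
	pmu_enable(&pmu);    /* prints: hardware enabled */
	return 0;
}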
@@ -151,13 +153,13 @@ static u64 primary_event_id(struct perf_event *event) | |||
151 | * the context could get moved to another task. | 153 | * the context could get moved to another task. |
152 | */ | 154 | */ |
153 | static struct perf_event_context * | 155 | static struct perf_event_context * |
154 | perf_lock_task_context(struct task_struct *task, unsigned long *flags) | 156 | perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) |
155 | { | 157 | { |
156 | struct perf_event_context *ctx; | 158 | struct perf_event_context *ctx; |
157 | 159 | ||
158 | rcu_read_lock(); | 160 | rcu_read_lock(); |
159 | retry: | 161 | retry: |
160 | ctx = rcu_dereference(task->perf_event_ctxp); | 162 | ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); |
161 | if (ctx) { | 163 | if (ctx) { |
162 | /* | 164 | /* |
163 | * If this context is a clone of another, it might | 165 | * If this context is a clone of another, it might |
@@ -170,7 +172,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
170 | * can't get swapped on us any more. | 172 | * can't get swapped on us any more. |
171 | */ | 173 | */ |
172 | raw_spin_lock_irqsave(&ctx->lock, *flags); | 174 | raw_spin_lock_irqsave(&ctx->lock, *flags); |
173 | if (ctx != rcu_dereference(task->perf_event_ctxp)) { | 175 | if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { |
174 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); | 176 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
175 | goto retry; | 177 | goto retry; |
176 | } | 178 | } |
@@ -189,12 +191,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
189 | * can't get swapped to another task. This also increments its | 191 | * can't get swapped to another task. This also increments its |
190 | * reference count so that the context can't get freed. | 192 | * reference count so that the context can't get freed. |
191 | */ | 193 | */ |
192 | static struct perf_event_context *perf_pin_task_context(struct task_struct *task) | 194 | static struct perf_event_context * |
195 | perf_pin_task_context(struct task_struct *task, int ctxn) | ||
193 | { | 196 | { |
194 | struct perf_event_context *ctx; | 197 | struct perf_event_context *ctx; |
195 | unsigned long flags; | 198 | unsigned long flags; |
196 | 199 | ||
197 | ctx = perf_lock_task_context(task, &flags); | 200 | ctx = perf_lock_task_context(task, ctxn, &flags); |
198 | if (ctx) { | 201 | if (ctx) { |
199 | ++ctx->pin_count; | 202 | ++ctx->pin_count; |
200 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 203 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
@@ -302,6 +305,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
302 | } | 305 | } |
303 | 306 | ||
304 | list_add_rcu(&event->event_entry, &ctx->event_list); | 307 | list_add_rcu(&event->event_entry, &ctx->event_list); |
308 | if (!ctx->nr_events) | ||
309 | perf_pmu_rotate_start(ctx->pmu); | ||
305 | ctx->nr_events++; | 310 | ctx->nr_events++; |
306 | if (event->attr.inherit_stat) | 311 | if (event->attr.inherit_stat) |
307 | ctx->nr_stat++; | 312 | ctx->nr_stat++; |
@@ -311,7 +316,12 @@ static void perf_group_attach(struct perf_event *event) | |||
311 | { | 316 | { |
312 | struct perf_event *group_leader = event->group_leader; | 317 | struct perf_event *group_leader = event->group_leader; |
313 | 318 | ||
314 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); | 319 | /* |
320 | * We can have double attach due to group movement in perf_event_open. | ||
321 | */ | ||
322 | if (event->attach_state & PERF_ATTACH_GROUP) | ||
323 | return; | ||
324 | |||
315 | event->attach_state |= PERF_ATTACH_GROUP; | 325 | event->attach_state |= PERF_ATTACH_GROUP; |
316 | 326 | ||
317 | if (group_leader == event) | 327 | if (group_leader == event) |
@@ -402,11 +412,31 @@ static void perf_group_detach(struct perf_event *event) | |||
402 | } | 412 | } |
403 | } | 413 | } |
404 | 414 | ||
415 | static inline int | ||
416 | event_filter_match(struct perf_event *event) | ||
417 | { | ||
418 | return event->cpu == -1 || event->cpu == smp_processor_id(); | ||
419 | } | ||
420 | |||
405 | static void | 421 | static void |
406 | event_sched_out(struct perf_event *event, | 422 | event_sched_out(struct perf_event *event, |
407 | struct perf_cpu_context *cpuctx, | 423 | struct perf_cpu_context *cpuctx, |
408 | struct perf_event_context *ctx) | 424 | struct perf_event_context *ctx) |
409 | { | 425 | { |
426 | u64 delta; | ||
427 | /* | ||
428 | * An event which could not be activated because of | ||
429 | * filter mismatch still needs to have its timings | ||
430 | * maintained, otherwise bogus information is returned | ||
431 | * via read() for time_enabled, time_running: | ||
432 | */ | ||
433 | if (event->state == PERF_EVENT_STATE_INACTIVE | ||
434 | && !event_filter_match(event)) { | ||
435 | delta = ctx->time - event->tstamp_stopped; | ||
436 | event->tstamp_running += delta; | ||
437 | event->tstamp_stopped = ctx->time; | ||
438 | } | ||
439 | |||
410 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 440 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
411 | return; | 441 | return; |
412 | 442 | ||
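The hunk above keeps tstamp_running/tstamp_stopped up to date for events that stayed INACTIVE because event_filter_match() rejected them, so the enabled/running times reported via read() stay consistent. A toy sketch of that bookkeeping with invented numbers; the struct below mirrors only the two fields the diff touches:

#include <stdio.h>

typedef unsigned long long u64;

/* Minimal stand-in for the event timing fields used in the hunk above. */
struct evt {
	u64 tstamp_running;   /* accumulated running-time offset */
	u64 tstamp_stopped;   /* last time the event stopped counting */
};

/*
 * On sched-out, an event that never went ACTIVE because of a filter
 * mismatch still gets its timestamps advanced to "now" (ctx->time),
 * matching the delta computation in event_sched_out().
 */
static void account_inactive(struct evt *e, u64 now)
{
	u64 delta = now - e->tstamp_stopped;

	e->tstamp_running += delta;
	e->tstamp_stopped  = now;
}

int main(void)
{
	struct evt e = { .tstamp_running = 0, .tstamp_stopped = 100 };

	account_inactive(&e, 250);	/* 150 time units elapsed while inactive */
	printf("running=%llu stopped=%llu\n", e.tstamp_running, e.tstamp_stopped);
	return 0;
}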
@@ -416,7 +446,7 @@ event_sched_out(struct perf_event *event, | |||
416 | event->state = PERF_EVENT_STATE_OFF; | 446 | event->state = PERF_EVENT_STATE_OFF; |
417 | } | 447 | } |
418 | event->tstamp_stopped = ctx->time; | 448 | event->tstamp_stopped = ctx->time; |
419 | event->pmu->disable(event); | 449 | event->pmu->del(event, 0); |
420 | event->oncpu = -1; | 450 | event->oncpu = -1; |
421 | 451 | ||
422 | if (!is_software_event(event)) | 452 | if (!is_software_event(event)) |
@@ -432,9 +462,7 @@ group_sched_out(struct perf_event *group_event, | |||
432 | struct perf_event_context *ctx) | 462 | struct perf_event_context *ctx) |
433 | { | 463 | { |
434 | struct perf_event *event; | 464 | struct perf_event *event; |
435 | 465 | int state = group_event->state; | |
436 | if (group_event->state != PERF_EVENT_STATE_ACTIVE) | ||
437 | return; | ||
438 | 466 | ||
439 | event_sched_out(group_event, cpuctx, ctx); | 467 | event_sched_out(group_event, cpuctx, ctx); |
440 | 468 | ||
@@ -444,10 +472,16 @@ group_sched_out(struct perf_event *group_event, | |||
444 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 472 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
445 | event_sched_out(event, cpuctx, ctx); | 473 | event_sched_out(event, cpuctx, ctx); |
446 | 474 | ||
447 | if (group_event->attr.exclusive) | 475 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) |
448 | cpuctx->exclusive = 0; | 476 | cpuctx->exclusive = 0; |
449 | } | 477 | } |
450 | 478 | ||
479 | static inline struct perf_cpu_context * | ||
480 | __get_cpu_context(struct perf_event_context *ctx) | ||
481 | { | ||
482 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
483 | } | ||
484 | |||
451 | /* | 485 | /* |
452 | * Cross CPU call to remove a performance event | 486 | * Cross CPU call to remove a performance event |
453 | * | 487 | * |
@@ -456,9 +490,9 @@ group_sched_out(struct perf_event *group_event, | |||
456 | */ | 490 | */ |
457 | static void __perf_event_remove_from_context(void *info) | 491 | static void __perf_event_remove_from_context(void *info) |
458 | { | 492 | { |
459 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
460 | struct perf_event *event = info; | 493 | struct perf_event *event = info; |
461 | struct perf_event_context *ctx = event->ctx; | 494 | struct perf_event_context *ctx = event->ctx; |
495 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
462 | 496 | ||
463 | /* | 497 | /* |
464 | * If this is a task context, we need to check whether it is | 498 | * If this is a task context, we need to check whether it is |
@@ -469,27 +503,11 @@ static void __perf_event_remove_from_context(void *info) | |||
469 | return; | 503 | return; |
470 | 504 | ||
471 | raw_spin_lock(&ctx->lock); | 505 | raw_spin_lock(&ctx->lock); |
472 | /* | ||
473 | * Protect the list operation against NMI by disabling the | ||
474 | * events on a global level. | ||
475 | */ | ||
476 | perf_disable(); | ||
477 | 506 | ||
478 | event_sched_out(event, cpuctx, ctx); | 507 | event_sched_out(event, cpuctx, ctx); |
479 | 508 | ||
480 | list_del_event(event, ctx); | 509 | list_del_event(event, ctx); |
481 | 510 | ||
482 | if (!ctx->task) { | ||
483 | /* | ||
484 | * Allow more per task events with respect to the | ||
485 | * reservation: | ||
486 | */ | ||
487 | cpuctx->max_pertask = | ||
488 | min(perf_max_events - ctx->nr_events, | ||
489 | perf_max_events - perf_reserved_percpu); | ||
490 | } | ||
491 | |||
492 | perf_enable(); | ||
493 | raw_spin_unlock(&ctx->lock); | 511 | raw_spin_unlock(&ctx->lock); |
494 | } | 512 | } |
495 | 513 | ||
@@ -554,8 +572,8 @@ retry: | |||
554 | static void __perf_event_disable(void *info) | 572 | static void __perf_event_disable(void *info) |
555 | { | 573 | { |
556 | struct perf_event *event = info; | 574 | struct perf_event *event = info; |
557 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
558 | struct perf_event_context *ctx = event->ctx; | 575 | struct perf_event_context *ctx = event->ctx; |
576 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
559 | 577 | ||
560 | /* | 578 | /* |
561 | * If this is a per-task event, need to check whether this | 579 | * If this is a per-task event, need to check whether this |
@@ -610,7 +628,7 @@ void perf_event_disable(struct perf_event *event) | |||
610 | return; | 628 | return; |
611 | } | 629 | } |
612 | 630 | ||
613 | retry: | 631 | retry: |
614 | task_oncpu_function_call(task, __perf_event_disable, event); | 632 | task_oncpu_function_call(task, __perf_event_disable, event); |
615 | 633 | ||
616 | raw_spin_lock_irq(&ctx->lock); | 634 | raw_spin_lock_irq(&ctx->lock); |
@@ -649,7 +667,7 @@ event_sched_in(struct perf_event *event, | |||
649 | */ | 667 | */ |
650 | smp_wmb(); | 668 | smp_wmb(); |
651 | 669 | ||
652 | if (event->pmu->enable(event)) { | 670 | if (event->pmu->add(event, PERF_EF_START)) { |
653 | event->state = PERF_EVENT_STATE_INACTIVE; | 671 | event->state = PERF_EVENT_STATE_INACTIVE; |
654 | event->oncpu = -1; | 672 | event->oncpu = -1; |
655 | return -EAGAIN; | 673 | return -EAGAIN; |
@@ -657,6 +675,8 @@ event_sched_in(struct perf_event *event, | |||
657 | 675 | ||
658 | event->tstamp_running += ctx->time - event->tstamp_stopped; | 676 | event->tstamp_running += ctx->time - event->tstamp_stopped; |
659 | 677 | ||
678 | event->shadow_ctx_time = ctx->time - ctx->timestamp; | ||
679 | |||
660 | if (!is_software_event(event)) | 680 | if (!is_software_event(event)) |
661 | cpuctx->active_oncpu++; | 681 | cpuctx->active_oncpu++; |
662 | ctx->nr_active++; | 682 | ctx->nr_active++; |
@@ -673,22 +693,17 @@ group_sched_in(struct perf_event *group_event, | |||
673 | struct perf_event_context *ctx) | 693 | struct perf_event_context *ctx) |
674 | { | 694 | { |
675 | struct perf_event *event, *partial_group = NULL; | 695 | struct perf_event *event, *partial_group = NULL; |
676 | const struct pmu *pmu = group_event->pmu; | 696 | struct pmu *pmu = group_event->pmu; |
677 | bool txn = false; | 697 | u64 now = ctx->time; |
698 | bool simulate = false; | ||
678 | 699 | ||
679 | if (group_event->state == PERF_EVENT_STATE_OFF) | 700 | if (group_event->state == PERF_EVENT_STATE_OFF) |
680 | return 0; | 701 | return 0; |
681 | 702 | ||
682 | /* Check if group transaction availabe */ | 703 | pmu->start_txn(pmu); |
683 | if (pmu->start_txn) | ||
684 | txn = true; | ||
685 | |||
686 | if (txn) | ||
687 | pmu->start_txn(pmu); | ||
688 | 704 | ||
689 | if (event_sched_in(group_event, cpuctx, ctx)) { | 705 | if (event_sched_in(group_event, cpuctx, ctx)) { |
690 | if (txn) | 706 | pmu->cancel_txn(pmu); |
691 | pmu->cancel_txn(pmu); | ||
692 | return -EAGAIN; | 707 | return -EAGAIN; |
693 | } | 708 | } |
694 | 709 | ||
@@ -702,23 +717,38 @@ group_sched_in(struct perf_event *group_event, | |||
702 | } | 717 | } |
703 | } | 718 | } |
704 | 719 | ||
705 | if (!txn || !pmu->commit_txn(pmu)) | 720 | if (!pmu->commit_txn(pmu)) |
706 | return 0; | 721 | return 0; |
707 | 722 | ||
708 | group_error: | 723 | group_error: |
709 | /* | 724 | /* |
710 | * Groups can be scheduled in as one unit only, so undo any | 725 | * Groups can be scheduled in as one unit only, so undo any |
711 | * partial group before returning: | 726 | * partial group before returning: |
727 | * The events up to the failed event are scheduled out normally, | ||
728 | * tstamp_stopped will be updated. | ||
729 | * | ||
730 | * The failed events and the remaining siblings need to have | ||
731 | * their timings updated as if they had gone thru event_sched_in() | ||
732 | * and event_sched_out(). This is required to get consistent timings | ||
733 | * across the group. This also takes care of the case where the group | ||
734 | * could never be scheduled by ensuring tstamp_stopped is set to mark | ||
735 | * the time the event was actually stopped, such that time delta | ||
736 | * calculation in update_event_times() is correct. | ||
712 | */ | 737 | */ |
713 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 738 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
714 | if (event == partial_group) | 739 | if (event == partial_group) |
715 | break; | 740 | simulate = true; |
716 | event_sched_out(event, cpuctx, ctx); | 741 | |
742 | if (simulate) { | ||
743 | event->tstamp_running += now - event->tstamp_stopped; | ||
744 | event->tstamp_stopped = now; | ||
745 | } else { | ||
746 | event_sched_out(event, cpuctx, ctx); | ||
747 | } | ||
717 | } | 748 | } |
718 | event_sched_out(group_event, cpuctx, ctx); | 749 | event_sched_out(group_event, cpuctx, ctx); |
719 | 750 | ||
720 | if (txn) | 751 | pmu->cancel_txn(pmu); |
721 | pmu->cancel_txn(pmu); | ||
722 | 752 | ||
723 | return -EAGAIN; | 753 | return -EAGAIN; |
724 | } | 754 | } |
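group_sched_in() above now unconditionally brackets the group with pmu->start_txn()/commit_txn()/cancel_txn() instead of treating the transaction hooks as optional (no-op defaults are presumably supplied elsewhere in this patch for PMUs that don't implement them). A rough userspace sketch of the all-or-nothing scheduling pattern; the fake_pmu type and its capacity check are invented for illustration, not the kernel API:

#include <stdio.h>

/* Invented stand-in: a "PMU" that tentatively accepts events and either
 * commits the whole batch or rolls it back. */
struct fake_pmu {
	int scheduled;		/* events accepted inside the open transaction */
	int capacity;		/* hardware counters available */
};

static void start_txn(struct fake_pmu *p)  { p->scheduled = 0; }
static void cancel_txn(struct fake_pmu *p) { p->scheduled = 0; }
static int  commit_txn(struct fake_pmu *p) { return 0; /* 0 == success */ }

static int event_add(struct fake_pmu *p)
{
	if (p->scheduled >= p->capacity)
		return -1;		/* would overcommit the hardware */
	p->scheduled++;
	return 0;
}

/* Schedule a leader plus its siblings as one unit, like group_sched_in(). */
static int group_sched_in(struct fake_pmu *p, int nr_events)
{
	int i;

	start_txn(p);
	for (i = 0; i < nr_events; i++) {
		if (event_add(p)) {
			cancel_txn(p);	/* undo the partial group */
			return -1;
		}
	}
	return commit_txn(p);
}

int main(void)
{
	struct fake_pmu pmu = { .capacity = 4 };

	printf("group of 3: %s\n", group_sched_in(&pmu, 3) ? "rejected" : "scheduled");
	printf("group of 9: %s\n", group_sched_in(&pmu, 9) ? "rejected" : "scheduled");
	return 0;
}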
@@ -771,10 +801,10 @@ static void add_event_to_ctx(struct perf_event *event, | |||
771 | */ | 801 | */ |
772 | static void __perf_install_in_context(void *info) | 802 | static void __perf_install_in_context(void *info) |
773 | { | 803 | { |
774 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
775 | struct perf_event *event = info; | 804 | struct perf_event *event = info; |
776 | struct perf_event_context *ctx = event->ctx; | 805 | struct perf_event_context *ctx = event->ctx; |
777 | struct perf_event *leader = event->group_leader; | 806 | struct perf_event *leader = event->group_leader; |
807 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
778 | int err; | 808 | int err; |
779 | 809 | ||
780 | /* | 810 | /* |
@@ -794,12 +824,6 @@ static void __perf_install_in_context(void *info) | |||
794 | ctx->is_active = 1; | 824 | ctx->is_active = 1; |
795 | update_context_time(ctx); | 825 | update_context_time(ctx); |
796 | 826 | ||
797 | /* | ||
798 | * Protect the list operation against NMI by disabling the | ||
799 | * events on a global level. NOP for non NMI based events. | ||
800 | */ | ||
801 | perf_disable(); | ||
802 | |||
803 | add_event_to_ctx(event, ctx); | 827 | add_event_to_ctx(event, ctx); |
804 | 828 | ||
805 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 829 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
@@ -837,12 +861,7 @@ static void __perf_install_in_context(void *info) | |||
837 | } | 861 | } |
838 | } | 862 | } |
839 | 863 | ||
840 | if (!err && !ctx->task && cpuctx->max_pertask) | 864 | unlock: |
841 | cpuctx->max_pertask--; | ||
842 | |||
843 | unlock: | ||
844 | perf_enable(); | ||
845 | |||
846 | raw_spin_unlock(&ctx->lock); | 865 | raw_spin_unlock(&ctx->lock); |
847 | } | 866 | } |
848 | 867 | ||
@@ -865,6 +884,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
865 | { | 884 | { |
866 | struct task_struct *task = ctx->task; | 885 | struct task_struct *task = ctx->task; |
867 | 886 | ||
887 | event->ctx = ctx; | ||
888 | |||
868 | if (!task) { | 889 | if (!task) { |
869 | /* | 890 | /* |
870 | * Per cpu events are installed via an smp call and | 891 | * Per cpu events are installed via an smp call and |
@@ -913,10 +934,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
913 | 934 | ||
914 | event->state = PERF_EVENT_STATE_INACTIVE; | 935 | event->state = PERF_EVENT_STATE_INACTIVE; |
915 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 936 | event->tstamp_enabled = ctx->time - event->total_time_enabled; |
916 | list_for_each_entry(sub, &event->sibling_list, group_entry) | 937 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
917 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) | 938 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { |
918 | sub->tstamp_enabled = | 939 | sub->tstamp_enabled = |
919 | ctx->time - sub->total_time_enabled; | 940 | ctx->time - sub->total_time_enabled; |
941 | } | ||
942 | } | ||
920 | } | 943 | } |
921 | 944 | ||
922 | /* | 945 | /* |
@@ -925,9 +948,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
925 | static void __perf_event_enable(void *info) | 948 | static void __perf_event_enable(void *info) |
926 | { | 949 | { |
927 | struct perf_event *event = info; | 950 | struct perf_event *event = info; |
928 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
929 | struct perf_event_context *ctx = event->ctx; | 951 | struct perf_event_context *ctx = event->ctx; |
930 | struct perf_event *leader = event->group_leader; | 952 | struct perf_event *leader = event->group_leader; |
953 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
931 | int err; | 954 | int err; |
932 | 955 | ||
933 | /* | 956 | /* |
@@ -961,12 +984,10 @@ static void __perf_event_enable(void *info) | |||
961 | if (!group_can_go_on(event, cpuctx, 1)) { | 984 | if (!group_can_go_on(event, cpuctx, 1)) { |
962 | err = -EEXIST; | 985 | err = -EEXIST; |
963 | } else { | 986 | } else { |
964 | perf_disable(); | ||
965 | if (event == leader) | 987 | if (event == leader) |
966 | err = group_sched_in(event, cpuctx, ctx); | 988 | err = group_sched_in(event, cpuctx, ctx); |
967 | else | 989 | else |
968 | err = event_sched_in(event, cpuctx, ctx); | 990 | err = event_sched_in(event, cpuctx, ctx); |
969 | perf_enable(); | ||
970 | } | 991 | } |
971 | 992 | ||
972 | if (err) { | 993 | if (err) { |
@@ -982,7 +1003,7 @@ static void __perf_event_enable(void *info) | |||
982 | } | 1003 | } |
983 | } | 1004 | } |
984 | 1005 | ||
985 | unlock: | 1006 | unlock: |
986 | raw_spin_unlock(&ctx->lock); | 1007 | raw_spin_unlock(&ctx->lock); |
987 | } | 1008 | } |
988 | 1009 | ||
@@ -1023,7 +1044,7 @@ void perf_event_enable(struct perf_event *event) | |||
1023 | if (event->state == PERF_EVENT_STATE_ERROR) | 1044 | if (event->state == PERF_EVENT_STATE_ERROR) |
1024 | event->state = PERF_EVENT_STATE_OFF; | 1045 | event->state = PERF_EVENT_STATE_OFF; |
1025 | 1046 | ||
1026 | retry: | 1047 | retry: |
1027 | raw_spin_unlock_irq(&ctx->lock); | 1048 | raw_spin_unlock_irq(&ctx->lock); |
1028 | task_oncpu_function_call(task, __perf_event_enable, event); | 1049 | task_oncpu_function_call(task, __perf_event_enable, event); |
1029 | 1050 | ||
@@ -1043,7 +1064,7 @@ void perf_event_enable(struct perf_event *event) | |||
1043 | if (event->state == PERF_EVENT_STATE_OFF) | 1064 | if (event->state == PERF_EVENT_STATE_OFF) |
1044 | __perf_event_mark_enabled(event, ctx); | 1065 | __perf_event_mark_enabled(event, ctx); |
1045 | 1066 | ||
1046 | out: | 1067 | out: |
1047 | raw_spin_unlock_irq(&ctx->lock); | 1068 | raw_spin_unlock_irq(&ctx->lock); |
1048 | } | 1069 | } |
1049 | 1070 | ||
@@ -1074,26 +1095,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1074 | struct perf_event *event; | 1095 | struct perf_event *event; |
1075 | 1096 | ||
1076 | raw_spin_lock(&ctx->lock); | 1097 | raw_spin_lock(&ctx->lock); |
1098 | perf_pmu_disable(ctx->pmu); | ||
1077 | ctx->is_active = 0; | 1099 | ctx->is_active = 0; |
1078 | if (likely(!ctx->nr_events)) | 1100 | if (likely(!ctx->nr_events)) |
1079 | goto out; | 1101 | goto out; |
1080 | update_context_time(ctx); | 1102 | update_context_time(ctx); |
1081 | 1103 | ||
1082 | perf_disable(); | ||
1083 | if (!ctx->nr_active) | 1104 | if (!ctx->nr_active) |
1084 | goto out_enable; | 1105 | goto out; |
1085 | 1106 | ||
1086 | if (event_type & EVENT_PINNED) | 1107 | if (event_type & EVENT_PINNED) { |
1087 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1108 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
1088 | group_sched_out(event, cpuctx, ctx); | 1109 | group_sched_out(event, cpuctx, ctx); |
1110 | } | ||
1089 | 1111 | ||
1090 | if (event_type & EVENT_FLEXIBLE) | 1112 | if (event_type & EVENT_FLEXIBLE) { |
1091 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1113 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
1092 | group_sched_out(event, cpuctx, ctx); | 1114 | group_sched_out(event, cpuctx, ctx); |
1093 | 1115 | } | |
1094 | out_enable: | 1116 | out: |
1095 | perf_enable(); | 1117 | perf_pmu_enable(ctx->pmu); |
1096 | out: | ||
1097 | raw_spin_unlock(&ctx->lock); | 1118 | raw_spin_unlock(&ctx->lock); |
1098 | } | 1119 | } |
1099 | 1120 | ||
@@ -1191,34 +1212,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1191 | } | 1212 | } |
1192 | } | 1213 | } |
1193 | 1214 | ||
1194 | /* | 1215 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1195 | * Called from scheduler to remove the events of the current task, | 1216 | struct task_struct *next) |
1196 | * with interrupts disabled. | ||
1197 | * | ||
1198 | * We stop each event and update the event value in event->count. | ||
1199 | * | ||
1200 | * This does not protect us against NMI, but disable() | ||
1201 | * sets the disabled bit in the control field of event _before_ | ||
1202 | * accessing the event control register. If a NMI hits, then it will | ||
1203 | * not restart the event. | ||
1204 | */ | ||
1205 | void perf_event_task_sched_out(struct task_struct *task, | ||
1206 | struct task_struct *next) | ||
1207 | { | 1217 | { |
1208 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1218 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1209 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1210 | struct perf_event_context *next_ctx; | 1219 | struct perf_event_context *next_ctx; |
1211 | struct perf_event_context *parent; | 1220 | struct perf_event_context *parent; |
1221 | struct perf_cpu_context *cpuctx; | ||
1212 | int do_switch = 1; | 1222 | int do_switch = 1; |
1213 | 1223 | ||
1214 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | 1224 | if (likely(!ctx)) |
1225 | return; | ||
1215 | 1226 | ||
1216 | if (likely(!ctx || !cpuctx->task_ctx)) | 1227 | cpuctx = __get_cpu_context(ctx); |
1228 | if (!cpuctx->task_ctx) | ||
1217 | return; | 1229 | return; |
1218 | 1230 | ||
1219 | rcu_read_lock(); | 1231 | rcu_read_lock(); |
1220 | parent = rcu_dereference(ctx->parent_ctx); | 1232 | parent = rcu_dereference(ctx->parent_ctx); |
1221 | next_ctx = next->perf_event_ctxp; | 1233 | next_ctx = next->perf_event_ctxp[ctxn]; |
1222 | if (parent && next_ctx && | 1234 | if (parent && next_ctx && |
1223 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 1235 | rcu_dereference(next_ctx->parent_ctx) == parent) { |
1224 | /* | 1236 | /* |
@@ -1237,8 +1249,8 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1237 | * XXX do we need a memory barrier of sorts | 1249 | * XXX do we need a memory barrier of sorts |
1238 | * wrt to rcu_dereference() of perf_event_ctxp | 1250 | * wrt to rcu_dereference() of perf_event_ctxp |
1239 | */ | 1251 | */ |
1240 | task->perf_event_ctxp = next_ctx; | 1252 | task->perf_event_ctxp[ctxn] = next_ctx; |
1241 | next->perf_event_ctxp = ctx; | 1253 | next->perf_event_ctxp[ctxn] = ctx; |
1242 | ctx->task = next; | 1254 | ctx->task = next; |
1243 | next_ctx->task = task; | 1255 | next_ctx->task = task; |
1244 | do_switch = 0; | 1256 | do_switch = 0; |
@@ -1256,10 +1268,33 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1256 | } | 1268 | } |
1257 | } | 1269 | } |
1258 | 1270 | ||
1271 | #define for_each_task_context_nr(ctxn) \ | ||
1272 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | ||
1273 | |||
1274 | /* | ||
1275 | * Called from scheduler to remove the events of the current task, | ||
1276 | * with interrupts disabled. | ||
1277 | * | ||
1278 | * We stop each event and update the event value in event->count. | ||
1279 | * | ||
1280 | * This does not protect us against NMI, but disable() | ||
1281 | * sets the disabled bit in the control field of event _before_ | ||
1282 | * accessing the event control register. If a NMI hits, then it will | ||
1283 | * not restart the event. | ||
1284 | */ | ||
1285 | void __perf_event_task_sched_out(struct task_struct *task, | ||
1286 | struct task_struct *next) | ||
1287 | { | ||
1288 | int ctxn; | ||
1289 | |||
1290 | for_each_task_context_nr(ctxn) | ||
1291 | perf_event_context_sched_out(task, ctxn, next); | ||
1292 | } | ||
1293 | |||
1259 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1294 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
1260 | enum event_type_t event_type) | 1295 | enum event_type_t event_type) |
1261 | { | 1296 | { |
1262 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1297 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1263 | 1298 | ||
1264 | if (!cpuctx->task_ctx) | 1299 | if (!cpuctx->task_ctx) |
1265 | return; | 1300 | return; |
@@ -1274,14 +1309,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
1274 | /* | 1309 | /* |
1275 | * Called with IRQs disabled | 1310 | * Called with IRQs disabled |
1276 | */ | 1311 | */ |
1277 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) | ||
1278 | { | ||
1279 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
1280 | } | ||
1281 | |||
1282 | /* | ||
1283 | * Called with IRQs disabled | ||
1284 | */ | ||
1285 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 1312 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
1286 | enum event_type_t event_type) | 1313 | enum event_type_t event_type) |
1287 | { | 1314 | { |
@@ -1332,9 +1359,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1332 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1359 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1333 | continue; | 1360 | continue; |
1334 | 1361 | ||
1335 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 1362 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1336 | if (group_sched_in(event, cpuctx, ctx)) | 1363 | if (group_sched_in(event, cpuctx, ctx)) |
1337 | can_add_hw = 0; | 1364 | can_add_hw = 0; |
1365 | } | ||
1338 | } | 1366 | } |
1339 | } | 1367 | } |
1340 | 1368 | ||
@@ -1350,8 +1378,6 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1350 | 1378 | ||
1351 | ctx->timestamp = perf_clock(); | 1379 | ctx->timestamp = perf_clock(); |
1352 | 1380 | ||
1353 | perf_disable(); | ||
1354 | |||
1355 | /* | 1381 | /* |
1356 | * First go through the list and put on any pinned groups | 1382 | * First go through the list and put on any pinned groups |
1357 | * in order to give them the best chance of going on. | 1383 | * in order to give them the best chance of going on. |
@@ -1363,8 +1389,7 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1363 | if (event_type & EVENT_FLEXIBLE) | 1389 | if (event_type & EVENT_FLEXIBLE) |
1364 | ctx_flexible_sched_in(ctx, cpuctx); | 1390 | ctx_flexible_sched_in(ctx, cpuctx); |
1365 | 1391 | ||
1366 | perf_enable(); | 1392 | out: |
1367 | out: | ||
1368 | raw_spin_unlock(&ctx->lock); | 1393 | raw_spin_unlock(&ctx->lock); |
1369 | } | 1394 | } |
1370 | 1395 | ||
@@ -1376,43 +1401,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
1376 | ctx_sched_in(ctx, cpuctx, event_type); | 1401 | ctx_sched_in(ctx, cpuctx, event_type); |
1377 | } | 1402 | } |
1378 | 1403 | ||
1379 | static void task_ctx_sched_in(struct task_struct *task, | 1404 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
1380 | enum event_type_t event_type) | 1405 | enum event_type_t event_type) |
1381 | { | 1406 | { |
1382 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1407 | struct perf_cpu_context *cpuctx; |
1383 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1384 | 1408 | ||
1385 | if (likely(!ctx)) | 1409 | cpuctx = __get_cpu_context(ctx); |
1386 | return; | ||
1387 | if (cpuctx->task_ctx == ctx) | 1410 | if (cpuctx->task_ctx == ctx) |
1388 | return; | 1411 | return; |
1412 | |||
1389 | ctx_sched_in(ctx, cpuctx, event_type); | 1413 | ctx_sched_in(ctx, cpuctx, event_type); |
1390 | cpuctx->task_ctx = ctx; | 1414 | cpuctx->task_ctx = ctx; |
1391 | } | 1415 | } |
1392 | /* | ||
1393 | * Called from scheduler to add the events of the current task | ||
1394 | * with interrupts disabled. | ||
1395 | * | ||
1396 | * We restore the event value and then enable it. | ||
1397 | * | ||
1398 | * This does not protect us against NMI, but enable() | ||
1399 | * sets the enabled bit in the control field of event _before_ | ||
1400 | * accessing the event control register. If a NMI hits, then it will | ||
1401 | * keep the event running. | ||
1402 | */ | ||
1403 | void perf_event_task_sched_in(struct task_struct *task) | ||
1404 | { | ||
1405 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1406 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1407 | 1416 | ||
1408 | if (likely(!ctx)) | 1417 | void perf_event_context_sched_in(struct perf_event_context *ctx) |
1409 | return; | 1418 | { |
1419 | struct perf_cpu_context *cpuctx; | ||
1410 | 1420 | ||
1421 | cpuctx = __get_cpu_context(ctx); | ||
1411 | if (cpuctx->task_ctx == ctx) | 1422 | if (cpuctx->task_ctx == ctx) |
1412 | return; | 1423 | return; |
1413 | 1424 | ||
1414 | perf_disable(); | 1425 | perf_pmu_disable(ctx->pmu); |
1415 | |||
1416 | /* | 1426 | /* |
1417 | * We want to keep the following priority order: | 1427 | * We want to keep the following priority order: |
1418 | * cpu pinned (that don't need to move), task pinned, | 1428 | * cpu pinned (that don't need to move), task pinned, |
@@ -1426,7 +1436,37 @@ void perf_event_task_sched_in(struct task_struct *task) | |||
1426 | 1436 | ||
1427 | cpuctx->task_ctx = ctx; | 1437 | cpuctx->task_ctx = ctx; |
1428 | 1438 | ||
1429 | perf_enable(); | 1439 | /* |
1440 | * Since these rotations are per-cpu, we need to ensure the | ||
1441 | * cpu-context we got scheduled on is actually rotating. | ||
1442 | */ | ||
1443 | perf_pmu_rotate_start(ctx->pmu); | ||
1444 | perf_pmu_enable(ctx->pmu); | ||
1445 | } | ||
1446 | |||
1447 | /* | ||
1448 | * Called from scheduler to add the events of the current task | ||
1449 | * with interrupts disabled. | ||
1450 | * | ||
1451 | * We restore the event value and then enable it. | ||
1452 | * | ||
1453 | * This does not protect us against NMI, but enable() | ||
1454 | * sets the enabled bit in the control field of event _before_ | ||
1455 | * accessing the event control register. If a NMI hits, then it will | ||
1456 | * keep the event running. | ||
1457 | */ | ||
1458 | void __perf_event_task_sched_in(struct task_struct *task) | ||
1459 | { | ||
1460 | struct perf_event_context *ctx; | ||
1461 | int ctxn; | ||
1462 | |||
1463 | for_each_task_context_nr(ctxn) { | ||
1464 | ctx = task->perf_event_ctxp[ctxn]; | ||
1465 | if (likely(!ctx)) | ||
1466 | continue; | ||
1467 | |||
1468 | perf_event_context_sched_in(ctx); | ||
1469 | } | ||
1430 | } | 1470 | } |
1431 | 1471 | ||
1432 | #define MAX_INTERRUPTS (~0ULL) | 1472 | #define MAX_INTERRUPTS (~0ULL) |
@@ -1506,22 +1546,6 @@ do { \ | |||
1506 | return div64_u64(dividend, divisor); | 1546 | return div64_u64(dividend, divisor); |
1507 | } | 1547 | } |
1508 | 1548 | ||
1509 | static void perf_event_stop(struct perf_event *event) | ||
1510 | { | ||
1511 | if (!event->pmu->stop) | ||
1512 | return event->pmu->disable(event); | ||
1513 | |||
1514 | return event->pmu->stop(event); | ||
1515 | } | ||
1516 | |||
1517 | static int perf_event_start(struct perf_event *event) | ||
1518 | { | ||
1519 | if (!event->pmu->start) | ||
1520 | return event->pmu->enable(event); | ||
1521 | |||
1522 | return event->pmu->start(event); | ||
1523 | } | ||
1524 | |||
1525 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 1549 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
1526 | { | 1550 | { |
1527 | struct hw_perf_event *hwc = &event->hw; | 1551 | struct hw_perf_event *hwc = &event->hw; |
@@ -1541,15 +1565,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
1541 | hwc->sample_period = sample_period; | 1565 | hwc->sample_period = sample_period; |
1542 | 1566 | ||
1543 | if (local64_read(&hwc->period_left) > 8*sample_period) { | 1567 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
1544 | perf_disable(); | 1568 | event->pmu->stop(event, PERF_EF_UPDATE); |
1545 | perf_event_stop(event); | ||
1546 | local64_set(&hwc->period_left, 0); | 1569 | local64_set(&hwc->period_left, 0); |
1547 | perf_event_start(event); | 1570 | event->pmu->start(event, PERF_EF_RELOAD); |
1548 | perf_enable(); | ||
1549 | } | 1571 | } |
1550 | } | 1572 | } |
1551 | 1573 | ||
1552 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | 1574 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) |
1553 | { | 1575 | { |
1554 | struct perf_event *event; | 1576 | struct perf_event *event; |
1555 | struct hw_perf_event *hwc; | 1577 | struct hw_perf_event *hwc; |
@@ -1574,23 +1596,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1574 | */ | 1596 | */ |
1575 | if (interrupts == MAX_INTERRUPTS) { | 1597 | if (interrupts == MAX_INTERRUPTS) { |
1576 | perf_log_throttle(event, 1); | 1598 | perf_log_throttle(event, 1); |
1577 | perf_disable(); | 1599 | event->pmu->start(event, 0); |
1578 | event->pmu->unthrottle(event); | ||
1579 | perf_enable(); | ||
1580 | } | 1600 | } |
1581 | 1601 | ||
1582 | if (!event->attr.freq || !event->attr.sample_freq) | 1602 | if (!event->attr.freq || !event->attr.sample_freq) |
1583 | continue; | 1603 | continue; |
1584 | 1604 | ||
1585 | perf_disable(); | ||
1586 | event->pmu->read(event); | 1605 | event->pmu->read(event); |
1587 | now = local64_read(&event->count); | 1606 | now = local64_read(&event->count); |
1588 | delta = now - hwc->freq_count_stamp; | 1607 | delta = now - hwc->freq_count_stamp; |
1589 | hwc->freq_count_stamp = now; | 1608 | hwc->freq_count_stamp = now; |
1590 | 1609 | ||
1591 | if (delta > 0) | 1610 | if (delta > 0) |
1592 | perf_adjust_period(event, TICK_NSEC, delta); | 1611 | perf_adjust_period(event, period, delta); |
1593 | perf_enable(); | ||
1594 | } | 1612 | } |
1595 | raw_spin_unlock(&ctx->lock); | 1613 | raw_spin_unlock(&ctx->lock); |
1596 | } | 1614 | } |
@@ -1602,38 +1620,48 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
1602 | { | 1620 | { |
1603 | raw_spin_lock(&ctx->lock); | 1621 | raw_spin_lock(&ctx->lock); |
1604 | 1622 | ||
1605 | /* Rotate the first entry last of non-pinned groups */ | 1623 | /* |
1606 | list_rotate_left(&ctx->flexible_groups); | 1624 | * Rotate the first entry last of non-pinned groups. Rotation might be |
1625 | * disabled by the inheritance code. | ||
1626 | */ | ||
1627 | if (!ctx->rotate_disable) | ||
1628 | list_rotate_left(&ctx->flexible_groups); | ||
1607 | 1629 | ||
1608 | raw_spin_unlock(&ctx->lock); | 1630 | raw_spin_unlock(&ctx->lock); |
1609 | } | 1631 | } |
1610 | 1632 | ||
1611 | void perf_event_task_tick(struct task_struct *curr) | 1633 | /* |
1634 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
1635 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
1636 | * disabled, while rotate_context is called from IRQ context. | ||
1637 | */ | ||
1638 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
1612 | { | 1639 | { |
1613 | struct perf_cpu_context *cpuctx; | 1640 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
1614 | struct perf_event_context *ctx; | 1641 | struct perf_event_context *ctx = NULL; |
1615 | int rotate = 0; | 1642 | int rotate = 0, remove = 1; |
1616 | 1643 | ||
1617 | if (!atomic_read(&nr_events)) | 1644 | if (cpuctx->ctx.nr_events) { |
1618 | return; | 1645 | remove = 0; |
1619 | 1646 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | |
1620 | cpuctx = &__get_cpu_var(perf_cpu_context); | 1647 | rotate = 1; |
1621 | if (cpuctx->ctx.nr_events && | 1648 | } |
1622 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | ||
1623 | rotate = 1; | ||
1624 | 1649 | ||
1625 | ctx = curr->perf_event_ctxp; | 1650 | ctx = cpuctx->task_ctx; |
1626 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) | 1651 | if (ctx && ctx->nr_events) { |
1627 | rotate = 1; | 1652 | remove = 0; |
1653 | if (ctx->nr_events != ctx->nr_active) | ||
1654 | rotate = 1; | ||
1655 | } | ||
1628 | 1656 | ||
1629 | perf_ctx_adjust_freq(&cpuctx->ctx); | 1657 | perf_pmu_disable(cpuctx->ctx.pmu); |
1658 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
1630 | if (ctx) | 1659 | if (ctx) |
1631 | perf_ctx_adjust_freq(ctx); | 1660 | perf_ctx_adjust_freq(ctx, interval); |
1632 | 1661 | ||
1633 | if (!rotate) | 1662 | if (!rotate) |
1634 | return; | 1663 | goto done; |
1635 | 1664 | ||
1636 | perf_disable(); | ||
1637 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 1665 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1638 | if (ctx) | 1666 | if (ctx) |
1639 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 1667 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
@@ -1644,8 +1672,27 @@ void perf_event_task_tick(struct task_struct *curr) | |||
1644 | 1672 | ||
1645 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 1673 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
1646 | if (ctx) | 1674 | if (ctx) |
1647 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); | 1675 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1648 | perf_enable(); | 1676 | |
1677 | done: | ||
1678 | if (remove) | ||
1679 | list_del_init(&cpuctx->rotation_list); | ||
1680 | |||
1681 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
1682 | } | ||
1683 | |||
1684 | void perf_event_task_tick(void) | ||
1685 | { | ||
1686 | struct list_head *head = &__get_cpu_var(rotation_list); | ||
1687 | struct perf_cpu_context *cpuctx, *tmp; | ||
1688 | |||
1689 | WARN_ON(!irqs_disabled()); | ||
1690 | |||
1691 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | ||
1692 | if (cpuctx->jiffies_interval == 1 || | ||
1693 | !(jiffies % cpuctx->jiffies_interval)) | ||
1694 | perf_rotate_context(cpuctx); | ||
1695 | } | ||
1649 | } | 1696 | } |
1650 | 1697 | ||
1651 | static int event_enable_on_exec(struct perf_event *event, | 1698 | static int event_enable_on_exec(struct perf_event *event, |
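perf_event_task_tick() above now walks the per-CPU rotation_list and rotates a cpu-context only when its jiffies_interval is 1 or divides the current jiffies count, so a PMU can ask to be rotated less often than every tick. A tiny sketch of that interval test, with a made-up interval value:

#include <stdio.h>

/* Rotate either every tick (interval == 1) or when the tick count is a
 * multiple of the interval -- the same test perf_event_task_tick() uses. */
static int should_rotate(unsigned long jiffies, unsigned int interval)
{
	return interval == 1 || !(jiffies % interval);
}

int main(void)
{
	unsigned long j;

	/* A hypothetical cpu-context that asked to be rotated every 4 ticks. */
	for (j = 1000; j < 1008; j++)
		printf("jiffies=%lu rotate=%d\n", j, should_rotate(j, 4));
	return 0;
}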
@@ -1667,20 +1714,18 @@ static int event_enable_on_exec(struct perf_event *event, | |||
1667 | * Enable all of a task's events that have been marked enable-on-exec. | 1714 | * Enable all of a task's events that have been marked enable-on-exec. |
1668 | * This expects task == current. | 1715 | * This expects task == current. |
1669 | */ | 1716 | */ |
1670 | static void perf_event_enable_on_exec(struct task_struct *task) | 1717 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
1671 | { | 1718 | { |
1672 | struct perf_event_context *ctx; | ||
1673 | struct perf_event *event; | 1719 | struct perf_event *event; |
1674 | unsigned long flags; | 1720 | unsigned long flags; |
1675 | int enabled = 0; | 1721 | int enabled = 0; |
1676 | int ret; | 1722 | int ret; |
1677 | 1723 | ||
1678 | local_irq_save(flags); | 1724 | local_irq_save(flags); |
1679 | ctx = task->perf_event_ctxp; | ||
1680 | if (!ctx || !ctx->nr_events) | 1725 | if (!ctx || !ctx->nr_events) |
1681 | goto out; | 1726 | goto out; |
1682 | 1727 | ||
1683 | __perf_event_task_sched_out(ctx); | 1728 | task_ctx_sched_out(ctx, EVENT_ALL); |
1684 | 1729 | ||
1685 | raw_spin_lock(&ctx->lock); | 1730 | raw_spin_lock(&ctx->lock); |
1686 | 1731 | ||
@@ -1704,8 +1749,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1704 | 1749 | ||
1705 | raw_spin_unlock(&ctx->lock); | 1750 | raw_spin_unlock(&ctx->lock); |
1706 | 1751 | ||
1707 | perf_event_task_sched_in(task); | 1752 | perf_event_context_sched_in(ctx); |
1708 | out: | 1753 | out: |
1709 | local_irq_restore(flags); | 1754 | local_irq_restore(flags); |
1710 | } | 1755 | } |
1711 | 1756 | ||
@@ -1714,9 +1759,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1714 | */ | 1759 | */ |
1715 | static void __perf_event_read(void *info) | 1760 | static void __perf_event_read(void *info) |
1716 | { | 1761 | { |
1717 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1718 | struct perf_event *event = info; | 1762 | struct perf_event *event = info; |
1719 | struct perf_event_context *ctx = event->ctx; | 1763 | struct perf_event_context *ctx = event->ctx; |
1764 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
1720 | 1765 | ||
1721 | /* | 1766 | /* |
1722 | * If this is a task context, we need to check whether it is | 1767 | * If this is a task context, we need to check whether it is |
@@ -1755,7 +1800,13 @@ static u64 perf_event_read(struct perf_event *event) | |||
1755 | unsigned long flags; | 1800 | unsigned long flags; |
1756 | 1801 | ||
1757 | raw_spin_lock_irqsave(&ctx->lock, flags); | 1802 | raw_spin_lock_irqsave(&ctx->lock, flags); |
1758 | update_context_time(ctx); | 1803 | /* |
1804 | * may read while context is not active | ||
1805 | * (e.g., thread is blocked), in that case | ||
1806 | * we cannot update context time | ||
1807 | */ | ||
1808 | if (ctx->is_active) | ||
1809 | update_context_time(ctx); | ||
1759 | update_event_times(event); | 1810 | update_event_times(event); |
1760 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1811 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1761 | } | 1812 | } |
@@ -1764,11 +1815,219 @@ static u64 perf_event_read(struct perf_event *event) | |||
1764 | } | 1815 | } |
1765 | 1816 | ||
1766 | /* | 1817 | /* |
1767 | * Initialize the perf_event context in a task_struct: | 1818 | * Callchain support |
1768 | */ | 1819 | */ |
1820 | |||
1821 | struct callchain_cpus_entries { | ||
1822 | struct rcu_head rcu_head; | ||
1823 | struct perf_callchain_entry *cpu_entries[0]; | ||
1824 | }; | ||
1825 | |||
1826 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
1827 | static atomic_t nr_callchain_events; | ||
1828 | static DEFINE_MUTEX(callchain_mutex); | ||
1829 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
1830 | |||
1831 | |||
1832 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
1833 | struct pt_regs *regs) | ||
1834 | { | ||
1835 | } | ||
1836 | |||
1837 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
1838 | struct pt_regs *regs) | ||
1839 | { | ||
1840 | } | ||
1841 | |||
1842 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
1843 | { | ||
1844 | struct callchain_cpus_entries *entries; | ||
1845 | int cpu; | ||
1846 | |||
1847 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
1848 | |||
1849 | for_each_possible_cpu(cpu) | ||
1850 | kfree(entries->cpu_entries[cpu]); | ||
1851 | |||
1852 | kfree(entries); | ||
1853 | } | ||
1854 | |||
1855 | static void release_callchain_buffers(void) | ||
1856 | { | ||
1857 | struct callchain_cpus_entries *entries; | ||
1858 | |||
1859 | entries = callchain_cpus_entries; | ||
1860 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
1861 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
1862 | } | ||
1863 | |||
1864 | static int alloc_callchain_buffers(void) | ||
1865 | { | ||
1866 | int cpu; | ||
1867 | int size; | ||
1868 | struct callchain_cpus_entries *entries; | ||
1869 | |||
1870 | /* | ||
1871 | * We can't use the percpu allocation API for data that can be | ||
1872 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
1873 | * until that gets sorted out. | ||
1874 | */ | ||
1875 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | ||
1876 | num_possible_cpus(); | ||
1877 | |||
1878 | entries = kzalloc(size, GFP_KERNEL); | ||
1879 | if (!entries) | ||
1880 | return -ENOMEM; | ||
1881 | |||
1882 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
1883 | |||
1884 | for_each_possible_cpu(cpu) { | ||
1885 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
1886 | cpu_to_node(cpu)); | ||
1887 | if (!entries->cpu_entries[cpu]) | ||
1888 | goto fail; | ||
1889 | } | ||
1890 | |||
1891 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
1892 | |||
1893 | return 0; | ||
1894 | |||
1895 | fail: | ||
1896 | for_each_possible_cpu(cpu) | ||
1897 | kfree(entries->cpu_entries[cpu]); | ||
1898 | kfree(entries); | ||
1899 | |||
1900 | return -ENOMEM; | ||
1901 | } | ||
1902 | |||
1903 | static int get_callchain_buffers(void) | ||
1904 | { | ||
1905 | int err = 0; | ||
1906 | int count; | ||
1907 | |||
1908 | mutex_lock(&callchain_mutex); | ||
1909 | |||
1910 | count = atomic_inc_return(&nr_callchain_events); | ||
1911 | if (WARN_ON_ONCE(count < 1)) { | ||
1912 | err = -EINVAL; | ||
1913 | goto exit; | ||
1914 | } | ||
1915 | |||
1916 | if (count > 1) { | ||
1917 | /* If the allocation failed, give up */ | ||
1918 | if (!callchain_cpus_entries) | ||
1919 | err = -ENOMEM; | ||
1920 | goto exit; | ||
1921 | } | ||
1922 | |||
1923 | err = alloc_callchain_buffers(); | ||
1924 | if (err) | ||
1925 | release_callchain_buffers(); | ||
1926 | exit: | ||
1927 | mutex_unlock(&callchain_mutex); | ||
1928 | |||
1929 | return err; | ||
1930 | } | ||
1931 | |||
1932 | static void put_callchain_buffers(void) | ||
1933 | { | ||
1934 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
1935 | release_callchain_buffers(); | ||
1936 | mutex_unlock(&callchain_mutex); | ||
1937 | } | ||
1938 | } | ||
1939 | |||
1940 | static int get_recursion_context(int *recursion) | ||
1941 | { | ||
1942 | int rctx; | ||
1943 | |||
1944 | if (in_nmi()) | ||
1945 | rctx = 3; | ||
1946 | else if (in_irq()) | ||
1947 | rctx = 2; | ||
1948 | else if (in_softirq()) | ||
1949 | rctx = 1; | ||
1950 | else | ||
1951 | rctx = 0; | ||
1952 | |||
1953 | if (recursion[rctx]) | ||
1954 | return -1; | ||
1955 | |||
1956 | recursion[rctx]++; | ||
1957 | barrier(); | ||
1958 | |||
1959 | return rctx; | ||
1960 | } | ||
1961 | |||
1962 | static inline void put_recursion_context(int *recursion, int rctx) | ||
1963 | { | ||
1964 | barrier(); | ||
1965 | recursion[rctx]--; | ||
1966 | } | ||
1967 | |||
1968 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
1969 | { | ||
1970 | int cpu; | ||
1971 | struct callchain_cpus_entries *entries; | ||
1972 | |||
1973 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
1974 | if (*rctx == -1) | ||
1975 | return NULL; | ||
1976 | |||
1977 | entries = rcu_dereference(callchain_cpus_entries); | ||
1978 | if (!entries) | ||
1979 | return NULL; | ||
1980 | |||
1981 | cpu = smp_processor_id(); | ||
1982 | |||
1983 | return &entries->cpu_entries[cpu][*rctx]; | ||
1984 | } | ||
1985 | |||
1769 | static void | 1986 | static void |
1770 | __perf_event_init_context(struct perf_event_context *ctx, | 1987 | put_callchain_entry(int rctx) |
1771 | struct task_struct *task) | 1988 | { |
1989 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
1990 | } | ||
1991 | |||
1992 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
1993 | { | ||
1994 | int rctx; | ||
1995 | struct perf_callchain_entry *entry; | ||
1996 | |||
1997 | |||
1998 | entry = get_callchain_entry(&rctx); | ||
1999 | if (rctx == -1) | ||
2000 | return NULL; | ||
2001 | |||
2002 | if (!entry) | ||
2003 | goto exit_put; | ||
2004 | |||
2005 | entry->nr = 0; | ||
2006 | |||
2007 | if (!user_mode(regs)) { | ||
2008 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2009 | perf_callchain_kernel(entry, regs); | ||
2010 | if (current->mm) | ||
2011 | regs = task_pt_regs(current); | ||
2012 | else | ||
2013 | regs = NULL; | ||
2014 | } | ||
2015 | |||
2016 | if (regs) { | ||
2017 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2018 | perf_callchain_user(entry, regs); | ||
2019 | } | ||
2020 | |||
2021 | exit_put: | ||
2022 | put_callchain_entry(rctx); | ||
2023 | |||
2024 | return entry; | ||
2025 | } | ||
2026 | |||
2027 | /* | ||
2028 | * Initialize the perf_event context in a task_struct: | ||
2029 | */ | ||
2030 | static void __perf_event_init_context(struct perf_event_context *ctx) | ||
1772 | { | 2031 | { |
1773 | raw_spin_lock_init(&ctx->lock); | 2032 | raw_spin_lock_init(&ctx->lock); |
1774 | mutex_init(&ctx->mutex); | 2033 | mutex_init(&ctx->mutex); |
@@ -1776,45 +2035,38 @@ __perf_event_init_context(struct perf_event_context *ctx, | |||
1776 | INIT_LIST_HEAD(&ctx->flexible_groups); | 2035 | INIT_LIST_HEAD(&ctx->flexible_groups); |
1777 | INIT_LIST_HEAD(&ctx->event_list); | 2036 | INIT_LIST_HEAD(&ctx->event_list); |
1778 | atomic_set(&ctx->refcount, 1); | 2037 | atomic_set(&ctx->refcount, 1); |
1779 | ctx->task = task; | ||
1780 | } | 2038 | } |
1781 | 2039 | ||
1782 | static struct perf_event_context *find_get_context(pid_t pid, int cpu) | 2040 | static struct perf_event_context * |
2041 | alloc_perf_context(struct pmu *pmu, struct task_struct *task) | ||
1783 | { | 2042 | { |
1784 | struct perf_event_context *ctx; | 2043 | struct perf_event_context *ctx; |
1785 | struct perf_cpu_context *cpuctx; | ||
1786 | struct task_struct *task; | ||
1787 | unsigned long flags; | ||
1788 | int err; | ||
1789 | 2044 | ||
1790 | if (pid == -1 && cpu != -1) { | 2045 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); |
1791 | /* Must be root to operate on a CPU event: */ | 2046 | if (!ctx) |
1792 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 2047 | return NULL; |
1793 | return ERR_PTR(-EACCES); | ||
1794 | |||
1795 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
1796 | return ERR_PTR(-EINVAL); | ||
1797 | 2048 | ||
1798 | /* | 2049 | __perf_event_init_context(ctx); |
1799 | * We could be clever and allow to attach a event to an | 2050 | if (task) { |
1800 | * offline CPU and activate it when the CPU comes up, but | 2051 | ctx->task = task; |
1801 | * that's for later. | 2052 | get_task_struct(task); |
1802 | */ | 2053 | } |
1803 | if (!cpu_online(cpu)) | 2054 | ctx->pmu = pmu; |
1804 | return ERR_PTR(-ENODEV); | ||
1805 | 2055 | ||
1806 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 2056 | return ctx; |
1807 | ctx = &cpuctx->ctx; | 2057 | } |
1808 | get_ctx(ctx); | ||
1809 | 2058 | ||
1810 | return ctx; | 2059 | static struct task_struct * |
1811 | } | 2060 | find_lively_task_by_vpid(pid_t vpid) |
2061 | { | ||
2062 | struct task_struct *task; | ||
2063 | int err; | ||
1812 | 2064 | ||
1813 | rcu_read_lock(); | 2065 | rcu_read_lock(); |
1814 | if (!pid) | 2066 | if (!vpid) |
1815 | task = current; | 2067 | task = current; |
1816 | else | 2068 | else |
1817 | task = find_task_by_vpid(pid); | 2069 | task = find_task_by_vpid(vpid); |
1818 | if (task) | 2070 | if (task) |
1819 | get_task_struct(task); | 2071 | get_task_struct(task); |
1820 | rcu_read_unlock(); | 2072 | rcu_read_unlock(); |
@@ -1834,36 +2086,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) | |||
1834 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2086 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
1835 | goto errout; | 2087 | goto errout; |
1836 | 2088 | ||
1837 | retry: | 2089 | return task; |
1838 | ctx = perf_lock_task_context(task, &flags); | 2090 | errout: |
2091 | put_task_struct(task); | ||
2092 | return ERR_PTR(err); | ||
2093 | |||
2094 | } | ||
2095 | |||
2096 | static struct perf_event_context * | ||
2097 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | ||
2098 | { | ||
2099 | struct perf_event_context *ctx; | ||
2100 | struct perf_cpu_context *cpuctx; | ||
2101 | unsigned long flags; | ||
2102 | int ctxn, err; | ||
2103 | |||
2104 | if (!task && cpu != -1) { | ||
2105 | /* Must be root to operate on a CPU event: */ | ||
2106 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
2107 | return ERR_PTR(-EACCES); | ||
2108 | |||
2109 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
2110 | return ERR_PTR(-EINVAL); | ||
2111 | |||
2112 | /* | ||
2113 | * We could be clever and allow attaching an event to an | ||
2114 | * offline CPU and activate it when the CPU comes up, but | ||
2115 | * that's for later. | ||
2116 | */ | ||
2117 | if (!cpu_online(cpu)) | ||
2118 | return ERR_PTR(-ENODEV); | ||
2119 | |||
2120 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
2121 | ctx = &cpuctx->ctx; | ||
2122 | get_ctx(ctx); | ||
2123 | |||
2124 | return ctx; | ||
2125 | } | ||
2126 | |||
2127 | err = -EINVAL; | ||
2128 | ctxn = pmu->task_ctx_nr; | ||
2129 | if (ctxn < 0) | ||
2130 | goto errout; | ||
2131 | |||
2132 | retry: | ||
2133 | ctx = perf_lock_task_context(task, ctxn, &flags); | ||
1839 | if (ctx) { | 2134 | if (ctx) { |
1840 | unclone_ctx(ctx); | 2135 | unclone_ctx(ctx); |
1841 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2136 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1842 | } | 2137 | } |
1843 | 2138 | ||
1844 | if (!ctx) { | 2139 | if (!ctx) { |
1845 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); | 2140 | ctx = alloc_perf_context(pmu, task); |
1846 | err = -ENOMEM; | 2141 | err = -ENOMEM; |
1847 | if (!ctx) | 2142 | if (!ctx) |
1848 | goto errout; | 2143 | goto errout; |
1849 | __perf_event_init_context(ctx, task); | 2144 | |
1850 | get_ctx(ctx); | 2145 | get_ctx(ctx); |
1851 | if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { | 2146 | |
2147 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | ||
1852 | /* | 2148 | /* |
1853 | * We raced with some other task; use | 2149 | * We raced with some other task; use |
1854 | * the context they set. | 2150 | * the context they set. |
1855 | */ | 2151 | */ |
2152 | put_task_struct(task); | ||
1856 | kfree(ctx); | 2153 | kfree(ctx); |
1857 | goto retry; | 2154 | goto retry; |
1858 | } | 2155 | } |
1859 | get_task_struct(task); | ||
1860 | } | 2156 | } |
1861 | 2157 | ||
1862 | put_task_struct(task); | ||
1863 | return ctx; | 2158 | return ctx; |
1864 | 2159 | ||
1865 | errout: | 2160 | errout: |
1866 | put_task_struct(task); | ||
1867 | return ERR_PTR(err); | 2161 | return ERR_PTR(err); |
1868 | } | 2162 | } |
1869 | 2163 | ||
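The retry loop above installs a freshly allocated context with cmpxchg() and falls back to retrying when another thread wins the race; the loser frees its allocation and looks again. A minimal user-space sketch of that install-or-retry idiom, using GCC atomic builtins instead of the kernel's cmpxchg() (the ctx_slot name and the plain calloc() stand in for task->perf_event_ctxp[ctxn] and alloc_perf_context(); all of it is illustrative only):

#include <stdlib.h>
#include <stdbool.h>

struct ctx { int refcount; };

/* stand-in for task->perf_event_ctxp[ctxn] */
static struct ctx *ctx_slot;

static struct ctx *get_or_install_ctx(void)
{
	struct ctx *old, *new;

retry:
	old = __atomic_load_n(&ctx_slot, __ATOMIC_ACQUIRE);
	if (old)
		return old;			/* someone already installed one */

	new = calloc(1, sizeof(*new));		/* plays the role of alloc_perf_context() */
	if (!new)
		return NULL;

	/* publish; if a racer beat us to it, drop ours and go look again */
	if (!__atomic_compare_exchange_n(&ctx_slot, &old, new, false,
					 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
		free(new);
		goto retry;
	}
	return new;
}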
@@ -1880,21 +2174,23 @@ static void free_event_rcu(struct rcu_head *head) | |||
1880 | kfree(event); | 2174 | kfree(event); |
1881 | } | 2175 | } |
1882 | 2176 | ||
1883 | static void perf_pending_sync(struct perf_event *event); | ||
1884 | static void perf_buffer_put(struct perf_buffer *buffer); | 2177 | static void perf_buffer_put(struct perf_buffer *buffer); |
1885 | 2178 | ||
1886 | static void free_event(struct perf_event *event) | 2179 | static void free_event(struct perf_event *event) |
1887 | { | 2180 | { |
1888 | perf_pending_sync(event); | 2181 | irq_work_sync(&event->pending); |
1889 | 2182 | ||
1890 | if (!event->parent) { | 2183 | if (!event->parent) { |
1891 | atomic_dec(&nr_events); | 2184 | if (event->attach_state & PERF_ATTACH_TASK) |
2185 | jump_label_dec(&perf_task_events); | ||
1892 | if (event->attr.mmap || event->attr.mmap_data) | 2186 | if (event->attr.mmap || event->attr.mmap_data) |
1893 | atomic_dec(&nr_mmap_events); | 2187 | atomic_dec(&nr_mmap_events); |
1894 | if (event->attr.comm) | 2188 | if (event->attr.comm) |
1895 | atomic_dec(&nr_comm_events); | 2189 | atomic_dec(&nr_comm_events); |
1896 | if (event->attr.task) | 2190 | if (event->attr.task) |
1897 | atomic_dec(&nr_task_events); | 2191 | atomic_dec(&nr_task_events); |
2192 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
2193 | put_callchain_buffers(); | ||
1898 | } | 2194 | } |
1899 | 2195 | ||
1900 | if (event->buffer) { | 2196 | if (event->buffer) { |
@@ -1905,7 +2201,9 @@ static void free_event(struct perf_event *event) | |||
1905 | if (event->destroy) | 2201 | if (event->destroy) |
1906 | event->destroy(event); | 2202 | event->destroy(event); |
1907 | 2203 | ||
1908 | put_ctx(event->ctx); | 2204 | if (event->ctx) |
2205 | put_ctx(event->ctx); | ||
2206 | |||
1909 | call_rcu(&event->rcu_head, free_event_rcu); | 2207 | call_rcu(&event->rcu_head, free_event_rcu); |
1910 | } | 2208 | } |
1911 | 2209 | ||
@@ -1939,11 +2237,6 @@ int perf_event_release_kernel(struct perf_event *event) | |||
1939 | raw_spin_unlock_irq(&ctx->lock); | 2237 | raw_spin_unlock_irq(&ctx->lock); |
1940 | mutex_unlock(&ctx->mutex); | 2238 | mutex_unlock(&ctx->mutex); |
1941 | 2239 | ||
1942 | mutex_lock(&event->owner->perf_event_mutex); | ||
1943 | list_del_init(&event->owner_entry); | ||
1944 | mutex_unlock(&event->owner->perf_event_mutex); | ||
1945 | put_task_struct(event->owner); | ||
1946 | |||
1947 | free_event(event); | 2240 | free_event(event); |
1948 | 2241 | ||
1949 | return 0; | 2242 | return 0; |
@@ -1956,9 +2249,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
1956 | static int perf_release(struct inode *inode, struct file *file) | 2249 | static int perf_release(struct inode *inode, struct file *file) |
1957 | { | 2250 | { |
1958 | struct perf_event *event = file->private_data; | 2251 | struct perf_event *event = file->private_data; |
2252 | struct task_struct *owner; | ||
1959 | 2253 | ||
1960 | file->private_data = NULL; | 2254 | file->private_data = NULL; |
1961 | 2255 | ||
2256 | rcu_read_lock(); | ||
2257 | owner = ACCESS_ONCE(event->owner); | ||
2258 | /* | ||
2259 | * Matches the smp_wmb() in perf_event_exit_task(). If we observe | ||
2260 | * !owner it means the list deletion is complete and we can indeed | ||
2261 | * free this event; otherwise we need to serialize on | ||
2262 | * owner->perf_event_mutex. | ||
2263 | */ | ||
2264 | smp_read_barrier_depends(); | ||
2265 | if (owner) { | ||
2266 | /* | ||
2267 | * Since delayed_put_task_struct() also drops the last | ||
2268 | * task reference we can safely take a new reference | ||
2269 | * while holding the rcu_read_lock(). | ||
2270 | */ | ||
2271 | get_task_struct(owner); | ||
2272 | } | ||
2273 | rcu_read_unlock(); | ||
2274 | |||
2275 | if (owner) { | ||
2276 | mutex_lock(&owner->perf_event_mutex); | ||
2277 | /* | ||
2278 | * We have to re-check the event->owner field; if it is cleared | ||
2279 | * we raced with perf_event_exit_task(). Acquiring the mutex | ||
2280 | * ensures they are done, and we can proceed with freeing the | ||
2281 | * event. | ||
2282 | */ | ||
2283 | if (event->owner) | ||
2284 | list_del_init(&event->owner_entry); | ||
2285 | mutex_unlock(&owner->perf_event_mutex); | ||
2286 | put_task_struct(owner); | ||
2287 | } | ||
2288 | |||
1962 | return perf_event_release_kernel(event); | 2289 | return perf_event_release_kernel(event); |
1963 | } | 2290 | } |
1964 | 2291 | ||
@@ -2184,15 +2511,13 @@ static void perf_event_for_each(struct perf_event *event, | |||
2184 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 2511 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
2185 | { | 2512 | { |
2186 | struct perf_event_context *ctx = event->ctx; | 2513 | struct perf_event_context *ctx = event->ctx; |
2187 | unsigned long size; | ||
2188 | int ret = 0; | 2514 | int ret = 0; |
2189 | u64 value; | 2515 | u64 value; |
2190 | 2516 | ||
2191 | if (!event->attr.sample_period) | 2517 | if (!event->attr.sample_period) |
2192 | return -EINVAL; | 2518 | return -EINVAL; |
2193 | 2519 | ||
2194 | size = copy_from_user(&value, arg, sizeof(value)); | 2520 | if (copy_from_user(&value, arg, sizeof(value))) |
2195 | if (size != sizeof(value)) | ||
2196 | return -EFAULT; | 2521 | return -EFAULT; |
2197 | 2522 | ||
2198 | if (!value) | 2523 | if (!value) |
@@ -2326,6 +2651,9 @@ int perf_event_task_disable(void) | |||
2326 | 2651 | ||
2327 | static int perf_event_index(struct perf_event *event) | 2652 | static int perf_event_index(struct perf_event *event) |
2328 | { | 2653 | { |
2654 | if (event->hw.state & PERF_HES_STOPPED) | ||
2655 | return 0; | ||
2656 | |||
2329 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2657 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2330 | return 0; | 2658 | return 0; |
2331 | 2659 | ||
@@ -2829,16 +3157,7 @@ void perf_event_wakeup(struct perf_event *event) | |||
2829 | } | 3157 | } |
2830 | } | 3158 | } |
2831 | 3159 | ||
2832 | /* | 3160 | static void perf_pending_event(struct irq_work *entry) |
2833 | * Pending wakeups | ||
2834 | * | ||
2835 | * Handle the case where we need to wakeup up from NMI (or rq->lock) context. | ||
2836 | * | ||
2837 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a | ||
2838 | * single linked list and use cmpxchg() to add entries lockless. | ||
2839 | */ | ||
2840 | |||
2841 | static void perf_pending_event(struct perf_pending_entry *entry) | ||
2842 | { | 3161 | { |
2843 | struct perf_event *event = container_of(entry, | 3162 | struct perf_event *event = container_of(entry, |
2844 | struct perf_event, pending); | 3163 | struct perf_event, pending); |
@@ -2854,99 +3173,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) | |||
2854 | } | 3173 | } |
2855 | } | 3174 | } |
2856 | 3175 | ||
2857 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) | ||
2858 | |||
2859 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { | ||
2860 | PENDING_TAIL, | ||
2861 | }; | ||
2862 | |||
2863 | static void perf_pending_queue(struct perf_pending_entry *entry, | ||
2864 | void (*func)(struct perf_pending_entry *)) | ||
2865 | { | ||
2866 | struct perf_pending_entry **head; | ||
2867 | |||
2868 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) | ||
2869 | return; | ||
2870 | |||
2871 | entry->func = func; | ||
2872 | |||
2873 | head = &get_cpu_var(perf_pending_head); | ||
2874 | |||
2875 | do { | ||
2876 | entry->next = *head; | ||
2877 | } while (cmpxchg(head, entry->next, entry) != entry->next); | ||
2878 | |||
2879 | set_perf_event_pending(); | ||
2880 | |||
2881 | put_cpu_var(perf_pending_head); | ||
2882 | } | ||
2883 | |||
2884 | static int __perf_pending_run(void) | ||
2885 | { | ||
2886 | struct perf_pending_entry *list; | ||
2887 | int nr = 0; | ||
2888 | |||
2889 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); | ||
2890 | while (list != PENDING_TAIL) { | ||
2891 | void (*func)(struct perf_pending_entry *); | ||
2892 | struct perf_pending_entry *entry = list; | ||
2893 | |||
2894 | list = list->next; | ||
2895 | |||
2896 | func = entry->func; | ||
2897 | entry->next = NULL; | ||
2898 | /* | ||
2899 | * Ensure we observe the unqueue before we issue the wakeup, | ||
2900 | * so that we won't be waiting forever. | ||
2901 | * -- see perf_not_pending(). | ||
2902 | */ | ||
2903 | smp_wmb(); | ||
2904 | |||
2905 | func(entry); | ||
2906 | nr++; | ||
2907 | } | ||
2908 | |||
2909 | return nr; | ||
2910 | } | ||
2911 | |||
2912 | static inline int perf_not_pending(struct perf_event *event) | ||
2913 | { | ||
2914 | /* | ||
2915 | * If we flush on whatever cpu we run, there is a chance we don't | ||
2916 | * need to wait. | ||
2917 | */ | ||
2918 | get_cpu(); | ||
2919 | __perf_pending_run(); | ||
2920 | put_cpu(); | ||
2921 | |||
2922 | /* | ||
2923 | * Ensure we see the proper queue state before going to sleep | ||
2924 | * so that we do not miss the wakeup. -- see perf_pending_handle() | ||
2925 | */ | ||
2926 | smp_rmb(); | ||
2927 | return event->pending.next == NULL; | ||
2928 | } | ||
2929 | |||
2930 | static void perf_pending_sync(struct perf_event *event) | ||
2931 | { | ||
2932 | wait_event(event->waitq, perf_not_pending(event)); | ||
2933 | } | ||
2934 | |||
2935 | void perf_event_do_pending(void) | ||
2936 | { | ||
2937 | __perf_pending_run(); | ||
2938 | } | ||
2939 | |||
2940 | /* | ||
2941 | * Callchain support -- arch specific | ||
2942 | */ | ||
2943 | |||
2944 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2945 | { | ||
2946 | return NULL; | ||
2947 | } | ||
2948 | |||
2949 | |||
2950 | /* | 3176 | /* |
2951 | * We assume there is only KVM supporting the callbacks. | 3177 | * We assume there is only KVM supporting the callbacks. |
2952 | * Later on, we might change it to a list if there is | 3178 | * Later on, we might change it to a list if there is |
@@ -2996,8 +3222,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
2996 | 3222 | ||
2997 | if (handle->nmi) { | 3223 | if (handle->nmi) { |
2998 | handle->event->pending_wakeup = 1; | 3224 | handle->event->pending_wakeup = 1; |
2999 | perf_pending_queue(&handle->event->pending, | 3225 | irq_work_queue(&handle->event->pending); |
3000 | perf_pending_event); | ||
3001 | } else | 3226 | } else |
3002 | perf_event_wakeup(handle->event); | 3227 | perf_event_wakeup(handle->event); |
3003 | } | 3228 | } |
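The irq_work_queue() call above (together with the irq_work_sync() in free_event()) replaces the hand-rolled lockless pending list removed earlier: deferring a wakeup out of NMI context is now the generic irq_work facility's job. A sketch of how a client of that facility looks, with made-up my_* names:

#include <linux/irq_work.h>

static struct irq_work my_work;

/* runs later, in IRQ context, where taking locks and waking tasks is safe */
static void my_work_func(struct irq_work *entry)
{
	/* e.g. wake up waiters on a queue */
}

static void my_init(void)
{
	init_irq_work(&my_work, my_work_func);
}

/* called from NMI context: no locks allowed, just queue and return */
static void my_nmi_handler(void)
{
	irq_work_queue(&my_work);
}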
@@ -3053,7 +3278,7 @@ again: | |||
3053 | if (handle->wakeup != local_read(&buffer->wakeup)) | 3278 | if (handle->wakeup != local_read(&buffer->wakeup)) |
3054 | perf_output_wakeup(handle); | 3279 | perf_output_wakeup(handle); |
3055 | 3280 | ||
3056 | out: | 3281 | out: |
3057 | preempt_enable(); | 3282 | preempt_enable(); |
3058 | } | 3283 | } |
3059 | 3284 | ||
@@ -3205,7 +3430,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | |||
3205 | } | 3430 | } |
3206 | 3431 | ||
3207 | static void perf_output_read_one(struct perf_output_handle *handle, | 3432 | static void perf_output_read_one(struct perf_output_handle *handle, |
3208 | struct perf_event *event) | 3433 | struct perf_event *event, |
3434 | u64 enabled, u64 running) | ||
3209 | { | 3435 | { |
3210 | u64 read_format = event->attr.read_format; | 3436 | u64 read_format = event->attr.read_format; |
3211 | u64 values[4]; | 3437 | u64 values[4]; |
@@ -3213,11 +3439,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
3213 | 3439 | ||
3214 | values[n++] = perf_event_count(event); | 3440 | values[n++] = perf_event_count(event); |
3215 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | 3441 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { |
3216 | values[n++] = event->total_time_enabled + | 3442 | values[n++] = enabled + |
3217 | atomic64_read(&event->child_total_time_enabled); | 3443 | atomic64_read(&event->child_total_time_enabled); |
3218 | } | 3444 | } |
3219 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { | 3445 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { |
3220 | values[n++] = event->total_time_running + | 3446 | values[n++] = running + |
3221 | atomic64_read(&event->child_total_time_running); | 3447 | atomic64_read(&event->child_total_time_running); |
3222 | } | 3448 | } |
3223 | if (read_format & PERF_FORMAT_ID) | 3449 | if (read_format & PERF_FORMAT_ID) |
@@ -3230,7 +3456,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
3230 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. | 3456 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. |
3231 | */ | 3457 | */ |
3232 | static void perf_output_read_group(struct perf_output_handle *handle, | 3458 | static void perf_output_read_group(struct perf_output_handle *handle, |
3233 | struct perf_event *event) | 3459 | struct perf_event *event, |
3460 | u64 enabled, u64 running) | ||
3234 | { | 3461 | { |
3235 | struct perf_event *leader = event->group_leader, *sub; | 3462 | struct perf_event *leader = event->group_leader, *sub; |
3236 | u64 read_format = event->attr.read_format; | 3463 | u64 read_format = event->attr.read_format; |
@@ -3240,10 +3467,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3240 | values[n++] = 1 + leader->nr_siblings; | 3467 | values[n++] = 1 + leader->nr_siblings; |
3241 | 3468 | ||
3242 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | 3469 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) |
3243 | values[n++] = leader->total_time_enabled; | 3470 | values[n++] = enabled; |
3244 | 3471 | ||
3245 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 3472 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
3246 | values[n++] = leader->total_time_running; | 3473 | values[n++] = running; |
3247 | 3474 | ||
3248 | if (leader != event) | 3475 | if (leader != event) |
3249 | leader->pmu->read(leader); | 3476 | leader->pmu->read(leader); |
@@ -3268,13 +3495,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3268 | } | 3495 | } |
3269 | } | 3496 | } |
3270 | 3497 | ||
3498 | #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ | ||
3499 | PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
3500 | |||
3271 | static void perf_output_read(struct perf_output_handle *handle, | 3501 | static void perf_output_read(struct perf_output_handle *handle, |
3272 | struct perf_event *event) | 3502 | struct perf_event *event) |
3273 | { | 3503 | { |
3504 | u64 enabled = 0, running = 0, now, ctx_time; | ||
3505 | u64 read_format = event->attr.read_format; | ||
3506 | |||
3507 | /* | ||
3508 | * compute total_time_enabled, total_time_running | ||
3509 | * based on snapshot values taken when the event | ||
3510 | * was last scheduled in. | ||
3511 | * | ||
3512 | * we cannot simply call update_context_time() | ||
3513 | * because of locking issues, as we are called in | ||
3514 | * NMI context | ||
3515 | */ | ||
3516 | if (read_format & PERF_FORMAT_TOTAL_TIMES) { | ||
3517 | now = perf_clock(); | ||
3518 | ctx_time = event->shadow_ctx_time + now; | ||
3519 | enabled = ctx_time - event->tstamp_enabled; | ||
3520 | running = ctx_time - event->tstamp_running; | ||
3521 | } | ||
3522 | |||
3274 | if (event->attr.read_format & PERF_FORMAT_GROUP) | 3523 | if (event->attr.read_format & PERF_FORMAT_GROUP) |
3275 | perf_output_read_group(handle, event); | 3524 | perf_output_read_group(handle, event, enabled, running); |
3276 | else | 3525 | else |
3277 | perf_output_read_one(handle, event); | 3526 | perf_output_read_one(handle, event, enabled, running); |
3278 | } | 3527 | } |
3279 | 3528 | ||
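The arithmetic above recovers both times from snapshot values without touching the context lock: shadow_ctx_time is an offset chosen at schedule-in time so that shadow_ctx_time + perf_clock() equals the context's current time, and subtracting the enable/run timestamps yields the totals. A standalone sketch of the same computation (the field names mirror the code above, but the struct itself is illustrative):

#include <stdint.h>

struct time_snapshot {
	uint64_t shadow_ctx_time;	/* base such that base + now == ctx time */
	uint64_t tstamp_enabled;	/* ctx time when the event was enabled */
	uint64_t tstamp_running;	/* ctx time when the event last started running */
};

static void compute_times(const struct time_snapshot *s, uint64_t now,
			  uint64_t *enabled, uint64_t *running)
{
	uint64_t ctx_time = s->shadow_ctx_time + now;	/* no locks, pure arithmetic */

	*enabled = ctx_time - s->tstamp_enabled;
	*running = ctx_time - s->tstamp_running;
}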
3280 | void perf_output_sample(struct perf_output_handle *handle, | 3529 | void perf_output_sample(struct perf_output_handle *handle, |
@@ -3441,14 +3690,20 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
3441 | struct perf_output_handle handle; | 3690 | struct perf_output_handle handle; |
3442 | struct perf_event_header header; | 3691 | struct perf_event_header header; |
3443 | 3692 | ||
3693 | /* protect the callchain buffers */ | ||
3694 | rcu_read_lock(); | ||
3695 | |||
3444 | perf_prepare_sample(&header, data, event, regs); | 3696 | perf_prepare_sample(&header, data, event, regs); |
3445 | 3697 | ||
3446 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 3698 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) |
3447 | return; | 3699 | goto exit; |
3448 | 3700 | ||
3449 | perf_output_sample(&handle, &header, data, event); | 3701 | perf_output_sample(&handle, &header, data, event); |
3450 | 3702 | ||
3451 | perf_output_end(&handle); | 3703 | perf_output_end(&handle); |
3704 | |||
3705 | exit: | ||
3706 | rcu_read_unlock(); | ||
3452 | } | 3707 | } |
3453 | 3708 | ||
3454 | /* | 3709 | /* |
@@ -3562,16 +3817,29 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, | |||
3562 | static void perf_event_task_event(struct perf_task_event *task_event) | 3817 | static void perf_event_task_event(struct perf_task_event *task_event) |
3563 | { | 3818 | { |
3564 | struct perf_cpu_context *cpuctx; | 3819 | struct perf_cpu_context *cpuctx; |
3565 | struct perf_event_context *ctx = task_event->task_ctx; | 3820 | struct perf_event_context *ctx; |
3821 | struct pmu *pmu; | ||
3822 | int ctxn; | ||
3566 | 3823 | ||
3567 | rcu_read_lock(); | 3824 | rcu_read_lock(); |
3568 | cpuctx = &get_cpu_var(perf_cpu_context); | 3825 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3569 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3826 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3570 | if (!ctx) | 3827 | if (cpuctx->active_pmu != pmu) |
3571 | ctx = rcu_dereference(current->perf_event_ctxp); | 3828 | goto next; |
3572 | if (ctx) | 3829 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3573 | perf_event_task_ctx(ctx, task_event); | 3830 | |
3574 | put_cpu_var(perf_cpu_context); | 3831 | ctx = task_event->task_ctx; |
3832 | if (!ctx) { | ||
3833 | ctxn = pmu->task_ctx_nr; | ||
3834 | if (ctxn < 0) | ||
3835 | goto next; | ||
3836 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3837 | } | ||
3838 | if (ctx) | ||
3839 | perf_event_task_ctx(ctx, task_event); | ||
3840 | next: | ||
3841 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3842 | } | ||
3575 | rcu_read_unlock(); | 3843 | rcu_read_unlock(); |
3576 | } | 3844 | } |
3577 | 3845 | ||
@@ -3676,8 +3944,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3676 | { | 3944 | { |
3677 | struct perf_cpu_context *cpuctx; | 3945 | struct perf_cpu_context *cpuctx; |
3678 | struct perf_event_context *ctx; | 3946 | struct perf_event_context *ctx; |
3679 | unsigned int size; | ||
3680 | char comm[TASK_COMM_LEN]; | 3947 | char comm[TASK_COMM_LEN]; |
3948 | unsigned int size; | ||
3949 | struct pmu *pmu; | ||
3950 | int ctxn; | ||
3681 | 3951 | ||
3682 | memset(comm, 0, sizeof(comm)); | 3952 | memset(comm, 0, sizeof(comm)); |
3683 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); | 3953 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
@@ -3689,21 +3959,38 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3689 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 3959 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3690 | 3960 | ||
3691 | rcu_read_lock(); | 3961 | rcu_read_lock(); |
3692 | cpuctx = &get_cpu_var(perf_cpu_context); | 3962 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3693 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 3963 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3694 | ctx = rcu_dereference(current->perf_event_ctxp); | 3964 | if (cpuctx->active_pmu != pmu) |
3695 | if (ctx) | 3965 | goto next; |
3696 | perf_event_comm_ctx(ctx, comm_event); | 3966 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
3697 | put_cpu_var(perf_cpu_context); | 3967 | |
3968 | ctxn = pmu->task_ctx_nr; | ||
3969 | if (ctxn < 0) | ||
3970 | goto next; | ||
3971 | |||
3972 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3973 | if (ctx) | ||
3974 | perf_event_comm_ctx(ctx, comm_event); | ||
3975 | next: | ||
3976 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3977 | } | ||
3698 | rcu_read_unlock(); | 3978 | rcu_read_unlock(); |
3699 | } | 3979 | } |
3700 | 3980 | ||
3701 | void perf_event_comm(struct task_struct *task) | 3981 | void perf_event_comm(struct task_struct *task) |
3702 | { | 3982 | { |
3703 | struct perf_comm_event comm_event; | 3983 | struct perf_comm_event comm_event; |
3984 | struct perf_event_context *ctx; | ||
3985 | int ctxn; | ||
3986 | |||
3987 | for_each_task_context_nr(ctxn) { | ||
3988 | ctx = task->perf_event_ctxp[ctxn]; | ||
3989 | if (!ctx) | ||
3990 | continue; | ||
3704 | 3991 | ||
3705 | if (task->perf_event_ctxp) | 3992 | perf_event_enable_on_exec(ctx); |
3706 | perf_event_enable_on_exec(task); | 3993 | } |
3707 | 3994 | ||
3708 | if (!atomic_read(&nr_comm_events)) | 3995 | if (!atomic_read(&nr_comm_events)) |
3709 | return; | 3996 | return; |
@@ -3805,6 +4092,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
3805 | char tmp[16]; | 4092 | char tmp[16]; |
3806 | char *buf = NULL; | 4093 | char *buf = NULL; |
3807 | const char *name; | 4094 | const char *name; |
4095 | struct pmu *pmu; | ||
4096 | int ctxn; | ||
3808 | 4097 | ||
3809 | memset(tmp, 0, sizeof(tmp)); | 4098 | memset(tmp, 0, sizeof(tmp)); |
3810 | 4099 | ||
@@ -3857,12 +4146,25 @@ got_name: | |||
3857 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 4146 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
3858 | 4147 | ||
3859 | rcu_read_lock(); | 4148 | rcu_read_lock(); |
3860 | cpuctx = &get_cpu_var(perf_cpu_context); | 4149 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3861 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4150 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3862 | ctx = rcu_dereference(current->perf_event_ctxp); | 4151 | if (cpuctx->active_pmu != pmu) |
3863 | if (ctx) | 4152 | goto next; |
3864 | perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4153 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
3865 | put_cpu_var(perf_cpu_context); | 4154 | vma->vm_flags & VM_EXEC); |
4155 | |||
4156 | ctxn = pmu->task_ctx_nr; | ||
4157 | if (ctxn < 0) | ||
4158 | goto next; | ||
4159 | |||
4160 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4161 | if (ctx) { | ||
4162 | perf_event_mmap_ctx(ctx, mmap_event, | ||
4163 | vma->vm_flags & VM_EXEC); | ||
4164 | } | ||
4165 | next: | ||
4166 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4167 | } | ||
3866 | rcu_read_unlock(); | 4168 | rcu_read_unlock(); |
3867 | 4169 | ||
3868 | kfree(buf); | 4170 | kfree(buf); |
@@ -3944,8 +4246,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
3944 | struct hw_perf_event *hwc = &event->hw; | 4246 | struct hw_perf_event *hwc = &event->hw; |
3945 | int ret = 0; | 4247 | int ret = 0; |
3946 | 4248 | ||
3947 | throttle = (throttle && event->pmu->unthrottle != NULL); | ||
3948 | |||
3949 | if (!throttle) { | 4249 | if (!throttle) { |
3950 | hwc->interrupts++; | 4250 | hwc->interrupts++; |
3951 | } else { | 4251 | } else { |
@@ -3988,8 +4288,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
3988 | event->pending_kill = POLL_HUP; | 4288 | event->pending_kill = POLL_HUP; |
3989 | if (nmi) { | 4289 | if (nmi) { |
3990 | event->pending_disable = 1; | 4290 | event->pending_disable = 1; |
3991 | perf_pending_queue(&event->pending, | 4291 | irq_work_queue(&event->pending); |
3992 | perf_pending_event); | ||
3993 | } else | 4292 | } else |
3994 | perf_event_disable(event); | 4293 | perf_event_disable(event); |
3995 | } | 4294 | } |
@@ -4013,6 +4312,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, | |||
4013 | * Generic software event infrastructure | 4312 | * Generic software event infrastructure |
4014 | */ | 4313 | */ |
4015 | 4314 | ||
4315 | struct swevent_htable { | ||
4316 | struct swevent_hlist *swevent_hlist; | ||
4317 | struct mutex hlist_mutex; | ||
4318 | int hlist_refcount; | ||
4319 | |||
4320 | /* Recursion avoidance in each context */ | ||
4321 | int recursion[PERF_NR_CONTEXTS]; | ||
4322 | }; | ||
4323 | |||
4324 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | ||
4325 | |||
4016 | /* | 4326 | /* |
4017 | * We directly increment event->count and keep a second value in | 4327 | * We directly increment event->count and keep a second value in |
4018 | * event->hw.period_left to count intervals. This period event | 4328 | * event->hw.period_left to count intervals. This period event |
@@ -4070,7 +4380,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4070 | } | 4380 | } |
4071 | } | 4381 | } |
4072 | 4382 | ||
4073 | static void perf_swevent_add(struct perf_event *event, u64 nr, | 4383 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
4074 | int nmi, struct perf_sample_data *data, | 4384 | int nmi, struct perf_sample_data *data, |
4075 | struct pt_regs *regs) | 4385 | struct pt_regs *regs) |
4076 | { | 4386 | { |
@@ -4096,6 +4406,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4096 | static int perf_exclude_event(struct perf_event *event, | 4406 | static int perf_exclude_event(struct perf_event *event, |
4097 | struct pt_regs *regs) | 4407 | struct pt_regs *regs) |
4098 | { | 4408 | { |
4409 | if (event->hw.state & PERF_HES_STOPPED) | ||
4410 | return 0; | ||
4411 | |||
4099 | if (regs) { | 4412 | if (regs) { |
4100 | if (event->attr.exclude_user && user_mode(regs)) | 4413 | if (event->attr.exclude_user && user_mode(regs)) |
4101 | return 1; | 4414 | return 1; |
@@ -4142,11 +4455,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) | |||
4142 | 4455 | ||
4143 | /* For the read side: events when they trigger */ | 4456 | /* For the read side: events when they trigger */ |
4144 | static inline struct hlist_head * | 4457 | static inline struct hlist_head * |
4145 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | 4458 | find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) |
4146 | { | 4459 | { |
4147 | struct swevent_hlist *hlist; | 4460 | struct swevent_hlist *hlist; |
4148 | 4461 | ||
4149 | hlist = rcu_dereference(ctx->swevent_hlist); | 4462 | hlist = rcu_dereference(swhash->swevent_hlist); |
4150 | if (!hlist) | 4463 | if (!hlist) |
4151 | return NULL; | 4464 | return NULL; |
4152 | 4465 | ||
@@ -4155,7 +4468,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | |||
4155 | 4468 | ||
4156 | /* For the event head insertion and removal in the hlist */ | 4469 | /* For the event head insertion and removal in the hlist */ |
4157 | static inline struct hlist_head * | 4470 | static inline struct hlist_head * |
4158 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | 4471 | find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) |
4159 | { | 4472 | { |
4160 | struct swevent_hlist *hlist; | 4473 | struct swevent_hlist *hlist; |
4161 | u32 event_id = event->attr.config; | 4474 | u32 event_id = event->attr.config; |
@@ -4166,7 +4479,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | |||
4166 | * and release. Which makes the protected version suitable here. | 4479 | * and release. Which makes the protected version suitable here. |
4167 | * The context lock guarantees that. | 4480 | * The context lock guarantees that. |
4168 | */ | 4481 | */ |
4169 | hlist = rcu_dereference_protected(ctx->swevent_hlist, | 4482 | hlist = rcu_dereference_protected(swhash->swevent_hlist, |
4170 | lockdep_is_held(&event->ctx->lock)); | 4483 | lockdep_is_held(&event->ctx->lock)); |
4171 | if (!hlist) | 4484 | if (!hlist) |
4172 | return NULL; | 4485 | return NULL; |
@@ -4179,23 +4492,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
4179 | struct perf_sample_data *data, | 4492 | struct perf_sample_data *data, |
4180 | struct pt_regs *regs) | 4493 | struct pt_regs *regs) |
4181 | { | 4494 | { |
4182 | struct perf_cpu_context *cpuctx; | 4495 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4183 | struct perf_event *event; | 4496 | struct perf_event *event; |
4184 | struct hlist_node *node; | 4497 | struct hlist_node *node; |
4185 | struct hlist_head *head; | 4498 | struct hlist_head *head; |
4186 | 4499 | ||
4187 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4188 | |||
4189 | rcu_read_lock(); | 4500 | rcu_read_lock(); |
4190 | 4501 | head = find_swevent_head_rcu(swhash, type, event_id); | |
4191 | head = find_swevent_head_rcu(cpuctx, type, event_id); | ||
4192 | |||
4193 | if (!head) | 4502 | if (!head) |
4194 | goto end; | 4503 | goto end; |
4195 | 4504 | ||
4196 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4505 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4197 | if (perf_swevent_match(event, type, event_id, data, regs)) | 4506 | if (perf_swevent_match(event, type, event_id, data, regs)) |
4198 | perf_swevent_add(event, nr, nmi, data, regs); | 4507 | perf_swevent_event(event, nr, nmi, data, regs); |
4199 | } | 4508 | } |
4200 | end: | 4509 | end: |
4201 | rcu_read_unlock(); | 4510 | rcu_read_unlock(); |
@@ -4203,33 +4512,17 @@ end: | |||
4203 | 4512 | ||
4204 | int perf_swevent_get_recursion_context(void) | 4513 | int perf_swevent_get_recursion_context(void) |
4205 | { | 4514 | { |
4206 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4515 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4207 | int rctx; | ||
4208 | |||
4209 | if (in_nmi()) | ||
4210 | rctx = 3; | ||
4211 | else if (in_irq()) | ||
4212 | rctx = 2; | ||
4213 | else if (in_softirq()) | ||
4214 | rctx = 1; | ||
4215 | else | ||
4216 | rctx = 0; | ||
4217 | |||
4218 | if (cpuctx->recursion[rctx]) | ||
4219 | return -1; | ||
4220 | 4516 | ||
4221 | cpuctx->recursion[rctx]++; | 4517 | return get_recursion_context(swhash->recursion); |
4222 | barrier(); | ||
4223 | |||
4224 | return rctx; | ||
4225 | } | 4518 | } |
4226 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4519 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4227 | 4520 | ||
4228 | void inline perf_swevent_put_recursion_context(int rctx) | 4521 | void inline perf_swevent_put_recursion_context(int rctx) |
4229 | { | 4522 | { |
4230 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4523 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4231 | barrier(); | 4524 | |
4232 | cpuctx->recursion[rctx]--; | 4525 | put_recursion_context(swhash->recursion, rctx); |
4233 | } | 4526 | } |
4234 | 4527 | ||
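get_recursion_context()/put_recursion_context() carry over the per-context recursion counters that used to live in perf_cpu_context; the removed version above shows the idea: pick a slot for task/softirq/irq/NMI level and refuse re-entry if it is already taken. A simplified single-CPU sketch of that guard; the kernel version additionally derives the level from in_nmi()/in_irq()/in_softirq() and inserts compiler barriers:

/* one flag per context level; illustrative, not the kernel's exact layout */
enum { CTX_TASK, CTX_SOFTIRQ, CTX_IRQ, CTX_NMI, NR_CTX };

static int recursion[NR_CTX];

static int get_recursion(int level)
{
	if (recursion[level])
		return -1;		/* already inside a swevent at this level */
	recursion[level]++;
	return level;			/* cookie handed back to put_recursion() */
}

static void put_recursion(int rctx)
{
	recursion[rctx]--;
}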
4235 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4528 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
@@ -4255,20 +4548,20 @@ static void perf_swevent_read(struct perf_event *event) | |||
4255 | { | 4548 | { |
4256 | } | 4549 | } |
4257 | 4550 | ||
4258 | static int perf_swevent_enable(struct perf_event *event) | 4551 | static int perf_swevent_add(struct perf_event *event, int flags) |
4259 | { | 4552 | { |
4553 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | ||
4260 | struct hw_perf_event *hwc = &event->hw; | 4554 | struct hw_perf_event *hwc = &event->hw; |
4261 | struct perf_cpu_context *cpuctx; | ||
4262 | struct hlist_head *head; | 4555 | struct hlist_head *head; |
4263 | 4556 | ||
4264 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4265 | |||
4266 | if (hwc->sample_period) { | 4557 | if (hwc->sample_period) { |
4267 | hwc->last_period = hwc->sample_period; | 4558 | hwc->last_period = hwc->sample_period; |
4268 | perf_swevent_set_period(event); | 4559 | perf_swevent_set_period(event); |
4269 | } | 4560 | } |
4270 | 4561 | ||
4271 | head = find_swevent_head(cpuctx, event); | 4562 | hwc->state = !(flags & PERF_EF_START); |
4563 | |||
4564 | head = find_swevent_head(swhash, event); | ||
4272 | if (WARN_ON_ONCE(!head)) | 4565 | if (WARN_ON_ONCE(!head)) |
4273 | return -EINVAL; | 4566 | return -EINVAL; |
4274 | 4567 | ||
@@ -4277,202 +4570,27 @@ static int perf_swevent_enable(struct perf_event *event) | |||
4277 | return 0; | 4570 | return 0; |
4278 | } | 4571 | } |
4279 | 4572 | ||
4280 | static void perf_swevent_disable(struct perf_event *event) | 4573 | static void perf_swevent_del(struct perf_event *event, int flags) |
4281 | { | 4574 | { |
4282 | hlist_del_rcu(&event->hlist_entry); | 4575 | hlist_del_rcu(&event->hlist_entry); |
4283 | } | 4576 | } |
4284 | 4577 | ||
4285 | static void perf_swevent_void(struct perf_event *event) | 4578 | static void perf_swevent_start(struct perf_event *event, int flags) |
4286 | { | 4579 | { |
4580 | event->hw.state = 0; | ||
4287 | } | 4581 | } |
4288 | 4582 | ||
4289 | static int perf_swevent_int(struct perf_event *event) | 4583 | static void perf_swevent_stop(struct perf_event *event, int flags) |
4290 | { | 4584 | { |
4291 | return 0; | 4585 | event->hw.state = PERF_HES_STOPPED; |
4292 | } | 4586 | } |
4293 | 4587 | ||
4294 | static const struct pmu perf_ops_generic = { | ||
4295 | .enable = perf_swevent_enable, | ||
4296 | .disable = perf_swevent_disable, | ||
4297 | .start = perf_swevent_int, | ||
4298 | .stop = perf_swevent_void, | ||
4299 | .read = perf_swevent_read, | ||
4300 | .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ | ||
4301 | }; | ||
4302 | |||
4303 | /* | ||
4304 | * hrtimer based swevent callback | ||
4305 | */ | ||
4306 | |||
4307 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4308 | { | ||
4309 | enum hrtimer_restart ret = HRTIMER_RESTART; | ||
4310 | struct perf_sample_data data; | ||
4311 | struct pt_regs *regs; | ||
4312 | struct perf_event *event; | ||
4313 | u64 period; | ||
4314 | |||
4315 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | ||
4316 | event->pmu->read(event); | ||
4317 | |||
4318 | perf_sample_data_init(&data, 0); | ||
4319 | data.period = event->hw.last_period; | ||
4320 | regs = get_irq_regs(); | ||
4321 | |||
4322 | if (regs && !perf_exclude_event(event, regs)) { | ||
4323 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4324 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4325 | ret = HRTIMER_NORESTART; | ||
4326 | } | ||
4327 | |||
4328 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4329 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4330 | |||
4331 | return ret; | ||
4332 | } | ||
4333 | |||
4334 | static void perf_swevent_start_hrtimer(struct perf_event *event) | ||
4335 | { | ||
4336 | struct hw_perf_event *hwc = &event->hw; | ||
4337 | |||
4338 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
4339 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4340 | if (hwc->sample_period) { | ||
4341 | u64 period; | ||
4342 | |||
4343 | if (hwc->remaining) { | ||
4344 | if (hwc->remaining < 0) | ||
4345 | period = 10000; | ||
4346 | else | ||
4347 | period = hwc->remaining; | ||
4348 | hwc->remaining = 0; | ||
4349 | } else { | ||
4350 | period = max_t(u64, 10000, hwc->sample_period); | ||
4351 | } | ||
4352 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4353 | ns_to_ktime(period), 0, | ||
4354 | HRTIMER_MODE_REL, 0); | ||
4355 | } | ||
4356 | } | ||
4357 | |||
4358 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | ||
4359 | { | ||
4360 | struct hw_perf_event *hwc = &event->hw; | ||
4361 | |||
4362 | if (hwc->sample_period) { | ||
4363 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4364 | hwc->remaining = ktime_to_ns(remaining); | ||
4365 | |||
4366 | hrtimer_cancel(&hwc->hrtimer); | ||
4367 | } | ||
4368 | } | ||
4369 | |||
4370 | /* | ||
4371 | * Software event: cpu wall time clock | ||
4372 | */ | ||
4373 | |||
4374 | static void cpu_clock_perf_event_update(struct perf_event *event) | ||
4375 | { | ||
4376 | int cpu = raw_smp_processor_id(); | ||
4377 | s64 prev; | ||
4378 | u64 now; | ||
4379 | |||
4380 | now = cpu_clock(cpu); | ||
4381 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4382 | local64_add(now - prev, &event->count); | ||
4383 | } | ||
4384 | |||
4385 | static int cpu_clock_perf_event_enable(struct perf_event *event) | ||
4386 | { | ||
4387 | struct hw_perf_event *hwc = &event->hw; | ||
4388 | int cpu = raw_smp_processor_id(); | ||
4389 | |||
4390 | local64_set(&hwc->prev_count, cpu_clock(cpu)); | ||
4391 | perf_swevent_start_hrtimer(event); | ||
4392 | |||
4393 | return 0; | ||
4394 | } | ||
4395 | |||
4396 | static void cpu_clock_perf_event_disable(struct perf_event *event) | ||
4397 | { | ||
4398 | perf_swevent_cancel_hrtimer(event); | ||
4399 | cpu_clock_perf_event_update(event); | ||
4400 | } | ||
4401 | |||
4402 | static void cpu_clock_perf_event_read(struct perf_event *event) | ||
4403 | { | ||
4404 | cpu_clock_perf_event_update(event); | ||
4405 | } | ||
4406 | |||
4407 | static const struct pmu perf_ops_cpu_clock = { | ||
4408 | .enable = cpu_clock_perf_event_enable, | ||
4409 | .disable = cpu_clock_perf_event_disable, | ||
4410 | .read = cpu_clock_perf_event_read, | ||
4411 | }; | ||
4412 | |||
4413 | /* | ||
4414 | * Software event: task time clock | ||
4415 | */ | ||
4416 | |||
4417 | static void task_clock_perf_event_update(struct perf_event *event, u64 now) | ||
4418 | { | ||
4419 | u64 prev; | ||
4420 | s64 delta; | ||
4421 | |||
4422 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4423 | delta = now - prev; | ||
4424 | local64_add(delta, &event->count); | ||
4425 | } | ||
4426 | |||
4427 | static int task_clock_perf_event_enable(struct perf_event *event) | ||
4428 | { | ||
4429 | struct hw_perf_event *hwc = &event->hw; | ||
4430 | u64 now; | ||
4431 | |||
4432 | now = event->ctx->time; | ||
4433 | |||
4434 | local64_set(&hwc->prev_count, now); | ||
4435 | |||
4436 | perf_swevent_start_hrtimer(event); | ||
4437 | |||
4438 | return 0; | ||
4439 | } | ||
4440 | |||
4441 | static void task_clock_perf_event_disable(struct perf_event *event) | ||
4442 | { | ||
4443 | perf_swevent_cancel_hrtimer(event); | ||
4444 | task_clock_perf_event_update(event, event->ctx->time); | ||
4445 | |||
4446 | } | ||
4447 | |||
4448 | static void task_clock_perf_event_read(struct perf_event *event) | ||
4449 | { | ||
4450 | u64 time; | ||
4451 | |||
4452 | if (!in_nmi()) { | ||
4453 | update_context_time(event->ctx); | ||
4454 | time = event->ctx->time; | ||
4455 | } else { | ||
4456 | u64 now = perf_clock(); | ||
4457 | u64 delta = now - event->ctx->timestamp; | ||
4458 | time = event->ctx->time + delta; | ||
4459 | } | ||
4460 | |||
4461 | task_clock_perf_event_update(event, time); | ||
4462 | } | ||
4463 | |||
4464 | static const struct pmu perf_ops_task_clock = { | ||
4465 | .enable = task_clock_perf_event_enable, | ||
4466 | .disable = task_clock_perf_event_disable, | ||
4467 | .read = task_clock_perf_event_read, | ||
4468 | }; | ||
4469 | |||
4470 | /* Deref the hlist from the update side */ | 4588 | /* Deref the hlist from the update side */ |
4471 | static inline struct swevent_hlist * | 4589 | static inline struct swevent_hlist * |
4472 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) | 4590 | swevent_hlist_deref(struct swevent_htable *swhash) |
4473 | { | 4591 | { |
4474 | return rcu_dereference_protected(cpuctx->swevent_hlist, | 4592 | return rcu_dereference_protected(swhash->swevent_hlist, |
4475 | lockdep_is_held(&cpuctx->hlist_mutex)); | 4593 | lockdep_is_held(&swhash->hlist_mutex)); |
4476 | } | 4594 | } |
4477 | 4595 | ||
4478 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | 4596 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) |
@@ -4483,27 +4601,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | |||
4483 | kfree(hlist); | 4601 | kfree(hlist); |
4484 | } | 4602 | } |
4485 | 4603 | ||
4486 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) | 4604 | static void swevent_hlist_release(struct swevent_htable *swhash) |
4487 | { | 4605 | { |
4488 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); | 4606 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
4489 | 4607 | ||
4490 | if (!hlist) | 4608 | if (!hlist) |
4491 | return; | 4609 | return; |
4492 | 4610 | ||
4493 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); | 4611 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
4494 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 4612 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); |
4495 | } | 4613 | } |
4496 | 4614 | ||
4497 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 4615 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
4498 | { | 4616 | { |
4499 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4617 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4500 | 4618 | ||
4501 | mutex_lock(&cpuctx->hlist_mutex); | 4619 | mutex_lock(&swhash->hlist_mutex); |
4502 | 4620 | ||
4503 | if (!--cpuctx->hlist_refcount) | 4621 | if (!--swhash->hlist_refcount) |
4504 | swevent_hlist_release(cpuctx); | 4622 | swevent_hlist_release(swhash); |
4505 | 4623 | ||
4506 | mutex_unlock(&cpuctx->hlist_mutex); | 4624 | mutex_unlock(&swhash->hlist_mutex); |
4507 | } | 4625 | } |
4508 | 4626 | ||
4509 | static void swevent_hlist_put(struct perf_event *event) | 4627 | static void swevent_hlist_put(struct perf_event *event) |
@@ -4521,12 +4639,12 @@ static void swevent_hlist_put(struct perf_event *event) | |||
4521 | 4639 | ||
4522 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | 4640 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) |
4523 | { | 4641 | { |
4524 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4642 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4525 | int err = 0; | 4643 | int err = 0; |
4526 | 4644 | ||
4527 | mutex_lock(&cpuctx->hlist_mutex); | 4645 | mutex_lock(&swhash->hlist_mutex); |
4528 | 4646 | ||
4529 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { | 4647 | if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { |
4530 | struct swevent_hlist *hlist; | 4648 | struct swevent_hlist *hlist; |
4531 | 4649 | ||
4532 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 4650 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
@@ -4534,11 +4652,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | |||
4534 | err = -ENOMEM; | 4652 | err = -ENOMEM; |
4535 | goto exit; | 4653 | goto exit; |
4536 | } | 4654 | } |
4537 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 4655 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
4538 | } | 4656 | } |
4539 | cpuctx->hlist_refcount++; | 4657 | swhash->hlist_refcount++; |
4540 | exit: | 4658 | exit: |
4541 | mutex_unlock(&cpuctx->hlist_mutex); | 4659 | mutex_unlock(&swhash->hlist_mutex); |
4542 | 4660 | ||
4543 | return err; | 4661 | return err; |
4544 | } | 4662 | } |
@@ -4562,7 +4680,7 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4562 | put_online_cpus(); | 4680 | put_online_cpus(); |
4563 | 4681 | ||
4564 | return 0; | 4682 | return 0; |
4565 | fail: | 4683 | fail: |
4566 | for_each_possible_cpu(cpu) { | 4684 | for_each_possible_cpu(cpu) { |
4567 | if (cpu == failed_cpu) | 4685 | if (cpu == failed_cpu) |
4568 | break; | 4686 | break; |
@@ -4573,17 +4691,64 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4573 | return err; | 4691 | return err; |
4574 | } | 4692 | } |
4575 | 4693 | ||
4576 | #ifdef CONFIG_EVENT_TRACING | 4694 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
4577 | 4695 | ||
4578 | static const struct pmu perf_ops_tracepoint = { | 4696 | static void sw_perf_event_destroy(struct perf_event *event) |
4579 | .enable = perf_trace_enable, | 4697 | { |
4580 | .disable = perf_trace_disable, | 4698 | u64 event_id = event->attr.config; |
4581 | .start = perf_swevent_int, | 4699 | |
4582 | .stop = perf_swevent_void, | 4700 | WARN_ON(event->parent); |
4701 | |||
4702 | jump_label_dec(&perf_swevent_enabled[event_id]); | ||
4703 | swevent_hlist_put(event); | ||
4704 | } | ||
4705 | |||
4706 | static int perf_swevent_init(struct perf_event *event) | ||
4707 | { | ||
4708 | int event_id = event->attr.config; | ||
4709 | |||
4710 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
4711 | return -ENOENT; | ||
4712 | |||
4713 | switch (event_id) { | ||
4714 | case PERF_COUNT_SW_CPU_CLOCK: | ||
4715 | case PERF_COUNT_SW_TASK_CLOCK: | ||
4716 | return -ENOENT; | ||
4717 | |||
4718 | default: | ||
4719 | break; | ||
4720 | } | ||
4721 | |||
4722 | if (event_id >= PERF_COUNT_SW_MAX) | ||
4723 | return -ENOENT; | ||
4724 | |||
4725 | if (!event->parent) { | ||
4726 | int err; | ||
4727 | |||
4728 | err = swevent_hlist_get(event); | ||
4729 | if (err) | ||
4730 | return err; | ||
4731 | |||
4732 | jump_label_inc(&perf_swevent_enabled[event_id]); | ||
4733 | event->destroy = sw_perf_event_destroy; | ||
4734 | } | ||
4735 | |||
4736 | return 0; | ||
4737 | } | ||
4738 | |||
4739 | static struct pmu perf_swevent = { | ||
4740 | .task_ctx_nr = perf_sw_context, | ||
4741 | |||
4742 | .event_init = perf_swevent_init, | ||
4743 | .add = perf_swevent_add, | ||
4744 | .del = perf_swevent_del, | ||
4745 | .start = perf_swevent_start, | ||
4746 | .stop = perf_swevent_stop, | ||
4583 | .read = perf_swevent_read, | 4747 | .read = perf_swevent_read, |
4584 | .unthrottle = perf_swevent_void, | ||
4585 | }; | 4748 | }; |
4586 | 4749 | ||
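perf_swevent above is the template for PMUs under the reworked interface: event_init claims an event or refuses it with -ENOENT, add/del attach and detach it from the active context, and start/stop only toggle counting via hw.state. A minimal skeleton of a hypothetical PMU against that interface (all my_* names are made up; a real driver would also program hardware in start/stop and fold counter deltas in read):

static void my_start(struct perf_event *event, int flags)
{
	event->hw.state = 0;			/* counting */
}

static void my_stop(struct perf_event *event, int flags)
{
	event->hw.state = PERF_HES_STOPPED;
}

static int my_add(struct perf_event *event, int flags)
{
	event->hw.state = PERF_HES_STOPPED;
	if (flags & PERF_EF_START)
		my_start(event, flags);
	return 0;
}

static void my_del(struct perf_event *event, int flags)
{
	my_stop(event, PERF_EF_UPDATE);
}

static void my_read(struct perf_event *event)
{
	/* fold the hardware counter delta into event->count here */
}

static int my_event_init(struct perf_event *event)
{
	if (event->attr.type != PERF_TYPE_RAW)
		return -ENOENT;			/* not ours, let the next PMU try */
	return 0;
}

static struct pmu my_pmu = {
	.task_ctx_nr	= perf_sw_context,

	.event_init	= my_event_init,
	.add		= my_add,
	.del		= my_del,
	.start		= my_start,
	.stop		= my_stop,
	.read		= my_read,
};

/* registered once at init time, just like perf_tp_register() below:
 *	perf_pmu_register(&my_pmu);
 */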
4750 | #ifdef CONFIG_EVENT_TRACING | ||
4751 | |||
4587 | static int perf_tp_filter_match(struct perf_event *event, | 4752 | static int perf_tp_filter_match(struct perf_event *event, |
4588 | struct perf_sample_data *data) | 4753 | struct perf_sample_data *data) |
4589 | { | 4754 | { |
@@ -4627,7 +4792,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
4627 | 4792 | ||
4628 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4793 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4629 | if (perf_tp_event_match(event, &data, regs)) | 4794 | if (perf_tp_event_match(event, &data, regs)) |
4630 | perf_swevent_add(event, count, 1, &data, regs); | 4795 | perf_swevent_event(event, count, 1, &data, regs); |
4631 | } | 4796 | } |
4632 | 4797 | ||
4633 | perf_swevent_put_recursion_context(rctx); | 4798 | perf_swevent_put_recursion_context(rctx); |
@@ -4639,10 +4804,13 @@ static void tp_perf_event_destroy(struct perf_event *event) | |||
4639 | perf_trace_destroy(event); | 4804 | perf_trace_destroy(event); |
4640 | } | 4805 | } |
4641 | 4806 | ||
4642 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4807 | static int perf_tp_event_init(struct perf_event *event) |
4643 | { | 4808 | { |
4644 | int err; | 4809 | int err; |
4645 | 4810 | ||
4811 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
4812 | return -ENOENT; | ||
4813 | |||
4646 | /* | 4814 | /* |
4647 | * Raw tracepoint data is a severe data leak, only allow root to | 4815 | * Raw tracepoint data is a severe data leak, only allow root to |
4648 | * have these. | 4816 | * have these. |
@@ -4650,15 +4818,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) | |||
4650 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | 4818 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && |
4651 | perf_paranoid_tracepoint_raw() && | 4819 | perf_paranoid_tracepoint_raw() && |
4652 | !capable(CAP_SYS_ADMIN)) | 4820 | !capable(CAP_SYS_ADMIN)) |
4653 | return ERR_PTR(-EPERM); | 4821 | return -EPERM; |
4654 | 4822 | ||
4655 | err = perf_trace_init(event); | 4823 | err = perf_trace_init(event); |
4656 | if (err) | 4824 | if (err) |
4657 | return NULL; | 4825 | return err; |
4658 | 4826 | ||
4659 | event->destroy = tp_perf_event_destroy; | 4827 | event->destroy = tp_perf_event_destroy; |
4660 | 4828 | ||
4661 | return &perf_ops_tracepoint; | 4829 | return 0; |
4830 | } | ||
4831 | |||
4832 | static struct pmu perf_tracepoint = { | ||
4833 | .task_ctx_nr = perf_sw_context, | ||
4834 | |||
4835 | .event_init = perf_tp_event_init, | ||
4836 | .add = perf_trace_add, | ||
4837 | .del = perf_trace_del, | ||
4838 | .start = perf_swevent_start, | ||
4839 | .stop = perf_swevent_stop, | ||
4840 | .read = perf_swevent_read, | ||
4841 | }; | ||
4842 | |||
4843 | static inline void perf_tp_register(void) | ||
4844 | { | ||
4845 | perf_pmu_register(&perf_tracepoint); | ||
4662 | } | 4846 | } |
4663 | 4847 | ||
4664 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4848 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4686,9 +4870,8 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4686 | 4870 | ||
4687 | #else | 4871 | #else |
4688 | 4872 | ||
4689 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4873 | static inline void perf_tp_register(void) |
4690 | { | 4874 | { |
4691 | return NULL; | ||
4692 | } | 4875 | } |
4693 | 4876 | ||
4694 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4877 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4703,105 +4886,406 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4703 | #endif /* CONFIG_EVENT_TRACING */ | 4886 | #endif /* CONFIG_EVENT_TRACING */ |
4704 | 4887 | ||
4705 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 4888 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
4706 | static void bp_perf_event_destroy(struct perf_event *event) | 4889 | void perf_bp_event(struct perf_event *bp, void *data) |
4707 | { | 4890 | { |
4708 | release_bp_slot(event); | 4891 | struct perf_sample_data sample; |
4892 | struct pt_regs *regs = data; | ||
4893 | |||
4894 | perf_sample_data_init(&sample, bp->attr.bp_addr); | ||
4895 | |||
4896 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | ||
4897 | perf_swevent_event(bp, 1, 1, &sample, regs); | ||
4709 | } | 4898 | } |
4899 | #endif | ||
4710 | 4900 | ||
4711 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4901 | /* |
4902 | * hrtimer based swevent callback | ||
4903 | */ | ||
4904 | |||
4905 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4712 | { | 4906 | { |
4713 | int err; | 4907 | enum hrtimer_restart ret = HRTIMER_RESTART; |
4908 | struct perf_sample_data data; | ||
4909 | struct pt_regs *regs; | ||
4910 | struct perf_event *event; | ||
4911 | u64 period; | ||
4714 | 4912 | ||
4715 | err = register_perf_hw_breakpoint(bp); | 4913 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
4716 | if (err) | 4914 | event->pmu->read(event); |
4717 | return ERR_PTR(err); | ||
4718 | 4915 | ||
4719 | bp->destroy = bp_perf_event_destroy; | 4916 | perf_sample_data_init(&data, 0); |
4917 | data.period = event->hw.last_period; | ||
4918 | regs = get_irq_regs(); | ||
4919 | |||
4920 | if (regs && !perf_exclude_event(event, regs)) { | ||
4921 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4922 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4923 | ret = HRTIMER_NORESTART; | ||
4924 | } | ||
4925 | |||
4926 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4927 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4720 | 4928 | ||
4721 | return &perf_ops_bp; | 4929 | return ret; |
4722 | } | 4930 | } |
4723 | 4931 | ||
4724 | void perf_bp_event(struct perf_event *bp, void *data) | 4932 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4725 | { | 4933 | { |
4726 | struct perf_sample_data sample; | 4934 | struct hw_perf_event *hwc = &event->hw; |
4727 | struct pt_regs *regs = data; | ||
4728 | 4935 | ||
4729 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 4936 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4937 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4938 | if (hwc->sample_period) { | ||
4939 | s64 period = local64_read(&hwc->period_left); | ||
4730 | 4940 | ||
4731 | if (!perf_exclude_event(bp, regs)) | 4941 | if (period) { |
4732 | perf_swevent_add(bp, 1, 1, &sample, regs); | 4942 | if (period < 0) |
4943 | period = 10000; | ||
4944 | |||
4945 | local64_set(&hwc->period_left, 0); | ||
4946 | } else { | ||
4947 | period = max_t(u64, 10000, hwc->sample_period); | ||
4948 | } | ||
4949 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4950 | ns_to_ktime(period), 0, | ||
4951 | HRTIMER_MODE_REL_PINNED, 0); | ||
4952 | } | ||
4733 | } | 4953 | } |
4734 | #else | 4954 | |
4735 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4955 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
4956 | { | ||
4957 | struct hw_perf_event *hwc = &event->hw; | ||
4958 | |||
4959 | if (hwc->sample_period) { | ||
4960 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4961 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | ||
4962 | |||
4963 | hrtimer_cancel(&hwc->hrtimer); | ||
4964 | } | ||
4965 | } | ||
4966 | |||
4967 | /* | ||
4968 | * Software event: cpu wall time clock | ||
4969 | */ | ||
4970 | |||
4971 | static void cpu_clock_event_update(struct perf_event *event) | ||
4972 | { | ||
4973 | s64 prev; | ||
4974 | u64 now; | ||
4975 | |||
4976 | now = local_clock(); | ||
4977 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4978 | local64_add(now - prev, &event->count); | ||
4979 | } | ||
4980 | |||
4981 | static void cpu_clock_event_start(struct perf_event *event, int flags) | ||
4982 | { | ||
4983 | local64_set(&event->hw.prev_count, local_clock()); | ||
4984 | perf_swevent_start_hrtimer(event); | ||
4985 | } | ||
4986 | |||
4987 | static void cpu_clock_event_stop(struct perf_event *event, int flags) | ||
4988 | { | ||
4989 | perf_swevent_cancel_hrtimer(event); | ||
4990 | cpu_clock_event_update(event); | ||
4991 | } | ||
4992 | |||
4993 | static int cpu_clock_event_add(struct perf_event *event, int flags) | ||
4994 | { | ||
4995 | if (flags & PERF_EF_START) | ||
4996 | cpu_clock_event_start(event, flags); | ||
4997 | |||
4998 | return 0; | ||
4999 | } | ||
5000 | |||
5001 | static void cpu_clock_event_del(struct perf_event *event, int flags) | ||
5002 | { | ||
5003 | cpu_clock_event_stop(event, flags); | ||
5004 | } | ||
5005 | |||
5006 | static void cpu_clock_event_read(struct perf_event *event) | ||
5007 | { | ||
5008 | cpu_clock_event_update(event); | ||
5009 | } | ||
5010 | |||
5011 | static int cpu_clock_event_init(struct perf_event *event) | ||
5012 | { | ||
5013 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5014 | return -ENOENT; | ||
5015 | |||
5016 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | ||
5017 | return -ENOENT; | ||
5018 | |||
5019 | return 0; | ||
5020 | } | ||
5021 | |||
5022 | static struct pmu perf_cpu_clock = { | ||
5023 | .task_ctx_nr = perf_sw_context, | ||
5024 | |||
5025 | .event_init = cpu_clock_event_init, | ||
5026 | .add = cpu_clock_event_add, | ||
5027 | .del = cpu_clock_event_del, | ||
5028 | .start = cpu_clock_event_start, | ||
5029 | .stop = cpu_clock_event_stop, | ||
5030 | .read = cpu_clock_event_read, | ||
5031 | }; | ||
5032 | |||
5033 | /* | ||
5034 | * Software event: task time clock | ||
5035 | */ | ||
5036 | |||
5037 | static void task_clock_event_update(struct perf_event *event, u64 now) | ||
5038 | { | ||
5039 | u64 prev; | ||
5040 | s64 delta; | ||
5041 | |||
5042 | prev = local64_xchg(&event->hw.prev_count, now); | ||
5043 | delta = now - prev; | ||
5044 | local64_add(delta, &event->count); | ||
5045 | } | ||
5046 | |||
5047 | static void task_clock_event_start(struct perf_event *event, int flags) | ||
5048 | { | ||
5049 | local64_set(&event->hw.prev_count, event->ctx->time); | ||
5050 | perf_swevent_start_hrtimer(event); | ||
5051 | } | ||
5052 | |||
5053 | static void task_clock_event_stop(struct perf_event *event, int flags) | ||
5054 | { | ||
5055 | perf_swevent_cancel_hrtimer(event); | ||
5056 | task_clock_event_update(event, event->ctx->time); | ||
5057 | } | ||
5058 | |||
5059 | static int task_clock_event_add(struct perf_event *event, int flags) | ||
5060 | { | ||
5061 | if (flags & PERF_EF_START) | ||
5062 | task_clock_event_start(event, flags); | ||
5063 | |||
5064 | return 0; | ||
5065 | } | ||
5066 | |||
5067 | static void task_clock_event_del(struct perf_event *event, int flags) | ||
5068 | { | ||
5069 | task_clock_event_stop(event, PERF_EF_UPDATE); | ||
5070 | } | ||
5071 | |||
5072 | static void task_clock_event_read(struct perf_event *event) | ||
5073 | { | ||
5074 | u64 time; | ||
5075 | |||
5076 | if (!in_nmi()) { | ||
5077 | update_context_time(event->ctx); | ||
5078 | time = event->ctx->time; | ||
5079 | } else { | ||
5080 | u64 now = perf_clock(); | ||
5081 | u64 delta = now - event->ctx->timestamp; | ||
5082 | time = event->ctx->time + delta; | ||
5083 | } | ||
5084 | |||
5085 | task_clock_event_update(event, time); | ||
5086 | } | ||
5087 | |||
5088 | static int task_clock_event_init(struct perf_event *event) | ||
5089 | { | ||
5090 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5091 | return -ENOENT; | ||
5092 | |||
5093 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | ||
5094 | return -ENOENT; | ||
5095 | |||
5096 | return 0; | ||
5097 | } | ||
5098 | |||
5099 | static struct pmu perf_task_clock = { | ||
5100 | .task_ctx_nr = perf_sw_context, | ||
5101 | |||
5102 | .event_init = task_clock_event_init, | ||
5103 | .add = task_clock_event_add, | ||
5104 | .del = task_clock_event_del, | ||
5105 | .start = task_clock_event_start, | ||
5106 | .stop = task_clock_event_stop, | ||
5107 | .read = task_clock_event_read, | ||
5108 | }; | ||
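For orientation, a rough, hypothetical sketch (not the real context-scheduling path) of the calling convention behind the add/del/start/stop split that perf_cpu_clock and perf_task_clock implement: ->add() may start counting immediately via PERF_EF_START, ->stop()/->start() pause and resume it, and ->del() schedules it out.

/* Hypothetical driver of one event's lifecycle through the new callbacks. */
static void drive_one_event(struct pmu *pmu, struct perf_event *event)
{
	if (pmu->add(event, PERF_EF_START))	/* schedule in and start counting */
		return;
	pmu->read(event);			/* fold the current delta into ->count */
	pmu->stop(event, PERF_EF_UPDATE);	/* pause, updating the count */
	pmu->start(event, 0);			/* resume where it stopped */
	pmu->del(event, 0);			/* schedule out */
}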
5109 | |||
5110 | static void perf_pmu_nop_void(struct pmu *pmu) | ||
5111 | { | ||
5112 | } | ||
5113 | |||
5114 | static int perf_pmu_nop_int(struct pmu *pmu) | ||
4736 | { | 5115 | { |
5116 | return 0; | ||
5117 | } | ||
5118 | |||
5119 | static void perf_pmu_start_txn(struct pmu *pmu) | ||
5120 | { | ||
5121 | perf_pmu_disable(pmu); | ||
5122 | } | ||
5123 | |||
5124 | static int perf_pmu_commit_txn(struct pmu *pmu) | ||
5125 | { | ||
5126 | perf_pmu_enable(pmu); | ||
5127 | return 0; | ||
5128 | } | ||
5129 | |||
5130 | static void perf_pmu_cancel_txn(struct pmu *pmu) | ||
5131 | { | ||
5132 | perf_pmu_enable(pmu); | ||
5133 | } | ||
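The transaction stubs above only bracket a batch of ->add() calls with perf_pmu_disable()/perf_pmu_enable(); a hedged sketch of the pattern a caller such as group scheduling follows (the function is hypothetical, and removal of already-added events on failure is elided):

static int add_group_txn(struct pmu *pmu, struct perf_event *leader)
{
	struct perf_event *sibling;

	pmu->start_txn(pmu);		/* perf_pmu_start_txn -> perf_pmu_disable */

	if (pmu->add(leader, PERF_EF_START))
		goto fail;
	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
		if (pmu->add(sibling, PERF_EF_START))
			goto fail;
	}

	if (!pmu->commit_txn(pmu))	/* re-enables and validates the batch */
		return 0;
fail:
	pmu->cancel_txn(pmu);		/* undo path: re-enable without committing */
	return -EAGAIN;
}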
5134 | |||
5135 | /* | ||
5136 | * Ensures all contexts with the same task_ctx_nr have the same | ||
5137 | * pmu_cpu_context too. | ||
5138 | */ | ||
5139 | static void *find_pmu_context(int ctxn) | ||
5140 | { | ||
5141 | struct pmu *pmu; | ||
5142 | |||
5143 | if (ctxn < 0) | ||
5144 | return NULL; | ||
5145 | |||
5146 | list_for_each_entry(pmu, &pmus, entry) { | ||
5147 | if (pmu->task_ctx_nr == ctxn) | ||
5148 | return pmu->pmu_cpu_context; | ||
5149 | } | ||
5150 | |||
4737 | return NULL; | 5151 | return NULL; |
4738 | } | 5152 | } |
4739 | 5153 | ||
4740 | void perf_bp_event(struct perf_event *bp, void *regs) | 5154 | static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) |
4741 | { | 5155 | { |
5156 | int cpu; | ||
5157 | |||
5158 | for_each_possible_cpu(cpu) { | ||
5159 | struct perf_cpu_context *cpuctx; | ||
5160 | |||
5161 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
5162 | |||
5163 | if (cpuctx->active_pmu == old_pmu) | ||
5164 | cpuctx->active_pmu = pmu; | ||
5165 | } | ||
4742 | } | 5166 | } |
4743 | #endif | ||
4744 | 5167 | ||
4745 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 5168 | static void free_pmu_context(struct pmu *pmu) |
5169 | { | ||
5170 | struct pmu *i; | ||
4746 | 5171 | ||
4747 | static void sw_perf_event_destroy(struct perf_event *event) | 5172 | mutex_lock(&pmus_lock); |
5173 | /* | ||
5174 | * Like a real lame refcount. | ||
5175 | */ | ||
5176 | list_for_each_entry(i, &pmus, entry) { | ||
5177 | if (i->pmu_cpu_context == pmu->pmu_cpu_context) { | ||
5178 | update_pmu_context(i, pmu); | ||
5179 | goto out; | ||
5180 | } | ||
5181 | } | ||
5182 | |||
5183 | free_percpu(pmu->pmu_cpu_context); | ||
5184 | out: | ||
5185 | mutex_unlock(&pmus_lock); | ||
5186 | } | ||
5187 | |||
5188 | int perf_pmu_register(struct pmu *pmu) | ||
4748 | { | 5189 | { |
4749 | u64 event_id = event->attr.config; | 5190 | int cpu, ret; |
4750 | 5191 | ||
4751 | WARN_ON(event->parent); | 5192 | mutex_lock(&pmus_lock); |
5193 | ret = -ENOMEM; | ||
5194 | pmu->pmu_disable_count = alloc_percpu(int); | ||
5195 | if (!pmu->pmu_disable_count) | ||
5196 | goto unlock; | ||
4752 | 5197 | ||
4753 | atomic_dec(&perf_swevent_enabled[event_id]); | 5198 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); |
4754 | swevent_hlist_put(event); | 5199 | if (pmu->pmu_cpu_context) |
5200 | goto got_cpu_context; | ||
5201 | |||
5202 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | ||
5203 | if (!pmu->pmu_cpu_context) | ||
5204 | goto free_pdc; | ||
5205 | |||
5206 | for_each_possible_cpu(cpu) { | ||
5207 | struct perf_cpu_context *cpuctx; | ||
5208 | |||
5209 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
5210 | __perf_event_init_context(&cpuctx->ctx); | ||
5211 | cpuctx->ctx.type = cpu_context; | ||
5212 | cpuctx->ctx.pmu = pmu; | ||
5213 | cpuctx->jiffies_interval = 1; | ||
5214 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
5215 | cpuctx->active_pmu = pmu; | ||
5216 | } | ||
5217 | |||
5218 | got_cpu_context: | ||
5219 | if (!pmu->start_txn) { | ||
5220 | if (pmu->pmu_enable) { | ||
5221 | /* | ||
5222 | * If we have pmu_enable/pmu_disable calls, install | ||
5223 | * transaction stubs that use that to try and batch | ||
5224 | * hardware accesses. | ||
5225 | */ | ||
5226 | pmu->start_txn = perf_pmu_start_txn; | ||
5227 | pmu->commit_txn = perf_pmu_commit_txn; | ||
5228 | pmu->cancel_txn = perf_pmu_cancel_txn; | ||
5229 | } else { | ||
5230 | pmu->start_txn = perf_pmu_nop_void; | ||
5231 | pmu->commit_txn = perf_pmu_nop_int; | ||
5232 | pmu->cancel_txn = perf_pmu_nop_void; | ||
5233 | } | ||
5234 | } | ||
5235 | |||
5236 | if (!pmu->pmu_enable) { | ||
5237 | pmu->pmu_enable = perf_pmu_nop_void; | ||
5238 | pmu->pmu_disable = perf_pmu_nop_void; | ||
5239 | } | ||
5240 | |||
5241 | list_add_rcu(&pmu->entry, &pmus); | ||
5242 | ret = 0; | ||
5243 | unlock: | ||
5244 | mutex_unlock(&pmus_lock); | ||
5245 | |||
5246 | return ret; | ||
5247 | |||
5248 | free_pdc: | ||
5249 | free_percpu(pmu->pmu_disable_count); | ||
5250 | goto unlock; | ||
4755 | } | 5251 | } |
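To illustrate the new registration interface, a minimal, hypothetical PMU skeleton (every dummy_* name is made up). It deliberately leaves pmu_enable/pmu_disable and the txn hooks NULL so that perf_pmu_register() installs the nop and transaction stubs shown above; a real event_init() would return 0 once it accepts an attr.type/attr.config pair instead of always passing with -ENOENT.

static int dummy_event_init(struct perf_event *event)
{
	/* -ENOENT means "not mine": perf_init_event() moves on to the next pmu. */
	return -ENOENT;
}
static int  dummy_add(struct perf_event *event, int flags)	{ return 0; }
static void dummy_del(struct perf_event *event, int flags)	{ }
static void dummy_start(struct perf_event *event, int flags)	{ }
static void dummy_stop(struct perf_event *event, int flags)	{ }
static void dummy_read(struct perf_event *event)		{ }

static struct pmu dummy_pmu = {
	.task_ctx_nr	= perf_sw_context,	/* share the software task context */
	.event_init	= dummy_event_init,
	.add		= dummy_add,
	.del		= dummy_del,
	.start		= dummy_start,
	.stop		= dummy_stop,
	.read		= dummy_read,
};

Registration then reduces to a single perf_pmu_register(&dummy_pmu) call, typically from an __init function; perf_pmu_unregister() tears it down again.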
4756 | 5252 | ||
4757 | static const struct pmu *sw_perf_event_init(struct perf_event *event) | 5253 | void perf_pmu_unregister(struct pmu *pmu) |
4758 | { | 5254 | { |
4759 | const struct pmu *pmu = NULL; | 5255 | mutex_lock(&pmus_lock); |
4760 | u64 event_id = event->attr.config; | 5256 | list_del_rcu(&pmu->entry); |
5257 | mutex_unlock(&pmus_lock); | ||
4761 | 5258 | ||
4762 | /* | 5259 | /* |
4763 | * Software events (currently) can't in general distinguish | 5260 | * We dereference the pmu list under both SRCU and regular RCU, so |
4764 | * between user, kernel and hypervisor events. | 5261 | * synchronize against both of those. |
4765 | * However, context switches and cpu migrations are considered | ||
4766 | * to be kernel events, and page faults are never hypervisor | ||
4767 | * events. | ||
4768 | */ | 5262 | */ |
4769 | switch (event_id) { | 5263 | synchronize_srcu(&pmus_srcu); |
4770 | case PERF_COUNT_SW_CPU_CLOCK: | 5264 | synchronize_rcu(); |
4771 | pmu = &perf_ops_cpu_clock; | ||
4772 | 5265 | ||
4773 | break; | 5266 | free_percpu(pmu->pmu_disable_count); |
4774 | case PERF_COUNT_SW_TASK_CLOCK: | 5267 | free_pmu_context(pmu); |
4775 | /* | 5268 | } |
4776 | * If the user instantiates this as a per-cpu event, | ||
4777 | * use the cpu_clock event instead. | ||
4778 | */ | ||
4779 | if (event->ctx->task) | ||
4780 | pmu = &perf_ops_task_clock; | ||
4781 | else | ||
4782 | pmu = &perf_ops_cpu_clock; | ||
4783 | 5269 | ||
4784 | break; | 5270 | struct pmu *perf_init_event(struct perf_event *event) |
4785 | case PERF_COUNT_SW_PAGE_FAULTS: | 5271 | { |
4786 | case PERF_COUNT_SW_PAGE_FAULTS_MIN: | 5272 | struct pmu *pmu = NULL; |
4787 | case PERF_COUNT_SW_PAGE_FAULTS_MAJ: | 5273 | int idx; |
4788 | case PERF_COUNT_SW_CONTEXT_SWITCHES: | ||
4789 | case PERF_COUNT_SW_CPU_MIGRATIONS: | ||
4790 | case PERF_COUNT_SW_ALIGNMENT_FAULTS: | ||
4791 | case PERF_COUNT_SW_EMULATION_FAULTS: | ||
4792 | if (!event->parent) { | ||
4793 | int err; | ||
4794 | |||
4795 | err = swevent_hlist_get(event); | ||
4796 | if (err) | ||
4797 | return ERR_PTR(err); | ||
4798 | 5274 | ||
4799 | atomic_inc(&perf_swevent_enabled[event_id]); | 5275 | idx = srcu_read_lock(&pmus_srcu); |
4800 | event->destroy = sw_perf_event_destroy; | 5276 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5277 | int ret = pmu->event_init(event); | ||
5278 | if (!ret) | ||
5279 | goto unlock; | ||
5280 | |||
5281 | if (ret != -ENOENT) { | ||
5282 | pmu = ERR_PTR(ret); | ||
5283 | goto unlock; | ||
4801 | } | 5284 | } |
4802 | pmu = &perf_ops_generic; | ||
4803 | break; | ||
4804 | } | 5285 | } |
5286 | pmu = ERR_PTR(-ENOENT); | ||
5287 | unlock: | ||
5288 | srcu_read_unlock(&pmus_srcu, idx); | ||
4805 | 5289 | ||
4806 | return pmu; | 5290 | return pmu; |
4807 | } | 5291 | } |
@@ -4810,20 +5294,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) | |||
4810 | * Allocate and initialize an event structure | 5294 | * Allocate and initialize an event structure |
4811 | */ | 5295 | */ |
4812 | static struct perf_event * | 5296 | static struct perf_event * |
4813 | perf_event_alloc(struct perf_event_attr *attr, | 5297 | perf_event_alloc(struct perf_event_attr *attr, int cpu, |
4814 | int cpu, | 5298 | struct task_struct *task, |
4815 | struct perf_event_context *ctx, | 5299 | struct perf_event *group_leader, |
4816 | struct perf_event *group_leader, | 5300 | struct perf_event *parent_event, |
4817 | struct perf_event *parent_event, | 5301 | perf_overflow_handler_t overflow_handler) |
4818 | perf_overflow_handler_t overflow_handler, | 5302 | { |
4819 | gfp_t gfpflags) | 5303 | struct pmu *pmu; |
4820 | { | ||
4821 | const struct pmu *pmu; | ||
4822 | struct perf_event *event; | 5304 | struct perf_event *event; |
4823 | struct hw_perf_event *hwc; | 5305 | struct hw_perf_event *hwc; |
4824 | long err; | 5306 | long err; |
4825 | 5307 | ||
4826 | event = kzalloc(sizeof(*event), gfpflags); | 5308 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
4827 | if (!event) | 5309 | if (!event) |
4828 | return ERR_PTR(-ENOMEM); | 5310 | return ERR_PTR(-ENOMEM); |
4829 | 5311 | ||
@@ -4841,6 +5323,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4841 | INIT_LIST_HEAD(&event->event_entry); | 5323 | INIT_LIST_HEAD(&event->event_entry); |
4842 | INIT_LIST_HEAD(&event->sibling_list); | 5324 | INIT_LIST_HEAD(&event->sibling_list); |
4843 | init_waitqueue_head(&event->waitq); | 5325 | init_waitqueue_head(&event->waitq); |
5326 | init_irq_work(&event->pending, perf_pending_event); | ||
4844 | 5327 | ||
4845 | mutex_init(&event->mmap_mutex); | 5328 | mutex_init(&event->mmap_mutex); |
4846 | 5329 | ||
@@ -4848,7 +5331,6 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4848 | event->attr = *attr; | 5331 | event->attr = *attr; |
4849 | event->group_leader = group_leader; | 5332 | event->group_leader = group_leader; |
4850 | event->pmu = NULL; | 5333 | event->pmu = NULL; |
4851 | event->ctx = ctx; | ||
4852 | event->oncpu = -1; | 5334 | event->oncpu = -1; |
4853 | 5335 | ||
4854 | event->parent = parent_event; | 5336 | event->parent = parent_event; |
@@ -4858,6 +5340,17 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4858 | 5340 | ||
4859 | event->state = PERF_EVENT_STATE_INACTIVE; | 5341 | event->state = PERF_EVENT_STATE_INACTIVE; |
4860 | 5342 | ||
5343 | if (task) { | ||
5344 | event->attach_state = PERF_ATTACH_TASK; | ||
5345 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
5346 | /* | ||
5347 | * hw_breakpoint is a bit difficult here.. | ||
5348 | */ | ||
5349 | if (attr->type == PERF_TYPE_BREAKPOINT) | ||
5350 | event->hw.bp_target = task; | ||
5351 | #endif | ||
5352 | } | ||
5353 | |||
4861 | if (!overflow_handler && parent_event) | 5354 | if (!overflow_handler && parent_event) |
4862 | overflow_handler = parent_event->overflow_handler; | 5355 | overflow_handler = parent_event->overflow_handler; |
4863 | 5356 | ||
@@ -4882,29 +5375,8 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4882 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 5375 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
4883 | goto done; | 5376 | goto done; |
4884 | 5377 | ||
4885 | switch (attr->type) { | 5378 | pmu = perf_init_event(event); |
4886 | case PERF_TYPE_RAW: | ||
4887 | case PERF_TYPE_HARDWARE: | ||
4888 | case PERF_TYPE_HW_CACHE: | ||
4889 | pmu = hw_perf_event_init(event); | ||
4890 | break; | ||
4891 | |||
4892 | case PERF_TYPE_SOFTWARE: | ||
4893 | pmu = sw_perf_event_init(event); | ||
4894 | break; | ||
4895 | 5379 | ||
4896 | case PERF_TYPE_TRACEPOINT: | ||
4897 | pmu = tp_perf_event_init(event); | ||
4898 | break; | ||
4899 | |||
4900 | case PERF_TYPE_BREAKPOINT: | ||
4901 | pmu = bp_perf_event_init(event); | ||
4902 | break; | ||
4903 | |||
4904 | |||
4905 | default: | ||
4906 | break; | ||
4907 | } | ||
4908 | done: | 5380 | done: |
4909 | err = 0; | 5381 | err = 0; |
4910 | if (!pmu) | 5382 | if (!pmu) |
@@ -4922,13 +5394,21 @@ done: | |||
4922 | event->pmu = pmu; | 5394 | event->pmu = pmu; |
4923 | 5395 | ||
4924 | if (!event->parent) { | 5396 | if (!event->parent) { |
4925 | atomic_inc(&nr_events); | 5397 | if (event->attach_state & PERF_ATTACH_TASK) |
5398 | jump_label_inc(&perf_task_events); | ||
4926 | if (event->attr.mmap || event->attr.mmap_data) | 5399 | if (event->attr.mmap || event->attr.mmap_data) |
4927 | atomic_inc(&nr_mmap_events); | 5400 | atomic_inc(&nr_mmap_events); |
4928 | if (event->attr.comm) | 5401 | if (event->attr.comm) |
4929 | atomic_inc(&nr_comm_events); | 5402 | atomic_inc(&nr_comm_events); |
4930 | if (event->attr.task) | 5403 | if (event->attr.task) |
4931 | atomic_inc(&nr_task_events); | 5404 | atomic_inc(&nr_task_events); |
5405 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | ||
5406 | err = get_callchain_buffers(); | ||
5407 | if (err) { | ||
5408 | free_event(event); | ||
5409 | return ERR_PTR(err); | ||
5410 | } | ||
5411 | } | ||
4932 | } | 5412 | } |
4933 | 5413 | ||
4934 | return event; | 5414 | return event; |
@@ -5076,12 +5556,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5076 | struct perf_event_attr __user *, attr_uptr, | 5556 | struct perf_event_attr __user *, attr_uptr, |
5077 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | 5557 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
5078 | { | 5558 | { |
5079 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; | 5559 | struct perf_event *group_leader = NULL, *output_event = NULL; |
5560 | struct perf_event *event, *sibling; | ||
5080 | struct perf_event_attr attr; | 5561 | struct perf_event_attr attr; |
5081 | struct perf_event_context *ctx; | 5562 | struct perf_event_context *ctx; |
5082 | struct file *event_file = NULL; | 5563 | struct file *event_file = NULL; |
5083 | struct file *group_file = NULL; | 5564 | struct file *group_file = NULL; |
5565 | struct task_struct *task = NULL; | ||
5566 | struct pmu *pmu; | ||
5084 | int event_fd; | 5567 | int event_fd; |
5568 | int move_group = 0; | ||
5085 | int fput_needed = 0; | 5569 | int fput_needed = 0; |
5086 | int err; | 5570 | int err; |
5087 | 5571 | ||
@@ -5107,20 +5591,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5107 | if (event_fd < 0) | 5591 | if (event_fd < 0) |
5108 | return event_fd; | 5592 | return event_fd; |
5109 | 5593 | ||
5110 | /* | ||
5111 | * Get the target context (task or percpu): | ||
5112 | */ | ||
5113 | ctx = find_get_context(pid, cpu); | ||
5114 | if (IS_ERR(ctx)) { | ||
5115 | err = PTR_ERR(ctx); | ||
5116 | goto err_fd; | ||
5117 | } | ||
5118 | |||
5119 | if (group_fd != -1) { | 5594 | if (group_fd != -1) { |
5120 | group_leader = perf_fget_light(group_fd, &fput_needed); | 5595 | group_leader = perf_fget_light(group_fd, &fput_needed); |
5121 | if (IS_ERR(group_leader)) { | 5596 | if (IS_ERR(group_leader)) { |
5122 | err = PTR_ERR(group_leader); | 5597 | err = PTR_ERR(group_leader); |
5123 | goto err_put_context; | 5598 | goto err_fd; |
5124 | } | 5599 | } |
5125 | group_file = group_leader->filp; | 5600 | group_file = group_leader->filp; |
5126 | if (flags & PERF_FLAG_FD_OUTPUT) | 5601 | if (flags & PERF_FLAG_FD_OUTPUT) |
@@ -5129,6 +5604,58 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5129 | group_leader = NULL; | 5604 | group_leader = NULL; |
5130 | } | 5605 | } |
5131 | 5606 | ||
5607 | if (pid != -1) { | ||
5608 | task = find_lively_task_by_vpid(pid); | ||
5609 | if (IS_ERR(task)) { | ||
5610 | err = PTR_ERR(task); | ||
5611 | goto err_group_fd; | ||
5612 | } | ||
5613 | } | ||
5614 | |||
5615 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | ||
5616 | if (IS_ERR(event)) { | ||
5617 | err = PTR_ERR(event); | ||
5618 | goto err_task; | ||
5619 | } | ||
5620 | |||
5621 | /* | ||
5622 | * Special case software events and allow them to be part of | ||
5623 | * any hardware group. | ||
5624 | */ | ||
5625 | pmu = event->pmu; | ||
5626 | |||
5627 | if (group_leader && | ||
5628 | (is_software_event(event) != is_software_event(group_leader))) { | ||
5629 | if (is_software_event(event)) { | ||
5630 | /* | ||
5631 | * If event and group_leader are not both a software | ||
5632 | * event, and event is, then group leader is not. | ||
5633 | * | ||
5634 | * Allow the addition of software events to !software | ||
5635 | * groups, this is safe because software events never | ||
5636 | * fail to schedule. | ||
5637 | */ | ||
5638 | pmu = group_leader->pmu; | ||
5639 | } else if (is_software_event(group_leader) && | ||
5640 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | ||
5641 | /* | ||
5642 | * In case the group is a pure software group, and we | ||
5643 | * try to add a hardware event, move the whole group to | ||
5644 | * the hardware context. | ||
5645 | */ | ||
5646 | move_group = 1; | ||
5647 | } | ||
5648 | } | ||
5649 | |||
5650 | /* | ||
5651 | * Get the target context (task or percpu): | ||
5652 | */ | ||
5653 | ctx = find_get_context(pmu, task, cpu); | ||
5654 | if (IS_ERR(ctx)) { | ||
5655 | err = PTR_ERR(ctx); | ||
5656 | goto err_alloc; | ||
5657 | } | ||
5658 | |||
5132 | /* | 5659 | /* |
5133 | * Look up the group leader (we will attach this event to it): | 5660 | * Look up the group leader (we will attach this event to it): |
5134 | */ | 5661 | */ |
@@ -5140,48 +5667,72 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5140 | * becoming part of another group-sibling): | 5667 | * becoming part of another group-sibling): |
5141 | */ | 5668 | */ |
5142 | if (group_leader->group_leader != group_leader) | 5669 | if (group_leader->group_leader != group_leader) |
5143 | goto err_put_context; | 5670 | goto err_context; |
5144 | /* | 5671 | /* |
5145 | * Do not allow attaching to a group in a different | 5672 | * Do not allow attaching to a group in a different |
5146 | * task or CPU context: | 5673 | * task or CPU context: |
5147 | */ | 5674 | */ |
5148 | if (group_leader->ctx != ctx) | 5675 | if (move_group) { |
5149 | goto err_put_context; | 5676 | if (group_leader->ctx->type != ctx->type) |
5677 | goto err_context; | ||
5678 | } else { | ||
5679 | if (group_leader->ctx != ctx) | ||
5680 | goto err_context; | ||
5681 | } | ||
5682 | |||
5150 | /* | 5683 | /* |
5151 | * Only a group leader can be exclusive or pinned | 5684 | * Only a group leader can be exclusive or pinned |
5152 | */ | 5685 | */ |
5153 | if (attr.exclusive || attr.pinned) | 5686 | if (attr.exclusive || attr.pinned) |
5154 | goto err_put_context; | 5687 | goto err_context; |
5155 | } | ||
5156 | |||
5157 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, | ||
5158 | NULL, NULL, GFP_KERNEL); | ||
5159 | if (IS_ERR(event)) { | ||
5160 | err = PTR_ERR(event); | ||
5161 | goto err_put_context; | ||
5162 | } | 5688 | } |
5163 | 5689 | ||
5164 | if (output_event) { | 5690 | if (output_event) { |
5165 | err = perf_event_set_output(event, output_event); | 5691 | err = perf_event_set_output(event, output_event); |
5166 | if (err) | 5692 | if (err) |
5167 | goto err_free_put_context; | 5693 | goto err_context; |
5168 | } | 5694 | } |
5169 | 5695 | ||
5170 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 5696 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
5171 | if (IS_ERR(event_file)) { | 5697 | if (IS_ERR(event_file)) { |
5172 | err = PTR_ERR(event_file); | 5698 | err = PTR_ERR(event_file); |
5173 | goto err_free_put_context; | 5699 | goto err_context; |
5700 | } | ||
5701 | |||
5702 | if (move_group) { | ||
5703 | struct perf_event_context *gctx = group_leader->ctx; | ||
5704 | |||
5705 | mutex_lock(&gctx->mutex); | ||
5706 | perf_event_remove_from_context(group_leader); | ||
5707 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5708 | group_entry) { | ||
5709 | perf_event_remove_from_context(sibling); | ||
5710 | put_ctx(gctx); | ||
5711 | } | ||
5712 | mutex_unlock(&gctx->mutex); | ||
5713 | put_ctx(gctx); | ||
5174 | } | 5714 | } |
5175 | 5715 | ||
5176 | event->filp = event_file; | 5716 | event->filp = event_file; |
5177 | WARN_ON_ONCE(ctx->parent_ctx); | 5717 | WARN_ON_ONCE(ctx->parent_ctx); |
5178 | mutex_lock(&ctx->mutex); | 5718 | mutex_lock(&ctx->mutex); |
5719 | |||
5720 | if (move_group) { | ||
5721 | perf_install_in_context(ctx, group_leader, cpu); | ||
5722 | get_ctx(ctx); | ||
5723 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5724 | group_entry) { | ||
5725 | perf_install_in_context(ctx, sibling, cpu); | ||
5726 | get_ctx(ctx); | ||
5727 | } | ||
5728 | } | ||
5729 | |||
5179 | perf_install_in_context(ctx, event, cpu); | 5730 | perf_install_in_context(ctx, event, cpu); |
5180 | ++ctx->generation; | 5731 | ++ctx->generation; |
5181 | mutex_unlock(&ctx->mutex); | 5732 | mutex_unlock(&ctx->mutex); |
5182 | 5733 | ||
5183 | event->owner = current; | 5734 | event->owner = current; |
5184 | get_task_struct(current); | 5735 | |
5185 | mutex_lock(¤t->perf_event_mutex); | 5736 | mutex_lock(¤t->perf_event_mutex); |
5186 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | 5737 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); |
5187 | mutex_unlock(¤t->perf_event_mutex); | 5738 | mutex_unlock(¤t->perf_event_mutex); |
@@ -5196,11 +5747,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5196 | fd_install(event_fd, event_file); | 5747 | fd_install(event_fd, event_file); |
5197 | return event_fd; | 5748 | return event_fd; |
5198 | 5749 | ||
5199 | err_free_put_context: | 5750 | err_context: |
5751 | put_ctx(ctx); | ||
5752 | err_alloc: | ||
5200 | free_event(event); | 5753 | free_event(event); |
5201 | err_put_context: | 5754 | err_task: |
5755 | if (task) | ||
5756 | put_task_struct(task); | ||
5757 | err_group_fd: | ||
5202 | fput_light(group_file, fput_needed); | 5758 | fput_light(group_file, fput_needed); |
5203 | put_ctx(ctx); | ||
5204 | err_fd: | 5759 | err_fd: |
5205 | put_unused_fd(event_fd); | 5760 | put_unused_fd(event_fd); |
5206 | return err; | 5761 | return err; |
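Seen from userspace, the software/hardware group handling above covers calls like the following hedged sketch, where a software sibling (page faults) joins a hardware-led group (cycles) and therefore inherits the leader's pmu and context; the raw syscall wrapper is needed because glibc provides no perf_event_open() wrapper.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr hw, sw;
	int leader, sibling;

	memset(&hw, 0, sizeof(hw));		/* hardware group leader: cycles */
	hw.type   = PERF_TYPE_HARDWARE;
	hw.size   = sizeof(hw);
	hw.config = PERF_COUNT_HW_CPU_CYCLES;
	leader = sys_perf_event_open(&hw, 0, -1, -1, 0);

	memset(&sw, 0, sizeof(sw));		/* software sibling: page faults */
	sw.type   = PERF_TYPE_SOFTWARE;
	sw.size   = sizeof(sw);
	sw.config = PERF_COUNT_SW_PAGE_FAULTS;
	sibling = sys_perf_event_open(&sw, 0, -1, leader, 0);

	return (leader < 0 || sibling < 0) ? 1 : 0;
}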
@@ -5211,32 +5766,31 @@ err_fd: | |||
5211 | * | 5766 | * |
5212 | * @attr: attributes of the counter to create | 5767 | * @attr: attributes of the counter to create |
5213 | * @cpu: cpu in which the counter is bound | 5768 | * @cpu: cpu in which the counter is bound |
5214 | * @pid: task to profile | 5769 | * @task: task to profile (NULL for percpu) |
5215 | */ | 5770 | */ |
5216 | struct perf_event * | 5771 | struct perf_event * |
5217 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 5772 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
5218 | pid_t pid, | 5773 | struct task_struct *task, |
5219 | perf_overflow_handler_t overflow_handler) | 5774 | perf_overflow_handler_t overflow_handler) |
5220 | { | 5775 | { |
5221 | struct perf_event *event; | ||
5222 | struct perf_event_context *ctx; | 5776 | struct perf_event_context *ctx; |
5777 | struct perf_event *event; | ||
5223 | int err; | 5778 | int err; |
5224 | 5779 | ||
5225 | /* | 5780 | /* |
5226 | * Get the target context (task or percpu): | 5781 | * Get the target context (task or percpu): |
5227 | */ | 5782 | */ |
5228 | 5783 | ||
5229 | ctx = find_get_context(pid, cpu); | 5784 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); |
5230 | if (IS_ERR(ctx)) { | ||
5231 | err = PTR_ERR(ctx); | ||
5232 | goto err_exit; | ||
5233 | } | ||
5234 | |||
5235 | event = perf_event_alloc(attr, cpu, ctx, NULL, | ||
5236 | NULL, overflow_handler, GFP_KERNEL); | ||
5237 | if (IS_ERR(event)) { | 5785 | if (IS_ERR(event)) { |
5238 | err = PTR_ERR(event); | 5786 | err = PTR_ERR(event); |
5239 | goto err_put_context; | 5787 | goto err; |
5788 | } | ||
5789 | |||
5790 | ctx = find_get_context(event->pmu, task, cpu); | ||
5791 | if (IS_ERR(ctx)) { | ||
5792 | err = PTR_ERR(ctx); | ||
5793 | goto err_free; | ||
5240 | } | 5794 | } |
5241 | 5795 | ||
5242 | event->filp = NULL; | 5796 | event->filp = NULL; |
@@ -5246,120 +5800,15 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
5246 | ++ctx->generation; | 5800 | ++ctx->generation; |
5247 | mutex_unlock(&ctx->mutex); | 5801 | mutex_unlock(&ctx->mutex); |
5248 | 5802 | ||
5249 | event->owner = current; | ||
5250 | get_task_struct(current); | ||
5251 | mutex_lock(¤t->perf_event_mutex); | ||
5252 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | ||
5253 | mutex_unlock(¤t->perf_event_mutex); | ||
5254 | |||
5255 | return event; | 5803 | return event; |
5256 | 5804 | ||
5257 | err_put_context: | 5805 | err_free: |
5258 | put_ctx(ctx); | 5806 | free_event(event); |
5259 | err_exit: | 5807 | err: |
5260 | return ERR_PTR(err); | 5808 | return ERR_PTR(err); |
5261 | } | 5809 | } |
5262 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 5810 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
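For in-kernel users, the signature change (a struct task_struct * replaces the old pid_t) looks roughly like this; the function and callback names are hypothetical, the overflow-handler prototype is the four-argument form current at this point, and error handling is trimmed.

/* Hypothetical in-kernel user: sample a task's retired instructions. */
static void instr_overflow(struct perf_event *event, int nmi,
			   struct perf_sample_data *data, struct pt_regs *regs)
{
	/* called every attr.sample_period instructions */
}

static struct perf_event *count_instructions(struct task_struct *tsk)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_INSTRUCTIONS,
		.size		= sizeof(attr),
		.sample_period	= 100000,
	};

	/* cpu == -1: follow the task; the task_struct is passed directly
	 * instead of a pid looked up inside find_get_context(). */
	return perf_event_create_kernel_counter(&attr, -1, tsk, instr_overflow);
}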
5263 | 5811 | ||
5264 | /* | ||
5265 | * inherit an event from parent task to child task: | ||
5266 | */ | ||
5267 | static struct perf_event * | ||
5268 | inherit_event(struct perf_event *parent_event, | ||
5269 | struct task_struct *parent, | ||
5270 | struct perf_event_context *parent_ctx, | ||
5271 | struct task_struct *child, | ||
5272 | struct perf_event *group_leader, | ||
5273 | struct perf_event_context *child_ctx) | ||
5274 | { | ||
5275 | struct perf_event *child_event; | ||
5276 | |||
5277 | /* | ||
5278 | * Instead of creating recursive hierarchies of events, | ||
5279 | * we link inherited events back to the original parent, | ||
5280 | * which has a filp for sure, which we use as the reference | ||
5281 | * count: | ||
5282 | */ | ||
5283 | if (parent_event->parent) | ||
5284 | parent_event = parent_event->parent; | ||
5285 | |||
5286 | child_event = perf_event_alloc(&parent_event->attr, | ||
5287 | parent_event->cpu, child_ctx, | ||
5288 | group_leader, parent_event, | ||
5289 | NULL, GFP_KERNEL); | ||
5290 | if (IS_ERR(child_event)) | ||
5291 | return child_event; | ||
5292 | get_ctx(child_ctx); | ||
5293 | |||
5294 | /* | ||
5295 | * Make the child state follow the state of the parent event, | ||
5296 | * not its attr.disabled bit. We hold the parent's mutex, | ||
5297 | * so we won't race with perf_event_{en, dis}able_family. | ||
5298 | */ | ||
5299 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
5300 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
5301 | else | ||
5302 | child_event->state = PERF_EVENT_STATE_OFF; | ||
5303 | |||
5304 | if (parent_event->attr.freq) { | ||
5305 | u64 sample_period = parent_event->hw.sample_period; | ||
5306 | struct hw_perf_event *hwc = &child_event->hw; | ||
5307 | |||
5308 | hwc->sample_period = sample_period; | ||
5309 | hwc->last_period = sample_period; | ||
5310 | |||
5311 | local64_set(&hwc->period_left, sample_period); | ||
5312 | } | ||
5313 | |||
5314 | child_event->overflow_handler = parent_event->overflow_handler; | ||
5315 | |||
5316 | /* | ||
5317 | * Link it up in the child's context: | ||
5318 | */ | ||
5319 | add_event_to_ctx(child_event, child_ctx); | ||
5320 | |||
5321 | /* | ||
5322 | * Get a reference to the parent filp - we will fput it | ||
5323 | * when the child event exits. This is safe to do because | ||
5324 | * we are in the parent and we know that the filp still | ||
5325 | * exists and has a nonzero count: | ||
5326 | */ | ||
5327 | atomic_long_inc(&parent_event->filp->f_count); | ||
5328 | |||
5329 | /* | ||
5330 | * Link this into the parent event's child list | ||
5331 | */ | ||
5332 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
5333 | mutex_lock(&parent_event->child_mutex); | ||
5334 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
5335 | mutex_unlock(&parent_event->child_mutex); | ||
5336 | |||
5337 | return child_event; | ||
5338 | } | ||
5339 | |||
5340 | static int inherit_group(struct perf_event *parent_event, | ||
5341 | struct task_struct *parent, | ||
5342 | struct perf_event_context *parent_ctx, | ||
5343 | struct task_struct *child, | ||
5344 | struct perf_event_context *child_ctx) | ||
5345 | { | ||
5346 | struct perf_event *leader; | ||
5347 | struct perf_event *sub; | ||
5348 | struct perf_event *child_ctr; | ||
5349 | |||
5350 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
5351 | child, NULL, child_ctx); | ||
5352 | if (IS_ERR(leader)) | ||
5353 | return PTR_ERR(leader); | ||
5354 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
5355 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
5356 | child, leader, child_ctx); | ||
5357 | if (IS_ERR(child_ctr)) | ||
5358 | return PTR_ERR(child_ctr); | ||
5359 | } | ||
5360 | return 0; | ||
5361 | } | ||
5362 | |||
5363 | static void sync_child_event(struct perf_event *child_event, | 5812 | static void sync_child_event(struct perf_event *child_event, |
5364 | struct task_struct *child) | 5813 | struct task_struct *child) |
5365 | { | 5814 | { |
@@ -5416,16 +5865,13 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
5416 | } | 5865 | } |
5417 | } | 5866 | } |
5418 | 5867 | ||
5419 | /* | 5868 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
5420 | * When a child task exits, feed back event values to parent events. | ||
5421 | */ | ||
5422 | void perf_event_exit_task(struct task_struct *child) | ||
5423 | { | 5869 | { |
5424 | struct perf_event *child_event, *tmp; | 5870 | struct perf_event *child_event, *tmp; |
5425 | struct perf_event_context *child_ctx; | 5871 | struct perf_event_context *child_ctx; |
5426 | unsigned long flags; | 5872 | unsigned long flags; |
5427 | 5873 | ||
5428 | if (likely(!child->perf_event_ctxp)) { | 5874 | if (likely(!child->perf_event_ctxp[ctxn])) { |
5429 | perf_event_task(child, NULL, 0); | 5875 | perf_event_task(child, NULL, 0); |
5430 | return; | 5876 | return; |
5431 | } | 5877 | } |
@@ -5437,8 +5883,8 @@ void perf_event_exit_task(struct task_struct *child) | |||
5437 | * scheduled, so we are now safe from rescheduling changing | 5883 | * scheduled, so we are now safe from rescheduling changing |
5438 | * our context. | 5884 | * our context. |
5439 | */ | 5885 | */ |
5440 | child_ctx = child->perf_event_ctxp; | 5886 | child_ctx = child->perf_event_ctxp[ctxn]; |
5441 | __perf_event_task_sched_out(child_ctx); | 5887 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
5442 | 5888 | ||
5443 | /* | 5889 | /* |
5444 | * Take the context lock here so that if find_get_context is | 5890 | * Take the context lock here so that if find_get_context is |
@@ -5446,7 +5892,7 @@ void perf_event_exit_task(struct task_struct *child) | |||
5446 | * incremented the context's refcount before we do put_ctx below. | 5892 | * incremented the context's refcount before we do put_ctx below. |
5447 | */ | 5893 | */ |
5448 | raw_spin_lock(&child_ctx->lock); | 5894 | raw_spin_lock(&child_ctx->lock); |
5449 | child->perf_event_ctxp = NULL; | 5895 | child->perf_event_ctxp[ctxn] = NULL; |
5450 | /* | 5896 | /* |
5451 | * If this context is a clone; unclone it so it can't get | 5897 | * If this context is a clone; unclone it so it can't get |
5452 | * swapped to another process while we're removing all | 5898 | * swapped to another process while we're removing all |
@@ -5499,6 +5945,33 @@ again: | |||
5499 | put_ctx(child_ctx); | 5945 | put_ctx(child_ctx); |
5500 | } | 5946 | } |
5501 | 5947 | ||
5948 | /* | ||
5949 | * When a child task exits, feed back event values to parent events. | ||
5950 | */ | ||
5951 | void perf_event_exit_task(struct task_struct *child) | ||
5952 | { | ||
5953 | struct perf_event *event, *tmp; | ||
5954 | int ctxn; | ||
5955 | |||
5956 | mutex_lock(&child->perf_event_mutex); | ||
5957 | list_for_each_entry_safe(event, tmp, &child->perf_event_list, | ||
5958 | owner_entry) { | ||
5959 | list_del_init(&event->owner_entry); | ||
5960 | |||
5961 | /* | ||
5962 | * Ensure the list deletion is visible before we clear | ||
5963 | * the owner, closes a race against perf_release() where | ||
5964 | * we need to serialize on the owner->perf_event_mutex. | ||
5965 | */ | ||
5966 | smp_wmb(); | ||
5967 | event->owner = NULL; | ||
5968 | } | ||
5969 | mutex_unlock(&child->perf_event_mutex); | ||
5970 | |||
5971 | for_each_task_context_nr(ctxn) | ||
5972 | perf_event_exit_task_context(child, ctxn); | ||
5973 | } | ||
5974 | |||
5502 | static void perf_free_event(struct perf_event *event, | 5975 | static void perf_free_event(struct perf_event *event, |
5503 | struct perf_event_context *ctx) | 5976 | struct perf_event_context *ctx) |
5504 | { | 5977 | { |
@@ -5520,48 +5993,166 @@ static void perf_free_event(struct perf_event *event, | |||
5520 | 5993 | ||
5521 | /* | 5994 | /* |
5522 | * free an unexposed, unused context as created by inheritance by | 5995 | * free an unexposed, unused context as created by inheritance by |
5523 | * init_task below, used by fork() in case of failure. | 5996 | * perf_event_init_task below, used by fork() in case of failure. |
5524 | */ | 5997 | */ |
5525 | void perf_event_free_task(struct task_struct *task) | 5998 | void perf_event_free_task(struct task_struct *task) |
5526 | { | 5999 | { |
5527 | struct perf_event_context *ctx = task->perf_event_ctxp; | 6000 | struct perf_event_context *ctx; |
5528 | struct perf_event *event, *tmp; | 6001 | struct perf_event *event, *tmp; |
6002 | int ctxn; | ||
5529 | 6003 | ||
5530 | if (!ctx) | 6004 | for_each_task_context_nr(ctxn) { |
5531 | return; | 6005 | ctx = task->perf_event_ctxp[ctxn]; |
6006 | if (!ctx) | ||
6007 | continue; | ||
5532 | 6008 | ||
5533 | mutex_lock(&ctx->mutex); | 6009 | mutex_lock(&ctx->mutex); |
5534 | again: | 6010 | again: |
5535 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 6011 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, |
5536 | perf_free_event(event, ctx); | 6012 | group_entry) |
6013 | perf_free_event(event, ctx); | ||
5537 | 6014 | ||
5538 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, | 6015 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
5539 | group_entry) | 6016 | group_entry) |
5540 | perf_free_event(event, ctx); | 6017 | perf_free_event(event, ctx); |
5541 | 6018 | ||
5542 | if (!list_empty(&ctx->pinned_groups) || | 6019 | if (!list_empty(&ctx->pinned_groups) || |
5543 | !list_empty(&ctx->flexible_groups)) | 6020 | !list_empty(&ctx->flexible_groups)) |
5544 | goto again; | 6021 | goto again; |
5545 | 6022 | ||
5546 | mutex_unlock(&ctx->mutex); | 6023 | mutex_unlock(&ctx->mutex); |
5547 | 6024 | ||
5548 | put_ctx(ctx); | 6025 | put_ctx(ctx); |
6026 | } | ||
6027 | } | ||
6028 | |||
6029 | void perf_event_delayed_put(struct task_struct *task) | ||
6030 | { | ||
6031 | int ctxn; | ||
6032 | |||
6033 | for_each_task_context_nr(ctxn) | ||
6034 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | ||
6035 | } | ||
6036 | |||
6037 | /* | ||
6038 | * inherit an event from parent task to child task: | ||
6039 | */ | ||
6040 | static struct perf_event * | ||
6041 | inherit_event(struct perf_event *parent_event, | ||
6042 | struct task_struct *parent, | ||
6043 | struct perf_event_context *parent_ctx, | ||
6044 | struct task_struct *child, | ||
6045 | struct perf_event *group_leader, | ||
6046 | struct perf_event_context *child_ctx) | ||
6047 | { | ||
6048 | struct perf_event *child_event; | ||
6049 | unsigned long flags; | ||
6050 | |||
6051 | /* | ||
6052 | * Instead of creating recursive hierarchies of events, | ||
6053 | * we link inherited events back to the original parent, | ||
6054 | * which has a filp for sure, which we use as the reference | ||
6055 | * count: | ||
6056 | */ | ||
6057 | if (parent_event->parent) | ||
6058 | parent_event = parent_event->parent; | ||
6059 | |||
6060 | child_event = perf_event_alloc(&parent_event->attr, | ||
6061 | parent_event->cpu, | ||
6062 | child, | ||
6063 | group_leader, parent_event, | ||
6064 | NULL); | ||
6065 | if (IS_ERR(child_event)) | ||
6066 | return child_event; | ||
6067 | get_ctx(child_ctx); | ||
6068 | |||
6069 | /* | ||
6070 | * Make the child state follow the state of the parent event, | ||
6071 | * not its attr.disabled bit. We hold the parent's mutex, | ||
6072 | * so we won't race with perf_event_{en, dis}able_family. | ||
6073 | */ | ||
6074 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
6075 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
6076 | else | ||
6077 | child_event->state = PERF_EVENT_STATE_OFF; | ||
6078 | |||
6079 | if (parent_event->attr.freq) { | ||
6080 | u64 sample_period = parent_event->hw.sample_period; | ||
6081 | struct hw_perf_event *hwc = &child_event->hw; | ||
6082 | |||
6083 | hwc->sample_period = sample_period; | ||
6084 | hwc->last_period = sample_period; | ||
6085 | |||
6086 | local64_set(&hwc->period_left, sample_period); | ||
6087 | } | ||
6088 | |||
6089 | child_event->ctx = child_ctx; | ||
6090 | child_event->overflow_handler = parent_event->overflow_handler; | ||
6091 | |||
6092 | /* | ||
6093 | * Link it up in the child's context: | ||
6094 | */ | ||
6095 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | ||
6096 | add_event_to_ctx(child_event, child_ctx); | ||
6097 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | ||
6098 | |||
6099 | /* | ||
6100 | * Get a reference to the parent filp - we will fput it | ||
6101 | * when the child event exits. This is safe to do because | ||
6102 | * we are in the parent and we know that the filp still | ||
6103 | * exists and has a nonzero count: | ||
6104 | */ | ||
6105 | atomic_long_inc(&parent_event->filp->f_count); | ||
6106 | |||
6107 | /* | ||
6108 | * Link this into the parent event's child list | ||
6109 | */ | ||
6110 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
6111 | mutex_lock(&parent_event->child_mutex); | ||
6112 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
6113 | mutex_unlock(&parent_event->child_mutex); | ||
6114 | |||
6115 | return child_event; | ||
6116 | } | ||
6117 | |||
6118 | static int inherit_group(struct perf_event *parent_event, | ||
6119 | struct task_struct *parent, | ||
6120 | struct perf_event_context *parent_ctx, | ||
6121 | struct task_struct *child, | ||
6122 | struct perf_event_context *child_ctx) | ||
6123 | { | ||
6124 | struct perf_event *leader; | ||
6125 | struct perf_event *sub; | ||
6126 | struct perf_event *child_ctr; | ||
6127 | |||
6128 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
6129 | child, NULL, child_ctx); | ||
6130 | if (IS_ERR(leader)) | ||
6131 | return PTR_ERR(leader); | ||
6132 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
6133 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
6134 | child, leader, child_ctx); | ||
6135 | if (IS_ERR(child_ctr)) | ||
6136 | return PTR_ERR(child_ctr); | ||
6137 | } | ||
6138 | return 0; | ||
5549 | } | 6139 | } |
5550 | 6140 | ||
5551 | static int | 6141 | static int |
5552 | inherit_task_group(struct perf_event *event, struct task_struct *parent, | 6142 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
5553 | struct perf_event_context *parent_ctx, | 6143 | struct perf_event_context *parent_ctx, |
5554 | struct task_struct *child, | 6144 | struct task_struct *child, int ctxn, |
5555 | int *inherited_all) | 6145 | int *inherited_all) |
5556 | { | 6146 | { |
5557 | int ret; | 6147 | int ret; |
5558 | struct perf_event_context *child_ctx = child->perf_event_ctxp; | 6148 | struct perf_event_context *child_ctx; |
5559 | 6149 | ||
5560 | if (!event->attr.inherit) { | 6150 | if (!event->attr.inherit) { |
5561 | *inherited_all = 0; | 6151 | *inherited_all = 0; |
5562 | return 0; | 6152 | return 0; |
5563 | } | 6153 | } |
5564 | 6154 | ||
6155 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
5565 | if (!child_ctx) { | 6156 | if (!child_ctx) { |
5566 | /* | 6157 | /* |
5567 | * This is executed from the parent task context, so | 6158 | * This is executed from the parent task context, so |
@@ -5570,14 +6161,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5570 | * child. | 6161 | * child. |
5571 | */ | 6162 | */ |
5572 | 6163 | ||
5573 | child_ctx = kzalloc(sizeof(struct perf_event_context), | 6164 | child_ctx = alloc_perf_context(event->pmu, child); |
5574 | GFP_KERNEL); | ||
5575 | if (!child_ctx) | 6165 | if (!child_ctx) |
5576 | return -ENOMEM; | 6166 | return -ENOMEM; |
5577 | 6167 | ||
5578 | __perf_event_init_context(child_ctx, child); | 6168 | child->perf_event_ctxp[ctxn] = child_ctx; |
5579 | child->perf_event_ctxp = child_ctx; | ||
5580 | get_task_struct(child); | ||
5581 | } | 6169 | } |
5582 | 6170 | ||
5583 | ret = inherit_group(event, parent, parent_ctx, | 6171 | ret = inherit_group(event, parent, parent_ctx, |
@@ -5589,32 +6177,32 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5589 | return ret; | 6177 | return ret; |
5590 | } | 6178 | } |
5591 | 6179 | ||
5592 | |||
5593 | /* | 6180 | /* |
5594 | * Initialize the perf_event context in task_struct | 6181 | * Initialize the perf_event context in task_struct |
5595 | */ | 6182 | */ |
5596 | int perf_event_init_task(struct task_struct *child) | 6183 | int perf_event_init_context(struct task_struct *child, int ctxn) |
5597 | { | 6184 | { |
5598 | struct perf_event_context *child_ctx, *parent_ctx; | 6185 | struct perf_event_context *child_ctx, *parent_ctx; |
5599 | struct perf_event_context *cloned_ctx; | 6186 | struct perf_event_context *cloned_ctx; |
5600 | struct perf_event *event; | 6187 | struct perf_event *event; |
5601 | struct task_struct *parent = current; | 6188 | struct task_struct *parent = current; |
5602 | int inherited_all = 1; | 6189 | int inherited_all = 1; |
6190 | unsigned long flags; | ||
5603 | int ret = 0; | 6191 | int ret = 0; |
5604 | 6192 | ||
5605 | child->perf_event_ctxp = NULL; | 6193 | child->perf_event_ctxp[ctxn] = NULL; |
5606 | 6194 | ||
5607 | mutex_init(&child->perf_event_mutex); | 6195 | mutex_init(&child->perf_event_mutex); |
5608 | INIT_LIST_HEAD(&child->perf_event_list); | 6196 | INIT_LIST_HEAD(&child->perf_event_list); |
5609 | 6197 | ||
5610 | if (likely(!parent->perf_event_ctxp)) | 6198 | if (likely(!parent->perf_event_ctxp[ctxn])) |
5611 | return 0; | 6199 | return 0; |
5612 | 6200 | ||
5613 | /* | 6201 | /* |
5614 | * If the parent's context is a clone, pin it so it won't get | 6202 | * If the parent's context is a clone, pin it so it won't get |
5615 | * swapped under us. | 6203 | * swapped under us. |
5616 | */ | 6204 | */ |
5617 | parent_ctx = perf_pin_task_context(parent); | 6205 | parent_ctx = perf_pin_task_context(parent, ctxn); |
5618 | 6206 | ||
5619 | /* | 6207 | /* |
5620 | * No need to check if parent_ctx != NULL here; since we saw | 6208 | * No need to check if parent_ctx != NULL here; since we saw |
@@ -5634,20 +6222,33 @@ int perf_event_init_task(struct task_struct *child) | |||
5634 | * the list, not manipulating it: | 6222 | * the list, not manipulating it: |
5635 | */ | 6223 | */ |
5636 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 6224 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
5637 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6225 | ret = inherit_task_group(event, parent, parent_ctx, |
5638 | &inherited_all); | 6226 | child, ctxn, &inherited_all); |
5639 | if (ret) | 6227 | if (ret) |
5640 | break; | 6228 | break; |
5641 | } | 6229 | } |
5642 | 6230 | ||
6231 | /* | ||
6232 | * We can't hold ctx->lock when iterating the ->flexible_group list due | ||
6233 | * to allocations, but we need to prevent rotation because | ||
6234 | * rotate_ctx() will change the list from interrupt context. | ||
6235 | */ | ||
6236 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
6237 | parent_ctx->rotate_disable = 1; | ||
6238 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6239 | |||
5643 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6240 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
5644 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6241 | ret = inherit_task_group(event, parent, parent_ctx, |
5645 | &inherited_all); | 6242 | child, ctxn, &inherited_all); |
5646 | if (ret) | 6243 | if (ret) |
5647 | break; | 6244 | break; |
5648 | } | 6245 | } |
5649 | 6246 | ||
5650 | child_ctx = child->perf_event_ctxp; | 6247 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); |
6248 | parent_ctx->rotate_disable = 0; | ||
6249 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6250 | |||
6251 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
5651 | 6252 | ||
5652 | if (child_ctx && inherited_all) { | 6253 | if (child_ctx && inherited_all) { |
5653 | /* | 6254 | /* |
@@ -5676,63 +6277,98 @@ int perf_event_init_task(struct task_struct *child) | |||
5676 | return ret; | 6277 | return ret; |
5677 | } | 6278 | } |
5678 | 6279 | ||
6280 | /* | ||
6281 | * Initialize the perf_event context in task_struct | ||
6282 | */ | ||
6283 | int perf_event_init_task(struct task_struct *child) | ||
6284 | { | ||
6285 | int ctxn, ret; | ||
6286 | |||
6287 | for_each_task_context_nr(ctxn) { | ||
6288 | ret = perf_event_init_context(child, ctxn); | ||
6289 | if (ret) | ||
6290 | return ret; | ||
6291 | } | ||
6292 | |||
6293 | return 0; | ||
6294 | } | ||
6295 | |||
5679 | static void __init perf_event_init_all_cpus(void) | 6296 | static void __init perf_event_init_all_cpus(void) |
5680 | { | 6297 | { |
6298 | struct swevent_htable *swhash; | ||
5681 | int cpu; | 6299 | int cpu; |
5682 | struct perf_cpu_context *cpuctx; | ||
5683 | 6300 | ||
5684 | for_each_possible_cpu(cpu) { | 6301 | for_each_possible_cpu(cpu) { |
5685 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 6302 | swhash = &per_cpu(swevent_htable, cpu); |
5686 | mutex_init(&cpuctx->hlist_mutex); | 6303 | mutex_init(&swhash->hlist_mutex); |
5687 | __perf_event_init_context(&cpuctx->ctx, NULL); | 6304 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); |
5688 | } | 6305 | } |
5689 | } | 6306 | } |
5690 | 6307 | ||
5691 | static void __cpuinit perf_event_init_cpu(int cpu) | 6308 | static void __cpuinit perf_event_init_cpu(int cpu) |
5692 | { | 6309 | { |
5693 | struct perf_cpu_context *cpuctx; | 6310 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5694 | |||
5695 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5696 | 6311 | ||
5697 | spin_lock(&perf_resource_lock); | 6312 | mutex_lock(&swhash->hlist_mutex); |
5698 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; | 6313 | if (swhash->hlist_refcount > 0) { |
5699 | spin_unlock(&perf_resource_lock); | ||
5700 | |||
5701 | mutex_lock(&cpuctx->hlist_mutex); | ||
5702 | if (cpuctx->hlist_refcount > 0) { | ||
5703 | struct swevent_hlist *hlist; | 6314 | struct swevent_hlist *hlist; |
5704 | 6315 | ||
5705 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 6316 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); |
5706 | WARN_ON_ONCE(!hlist); | 6317 | WARN_ON(!hlist); |
5707 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 6318 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
5708 | } | 6319 | } |
5709 | mutex_unlock(&cpuctx->hlist_mutex); | 6320 | mutex_unlock(&swhash->hlist_mutex); |
5710 | } | 6321 | } |
5711 | 6322 | ||
5712 | #ifdef CONFIG_HOTPLUG_CPU | 6323 | #ifdef CONFIG_HOTPLUG_CPU |
5713 | static void __perf_event_exit_cpu(void *info) | 6324 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
5714 | { | 6325 | { |
5715 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 6326 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
5716 | struct perf_event_context *ctx = &cpuctx->ctx; | 6327 | |
6328 | WARN_ON(!irqs_disabled()); | ||
6329 | |||
6330 | list_del_init(&cpuctx->rotation_list); | ||
6331 | } | ||
6332 | |||
6333 | static void __perf_event_exit_context(void *__info) | ||
6334 | { | ||
6335 | struct perf_event_context *ctx = __info; | ||
5717 | struct perf_event *event, *tmp; | 6336 | struct perf_event *event, *tmp; |
5718 | 6337 | ||
6338 | perf_pmu_rotate_stop(ctx->pmu); | ||
6339 | |||
5719 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 6340 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
5720 | __perf_event_remove_from_context(event); | 6341 | __perf_event_remove_from_context(event); |
5721 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 6342 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
5722 | __perf_event_remove_from_context(event); | 6343 | __perf_event_remove_from_context(event); |
5723 | } | 6344 | } |
6345 | |||
6346 | static void perf_event_exit_cpu_context(int cpu) | ||
6347 | { | ||
6348 | struct perf_event_context *ctx; | ||
6349 | struct pmu *pmu; | ||
6350 | int idx; | ||
6351 | |||
6352 | idx = srcu_read_lock(&pmus_srcu); | ||
6353 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
6354 | ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; | ||
6355 | |||
6356 | mutex_lock(&ctx->mutex); | ||
6357 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); | ||
6358 | mutex_unlock(&ctx->mutex); | ||
6359 | } | ||
6360 | srcu_read_unlock(&pmus_srcu, idx); | ||
6361 | } | ||
6362 | |||
5724 | static void perf_event_exit_cpu(int cpu) | 6363 | static void perf_event_exit_cpu(int cpu) |
5725 | { | 6364 | { |
5726 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 6365 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5727 | struct perf_event_context *ctx = &cpuctx->ctx; | ||
5728 | 6366 | ||
5729 | mutex_lock(&cpuctx->hlist_mutex); | 6367 | mutex_lock(&swhash->hlist_mutex); |
5730 | swevent_hlist_release(cpuctx); | 6368 | swevent_hlist_release(swhash); |
5731 | mutex_unlock(&cpuctx->hlist_mutex); | 6369 | mutex_unlock(&swhash->hlist_mutex); |
5732 | 6370 | ||
5733 | mutex_lock(&ctx->mutex); | 6371 | perf_event_exit_cpu_context(cpu); |
5734 | smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); | ||
5735 | mutex_unlock(&ctx->mutex); | ||
5736 | } | 6372 | } |
5737 | #else | 6373 | #else |
5738 | static inline void perf_event_exit_cpu(int cpu) { } | 6374 | static inline void perf_event_exit_cpu(int cpu) { } |
@@ -5743,15 +6379,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5743 | { | 6379 | { |
5744 | unsigned int cpu = (long)hcpu; | 6380 | unsigned int cpu = (long)hcpu; |
5745 | 6381 | ||
5746 | switch (action) { | 6382 | switch (action & ~CPU_TASKS_FROZEN) { |
5747 | 6383 | ||
5748 | case CPU_UP_PREPARE: | 6384 | case CPU_UP_PREPARE: |
5749 | case CPU_UP_PREPARE_FROZEN: | 6385 | case CPU_DOWN_FAILED: |
5750 | perf_event_init_cpu(cpu); | 6386 | perf_event_init_cpu(cpu); |
5751 | break; | 6387 | break; |
5752 | 6388 | ||
6389 | case CPU_UP_CANCELED: | ||
5753 | case CPU_DOWN_PREPARE: | 6390 | case CPU_DOWN_PREPARE: |
5754 | case CPU_DOWN_PREPARE_FROZEN: | ||
5755 | perf_event_exit_cpu(cpu); | 6391 | perf_event_exit_cpu(cpu); |
5756 | break; | 6392 | break; |
5757 | 6393 | ||
@@ -5762,118 +6398,18 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5762 | return NOTIFY_OK; | 6398 | return NOTIFY_OK; |
5763 | } | 6399 | } |
5764 | 6400 | ||
5765 | /* | ||
5766 | * This has to have a higher priority than migration_notifier in sched.c. | ||
5767 | */ | ||
5768 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
5769 | .notifier_call = perf_cpu_notify, | ||
5770 | .priority = 20, | ||
5771 | }; | ||
5772 | |||
5773 | void __init perf_event_init(void) | 6401 | void __init perf_event_init(void) |
5774 | { | 6402 | { |
5775 | perf_event_init_all_cpus(); | 6403 | int ret; |
5776 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | ||
5777 | (void *)(long)smp_processor_id()); | ||
5778 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, | ||
5779 | (void *)(long)smp_processor_id()); | ||
5780 | register_cpu_notifier(&perf_cpu_nb); | ||
5781 | } | ||
5782 | |||
5783 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, | ||
5784 | struct sysdev_class_attribute *attr, | ||
5785 | char *buf) | ||
5786 | { | ||
5787 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
5788 | } | ||
5789 | |||
5790 | static ssize_t | ||
5791 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
5792 | struct sysdev_class_attribute *attr, | ||
5793 | const char *buf, | ||
5794 | size_t count) | ||
5795 | { | ||
5796 | struct perf_cpu_context *cpuctx; | ||
5797 | unsigned long val; | ||
5798 | int err, cpu, mpt; | ||
5799 | |||
5800 | err = strict_strtoul(buf, 10, &val); | ||
5801 | if (err) | ||
5802 | return err; | ||
5803 | if (val > perf_max_events) | ||
5804 | return -EINVAL; | ||
5805 | |||
5806 | spin_lock(&perf_resource_lock); | ||
5807 | perf_reserved_percpu = val; | ||
5808 | for_each_online_cpu(cpu) { | ||
5809 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5810 | raw_spin_lock_irq(&cpuctx->ctx.lock); | ||
5811 | mpt = min(perf_max_events - cpuctx->ctx.nr_events, | ||
5812 | perf_max_events - perf_reserved_percpu); | ||
5813 | cpuctx->max_pertask = mpt; | ||
5814 | raw_spin_unlock_irq(&cpuctx->ctx.lock); | ||
5815 | } | ||
5816 | spin_unlock(&perf_resource_lock); | ||
5817 | |||
5818 | return count; | ||
5819 | } | ||
5820 | |||
5821 | static ssize_t perf_show_overcommit(struct sysdev_class *class, | ||
5822 | struct sysdev_class_attribute *attr, | ||
5823 | char *buf) | ||
5824 | { | ||
5825 | return sprintf(buf, "%d\n", perf_overcommit); | ||
5826 | } | ||
5827 | |||
5828 | static ssize_t | ||
5829 | perf_set_overcommit(struct sysdev_class *class, | ||
5830 | struct sysdev_class_attribute *attr, | ||
5831 | const char *buf, size_t count) | ||
5832 | { | ||
5833 | unsigned long val; | ||
5834 | int err; | ||
5835 | |||
5836 | err = strict_strtoul(buf, 10, &val); | ||
5837 | if (err) | ||
5838 | return err; | ||
5839 | if (val > 1) | ||
5840 | return -EINVAL; | ||
5841 | |||
5842 | spin_lock(&perf_resource_lock); | ||
5843 | perf_overcommit = val; | ||
5844 | spin_unlock(&perf_resource_lock); | ||
5845 | |||
5846 | return count; | ||
5847 | } | ||
5848 | |||
5849 | static SYSDEV_CLASS_ATTR( | ||
5850 | reserve_percpu, | ||
5851 | 0644, | ||
5852 | perf_show_reserve_percpu, | ||
5853 | perf_set_reserve_percpu | ||
5854 | ); | ||
5855 | |||
5856 | static SYSDEV_CLASS_ATTR( | ||
5857 | overcommit, | ||
5858 | 0644, | ||
5859 | perf_show_overcommit, | ||
5860 | perf_set_overcommit | ||
5861 | ); | ||
5862 | |||
5863 | static struct attribute *perfclass_attrs[] = { | ||
5864 | &attr_reserve_percpu.attr, | ||
5865 | &attr_overcommit.attr, | ||
5866 | NULL | ||
5867 | }; | ||
5868 | 6404 | ||
5869 | static struct attribute_group perfclass_attr_group = { | 6405 | perf_event_init_all_cpus(); |
5870 | .attrs = perfclass_attrs, | 6406 | init_srcu_struct(&pmus_srcu); |
5871 | .name = "perf_events", | 6407 | perf_pmu_register(&perf_swevent); |
5872 | }; | 6408 | perf_pmu_register(&perf_cpu_clock); |
6409 | perf_pmu_register(&perf_task_clock); | ||
6410 | perf_tp_register(); | ||
6411 | perf_cpu_notifier(perf_cpu_notify); | ||
5873 | 6412 | ||
5874 | static int __init perf_event_sysfs_init(void) | 6413 | ret = init_hw_breakpoint(); |
5875 | { | 6414 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
5876 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
5877 | &perfclass_attr_group); | ||
5878 | } | 6415 | } |
5879 | device_initcall(perf_event_sysfs_init); | ||