Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c  2628
1 file changed, 1543 insertions(+), 1085 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 403d1804b198..f309e8014c78 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
35 34
36#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
37 36
38/* 37atomic_t perf_task_events __read_mostly;
39 * Each CPU has a list of per CPU events:
40 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
42
43int perf_max_events __read_mostly = 1;
44static int perf_reserved_percpu __read_mostly;
45static int perf_overcommit __read_mostly = 1;
46
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 38static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 39static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 40static atomic_t nr_task_events __read_mostly;
51 41
42static LIST_HEAD(pmus);
43static DEFINE_MUTEX(pmus_lock);
44static struct srcu_struct pmus_srcu;
45
52/* 46/*
53 * perf event paranoia level: 47 * perf event paranoia level:
54 * -1 - not paranoid at all 48 * -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
67 61
68static atomic64_t perf_event_id; 62static atomic64_t perf_event_id;
69 63
70/* 64void __weak perf_event_print_debug(void) { }
71 * Lock for (sysadmin-configurable) event reservations:
72 */
73static DEFINE_SPINLOCK(perf_resource_lock);
74 65
75/* 66extern __weak const char *perf_pmu_name(void)
76 * Architecture provided APIs - weak aliases:
77 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
79{ 67{
80 return NULL; 68 return "pmu";
81} 69}
82 70
83void __weak hw_perf_disable(void) { barrier(); } 71void perf_pmu_disable(struct pmu *pmu)
84void __weak hw_perf_enable(void) { barrier(); } 72{
85 73 int *count = this_cpu_ptr(pmu->pmu_disable_count);
86void __weak perf_event_print_debug(void) { } 74 if (!(*count)++)
87 75 pmu->pmu_disable(pmu);
88static DEFINE_PER_CPU(int, perf_disable_count); 76}
89 77
90void perf_disable(void) 78void perf_pmu_enable(struct pmu *pmu)
91{ 79{
92 if (!__get_cpu_var(perf_disable_count)++) 80 int *count = this_cpu_ptr(pmu->pmu_disable_count);
93 hw_perf_disable(); 81 if (!--(*count))
82 pmu->pmu_enable(pmu);
94} 83}
95 84
96void perf_enable(void) 85static DEFINE_PER_CPU(struct list_head, rotation_list);
86
87/*
88 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
89 * because they're strictly cpu affine and rotate_start is called with IRQs
90 * disabled, while rotate_context is called from IRQ context.
91 */
92static void perf_pmu_rotate_start(struct pmu *pmu)
97{ 93{
98 if (!--__get_cpu_var(perf_disable_count)) 94 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99 hw_perf_enable(); 95 struct list_head *head = &__get_cpu_var(rotation_list);
96
97 WARN_ON(!irqs_disabled());
98
99 if (list_empty(&cpuctx->rotation_list))
100 list_add(&cpuctx->rotation_list, head);
100} 101}
101 102
102static void get_ctx(struct perf_event_context *ctx) 103static void get_ctx(struct perf_event_context *ctx)
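Note on the hunk above: perf_pmu_disable()/perf_pmu_enable() replace the global hw_perf_disable()/hw_perf_enable() weak hooks with a per-PMU nesting counter (per-CPU in the kernel, via this_cpu_ptr(pmu->pmu_disable_count)); only the outermost disable touches the hardware, and only the matching final enable re-arms it. A minimal user-space sketch of that nesting pattern (made-up names, not kernel code):

#include <stdio.h>

/* Stand-in for a PMU whose counters can be globally stopped/started. */
static void hw_disable(void) { puts("pmu: hardware disabled"); }
static void hw_enable(void)  { puts("pmu: hardware enabled"); }

struct toy_pmu {
    int disable_count;          /* nesting depth, 0 means enabled */
};

static void pmu_disable(struct toy_pmu *pmu)
{
    if (!pmu->disable_count++)  /* only the outermost caller stops the hw */
        hw_disable();
}

static void pmu_enable(struct toy_pmu *pmu)
{
    if (!--pmu->disable_count)  /* restart only when fully unwound */
        hw_enable();
}

int main(void)
{
    struct toy_pmu pmu = { 0 };

    pmu_disable(&pmu);          /* prints "hardware disabled" */
    pmu_disable(&pmu);          /* nested, no output          */
    pmu_enable(&pmu);           /* nested, no output          */
    pmu_enable(&pmu);           /* prints "hardware enabled"  */
    return 0;
}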
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 152 * the context could get moved to another task.
152 */ 153 */
153static struct perf_event_context * 154static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 155perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 156{
156 struct perf_event_context *ctx; 157 struct perf_event_context *ctx;
157 158
158 rcu_read_lock(); 159 rcu_read_lock();
159 retry: 160retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 161 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 162 if (ctx) {
162 /* 163 /*
163 * If this context is a clone of another, it might 164 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 171 * can't get swapped on us any more.
171 */ 172 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 173 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 174 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 175 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 176 goto retry;
176 } 177 }
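perf_lock_task_context() above is the usual lock-and-recheck loop: dereference the context pointer under rcu_read_lock(), take ctx->lock, then verify the task still points at the same context and retry if it was swapped in the meantime. A self-contained sketch of the recheck-and-retry shape using C11 atomics and a pthread mutex (it deliberately leaves out the RCU part that keeps the object from being freed):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct ctx {
    pthread_mutex_t lock;
    int id;
};

/* Pointer that a concurrent "context switch" may re-point elsewhere. */
static _Atomic(struct ctx *) task_ctx;

/* Return the current ctx with its lock held, or NULL if there is none. */
static struct ctx *lock_task_ctx(void)
{
    struct ctx *ctx;

retry:
    ctx = atomic_load(&task_ctx);
    if (!ctx)
        return NULL;

    pthread_mutex_lock(&ctx->lock);
    /* The pointer may have moved between the load and the lock. */
    if (ctx != atomic_load(&task_ctx)) {
        pthread_mutex_unlock(&ctx->lock);
        goto retry;
    }
    return ctx;
}

int main(void)
{
    struct ctx c = { PTHREAD_MUTEX_INITIALIZER, 42 };
    struct ctx *locked;

    atomic_store(&task_ctx, &c);
    locked = lock_task_ctx();
    printf("locked ctx %d\n", locked->id);
    pthread_mutex_unlock(&locked->lock);
    return 0;
}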
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 190 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 191 * reference count so that the context can't get freed.
191 */ 192 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 193static struct perf_event_context *
194perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 195{
194 struct perf_event_context *ctx; 196 struct perf_event_context *ctx;
195 unsigned long flags; 197 unsigned long flags;
196 198
197 ctx = perf_lock_task_context(task, &flags); 199 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 200 if (ctx) {
199 ++ctx->pin_count; 201 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 202 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
302 } 304 }
303 305
304 list_add_rcu(&event->event_entry, &ctx->event_list); 306 list_add_rcu(&event->event_entry, &ctx->event_list);
307 if (!ctx->nr_events)
308 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 309 ctx->nr_events++;
306 if (event->attr.inherit_stat) 310 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 311 ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
311{ 315{
312 struct perf_event *group_leader = event->group_leader; 316 struct perf_event *group_leader = event->group_leader;
313 317
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); 318 /*
319 * We can have double attach due to group movement in perf_event_open.
320 */
321 if (event->attach_state & PERF_ATTACH_GROUP)
322 return;
323
315 event->attach_state |= PERF_ATTACH_GROUP; 324 event->attach_state |= PERF_ATTACH_GROUP;
316 325
317 if (group_leader == event) 326 if (group_leader == event)
@@ -402,21 +411,40 @@ static void perf_group_detach(struct perf_event *event)
402 } 411 }
403} 412}
404 413
405static void 414static inline int
406event_sched_out(struct perf_event *event, 415event_filter_match(struct perf_event *event)
416{
417 return event->cpu == -1 || event->cpu == smp_processor_id();
418}
419
420static int
421__event_sched_out(struct perf_event *event,
407 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
408 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
409{ 424{
425 u64 delta;
426 /*
427 * An event which could not be activated because of
428 * filter mismatch still needs to have its timings
429 * maintained, otherwise bogus information is return
430 * via read() for time_enabled, time_running:
431 */
432 if (event->state == PERF_EVENT_STATE_INACTIVE
433 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time;
437 }
438
410 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
411 return; 440 return 0;
412 441
413 event->state = PERF_EVENT_STATE_INACTIVE; 442 event->state = PERF_EVENT_STATE_INACTIVE;
414 if (event->pending_disable) { 443 if (event->pending_disable) {
415 event->pending_disable = 0; 444 event->pending_disable = 0;
416 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
417 } 446 }
418 event->tstamp_stopped = ctx->time; 447 event->pmu->del(event, 0);
419 event->pmu->disable(event);
420 event->oncpu = -1; 448 event->oncpu = -1;
421 449
422 if (!is_software_event(event)) 450 if (!is_software_event(event))
@@ -424,6 +452,19 @@ event_sched_out(struct perf_event *event,
424 ctx->nr_active--; 452 ctx->nr_active--;
425 if (event->attr.exclusive || !cpuctx->active_oncpu) 453 if (event->attr.exclusive || !cpuctx->active_oncpu)
426 cpuctx->exclusive = 0; 454 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
427} 468}
428 469
429static void 470static void
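The __event_sched_out()/event_sched_out() split above exists so that tstamp_stopped only advances for events that were actually running, while the new event_filter_match() branch keeps tstamp_running/tstamp_stopped moving for events left INACTIVE by a CPU filter mismatch, so read() keeps reporting sane time_enabled/time_running values. The bookkeeping can be modelled roughly like this (illustrative field names, times as plain integers, not the kernel structures):

#include <stdio.h>

/* Rough model of the tstamp_* accounting (u64 nanoseconds in the kernel). */
struct times {
    unsigned long long tstamp_enabled;   /* when counting was enabled      */
    unsigned long long tstamp_running;   /* "virtual" start of running     */
    unsigned long long tstamp_stopped;   /* last time we stopped           */
    int active;
};

static void sched_in(struct times *t, unsigned long long now)
{
    /* skip the gap spent scheduled out, as event_sched_in() does */
    t->tstamp_running += now - t->tstamp_stopped;
    t->active = 1;
}

static void sched_out(struct times *t, unsigned long long now)
{
    t->tstamp_stopped = now;
    t->active = 0;
}

static unsigned long long time_enabled(struct times *t, unsigned long long now)
{
    return now - t->tstamp_enabled;
}

static unsigned long long time_running(struct times *t, unsigned long long now)
{
    unsigned long long run_end = t->active ? now : t->tstamp_stopped;
    return run_end - t->tstamp_running;
}

int main(void)
{
    struct times t = { 100, 100, 100, 0 };

    sched_in(&t, 100);
    sched_out(&t, 150);          /* ran for 50                       */
    sched_in(&t, 200);           /* 50 spent scheduled out           */
    printf("enabled=%llu running=%llu\n",
           time_enabled(&t, 250), time_running(&t, 250));  /* 150 / 100 */
    return 0;
}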
@@ -432,9 +473,7 @@ group_sched_out(struct perf_event *group_event,
432 struct perf_event_context *ctx) 473 struct perf_event_context *ctx)
433{ 474{
434 struct perf_event *event; 475 struct perf_event *event;
435 476 int state = group_event->state;
436 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
437 return;
438 477
439 event_sched_out(group_event, cpuctx, ctx); 478 event_sched_out(group_event, cpuctx, ctx);
440 479
@@ -444,10 +483,16 @@ group_sched_out(struct perf_event *group_event,
444 list_for_each_entry(event, &group_event->sibling_list, group_entry) 483 list_for_each_entry(event, &group_event->sibling_list, group_entry)
445 event_sched_out(event, cpuctx, ctx); 484 event_sched_out(event, cpuctx, ctx);
446 485
447 if (group_event->attr.exclusive) 486 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
448 cpuctx->exclusive = 0; 487 cpuctx->exclusive = 0;
449} 488}
450 489
490static inline struct perf_cpu_context *
491__get_cpu_context(struct perf_event_context *ctx)
492{
493 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
494}
495
451/* 496/*
452 * Cross CPU call to remove a performance event 497 * Cross CPU call to remove a performance event
453 * 498 *
@@ -456,9 +501,9 @@ group_sched_out(struct perf_event *group_event,
456 */ 501 */
457static void __perf_event_remove_from_context(void *info) 502static void __perf_event_remove_from_context(void *info)
458{ 503{
459 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
460 struct perf_event *event = info; 504 struct perf_event *event = info;
461 struct perf_event_context *ctx = event->ctx; 505 struct perf_event_context *ctx = event->ctx;
506 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
462 507
463 /* 508 /*
464 * If this is a task context, we need to check whether it is 509 * If this is a task context, we need to check whether it is
@@ -469,27 +514,11 @@ static void __perf_event_remove_from_context(void *info)
469 return; 514 return;
470 515
471 raw_spin_lock(&ctx->lock); 516 raw_spin_lock(&ctx->lock);
472 /*
473 * Protect the list operation against NMI by disabling the
474 * events on a global level.
475 */
476 perf_disable();
477 517
478 event_sched_out(event, cpuctx, ctx); 518 event_sched_out(event, cpuctx, ctx);
479 519
480 list_del_event(event, ctx); 520 list_del_event(event, ctx);
481 521
482 if (!ctx->task) {
483 /*
484 * Allow more per task events with respect to the
485 * reservation:
486 */
487 cpuctx->max_pertask =
488 min(perf_max_events - ctx->nr_events,
489 perf_max_events - perf_reserved_percpu);
490 }
491
492 perf_enable();
493 raw_spin_unlock(&ctx->lock); 522 raw_spin_unlock(&ctx->lock);
494} 523}
495 524
@@ -554,8 +583,8 @@ retry:
554static void __perf_event_disable(void *info) 583static void __perf_event_disable(void *info)
555{ 584{
556 struct perf_event *event = info; 585 struct perf_event *event = info;
557 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
558 struct perf_event_context *ctx = event->ctx; 586 struct perf_event_context *ctx = event->ctx;
587 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
559 588
560 /* 589 /*
561 * If this is a per-task event, need to check whether this 590 * If this is a per-task event, need to check whether this
@@ -610,7 +639,7 @@ void perf_event_disable(struct perf_event *event)
610 return; 639 return;
611 } 640 }
612 641
613 retry: 642retry:
614 task_oncpu_function_call(task, __perf_event_disable, event); 643 task_oncpu_function_call(task, __perf_event_disable, event);
615 644
616 raw_spin_lock_irq(&ctx->lock); 645 raw_spin_lock_irq(&ctx->lock);
@@ -635,7 +664,7 @@ void perf_event_disable(struct perf_event *event)
635} 664}
636 665
637static int 666static int
638event_sched_in(struct perf_event *event, 667__event_sched_in(struct perf_event *event,
639 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
640 struct perf_event_context *ctx) 669 struct perf_event_context *ctx)
641{ 670{
@@ -649,14 +678,12 @@ event_sched_in(struct perf_event *event,
649 */ 678 */
650 smp_wmb(); 679 smp_wmb();
651 680
652 if (event->pmu->enable(event)) { 681 if (event->pmu->add(event, PERF_EF_START)) {
653 event->state = PERF_EVENT_STATE_INACTIVE; 682 event->state = PERF_EVENT_STATE_INACTIVE;
654 event->oncpu = -1; 683 event->oncpu = -1;
655 return -EAGAIN; 684 return -EAGAIN;
656 } 685 }
657 686
658 event->tstamp_running += ctx->time - event->tstamp_stopped;
659
660 if (!is_software_event(event)) 687 if (!is_software_event(event))
661 cpuctx->active_oncpu++; 688 cpuctx->active_oncpu++;
662 ctx->nr_active++; 689 ctx->nr_active++;
@@ -667,28 +694,56 @@ event_sched_in(struct perf_event *event,
667 return 0; 694 return 0;
668} 695}
669 696
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
670static int 726static int
671group_sched_in(struct perf_event *group_event, 727group_sched_in(struct perf_event *group_event,
672 struct perf_cpu_context *cpuctx, 728 struct perf_cpu_context *cpuctx,
673 struct perf_event_context *ctx) 729 struct perf_event_context *ctx)
674{ 730{
675 struct perf_event *event, *partial_group = NULL; 731 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 732 struct pmu *pmu = group_event->pmu;
677 bool txn = false;
678 733
679 if (group_event->state == PERF_EVENT_STATE_OFF) 734 if (group_event->state == PERF_EVENT_STATE_OFF)
680 return 0; 735 return 0;
681 736
682 /* Check if group transaction availabe */ 737 pmu->start_txn(pmu);
683 if (pmu->start_txn)
684 txn = true;
685
686 if (txn)
687 pmu->start_txn(pmu);
688 738
689 if (event_sched_in(group_event, cpuctx, ctx)) { 739 /*
690 if (txn) 740 * use __event_sched_in() to delay updating tstamp_running
691 pmu->cancel_txn(pmu); 741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu);
692 return -EAGAIN; 747 return -EAGAIN;
693 } 748 }
694 749
@@ -696,29 +751,33 @@ group_sched_in(struct perf_event *group_event,
696 * Schedule in siblings as one group (if any): 751 * Schedule in siblings as one group (if any):
697 */ 752 */
698 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 753 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
699 if (event_sched_in(event, cpuctx, ctx)) { 754 if (__event_sched_in(event, cpuctx, ctx)) {
700 partial_group = event; 755 partial_group = event;
701 goto group_error; 756 goto group_error;
702 } 757 }
703 } 758 }
704 759
705 if (!txn || !pmu->commit_txn(pmu)) 760 if (!pmu->commit_txn(pmu)) {
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
706 return 0; 763 return 0;
707 764 }
708group_error: 765group_error:
709 /* 766 /*
710 * Groups can be scheduled in as one unit only, so undo any 767 * Groups can be scheduled in as one unit only, so undo any
711 * partial group before returning: 768 * partial group before returning:
769 *
770 * use __event_sched_out() to avoid updating tstamp_stopped
771 * because the event never actually ran
712 */ 772 */
713 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 773 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
714 if (event == partial_group) 774 if (event == partial_group)
715 break; 775 break;
716 event_sched_out(event, cpuctx, ctx); 776 __event_sched_out(event, cpuctx, ctx);
717 } 777 }
718 event_sched_out(group_event, cpuctx, ctx); 778 __event_sched_out(group_event, cpuctx, ctx);
719 779
720 if (txn) 780 pmu->cancel_txn(pmu);
721 pmu->cancel_txn(pmu);
722 781
723 return -EAGAIN; 782 return -EAGAIN;
724} 783}
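group_sched_in() now drives every group through pmu->start_txn()/commit_txn()/cancel_txn(): all members are added under one transaction, and a failed add or commit unwinds the partial group with __event_sched_out(), so tstamp_running is only committed (via group_commit_event_sched_in()) for groups that really went on. The all-or-nothing shape, reduced to a toy counter allocator with hypothetical helpers (not the real callbacks):

#include <stdio.h>

#define NR_SLOTS 4

/* Illustrative "PMU" with a fixed number of counters and a staging count. */
struct toy_pmu {
    int used;       /* committed counters          */
    int staged;     /* added in the open transaction */
};

static void start_txn(struct toy_pmu *p)  { p->staged = 0; }
static void cancel_txn(struct toy_pmu *p) { p->staged = 0; }
static int  commit_txn(struct toy_pmu *p) { p->used += p->staged; p->staged = 0; return 0; }

static int add_event(struct toy_pmu *p)
{
    if (p->used + p->staged == NR_SLOTS)
        return -1;              /* group member would not fit */
    p->staged++;
    return 0;
}

/* Schedule a group of 'nr' events as one unit, like group_sched_in(). */
static int group_in(struct toy_pmu *p, int nr)
{
    int i;

    start_txn(p);
    for (i = 0; i < nr; i++) {
        if (add_event(p)) {
            cancel_txn(p);      /* undo the partial group */
            return -1;
        }
    }
    return commit_txn(p);
}

int main(void)
{
    struct toy_pmu p = { 0, 0 };
    int ret;

    ret = group_in(&p, 3);
    printf("group of 3: %d, counters used %d\n", ret, p.used);  /*  0, 3 */
    ret = group_in(&p, 2);
    printf("group of 2: %d, counters used %d\n", ret, p.used);  /* -1, 3 */
    return 0;
}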
@@ -771,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event,
771 */ 830 */
772static void __perf_install_in_context(void *info) 831static void __perf_install_in_context(void *info)
773{ 832{
774 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
775 struct perf_event *event = info; 833 struct perf_event *event = info;
776 struct perf_event_context *ctx = event->ctx; 834 struct perf_event_context *ctx = event->ctx;
777 struct perf_event *leader = event->group_leader; 835 struct perf_event *leader = event->group_leader;
836 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
778 int err; 837 int err;
779 838
780 /* 839 /*
@@ -794,12 +853,6 @@ static void __perf_install_in_context(void *info)
794 ctx->is_active = 1; 853 ctx->is_active = 1;
795 update_context_time(ctx); 854 update_context_time(ctx);
796 855
797 /*
798 * Protect the list operation against NMI by disabling the
799 * events on a global level. NOP for non NMI based events.
800 */
801 perf_disable();
802
803 add_event_to_ctx(event, ctx); 856 add_event_to_ctx(event, ctx);
804 857
805 if (event->cpu != -1 && event->cpu != smp_processor_id()) 858 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -837,12 +890,7 @@ static void __perf_install_in_context(void *info)
837 } 890 }
838 } 891 }
839 892
840 if (!err && !ctx->task && cpuctx->max_pertask) 893unlock:
841 cpuctx->max_pertask--;
842
843 unlock:
844 perf_enable();
845
846 raw_spin_unlock(&ctx->lock); 894 raw_spin_unlock(&ctx->lock);
847} 895}
848 896
@@ -865,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx,
865{ 913{
866 struct task_struct *task = ctx->task; 914 struct task_struct *task = ctx->task;
867 915
916 event->ctx = ctx;
917
868 if (!task) { 918 if (!task) {
869 /* 919 /*
870 * Per cpu events are installed via an smp call and 920 * Per cpu events are installed via an smp call and
@@ -913,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
913 963
914 event->state = PERF_EVENT_STATE_INACTIVE; 964 event->state = PERF_EVENT_STATE_INACTIVE;
915 event->tstamp_enabled = ctx->time - event->total_time_enabled; 965 event->tstamp_enabled = ctx->time - event->total_time_enabled;
916 list_for_each_entry(sub, &event->sibling_list, group_entry) 966 list_for_each_entry(sub, &event->sibling_list, group_entry) {
917 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 967 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
918 sub->tstamp_enabled = 968 sub->tstamp_enabled =
919 ctx->time - sub->total_time_enabled; 969 ctx->time - sub->total_time_enabled;
970 }
971 }
920} 972}
921 973
922/* 974/*
@@ -925,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
925static void __perf_event_enable(void *info) 977static void __perf_event_enable(void *info)
926{ 978{
927 struct perf_event *event = info; 979 struct perf_event *event = info;
928 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
929 struct perf_event_context *ctx = event->ctx; 980 struct perf_event_context *ctx = event->ctx;
930 struct perf_event *leader = event->group_leader; 981 struct perf_event *leader = event->group_leader;
982 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
931 int err; 983 int err;
932 984
933 /* 985 /*
@@ -961,12 +1013,10 @@ static void __perf_event_enable(void *info)
961 if (!group_can_go_on(event, cpuctx, 1)) { 1013 if (!group_can_go_on(event, cpuctx, 1)) {
962 err = -EEXIST; 1014 err = -EEXIST;
963 } else { 1015 } else {
964 perf_disable();
965 if (event == leader) 1016 if (event == leader)
966 err = group_sched_in(event, cpuctx, ctx); 1017 err = group_sched_in(event, cpuctx, ctx);
967 else 1018 else
968 err = event_sched_in(event, cpuctx, ctx); 1019 err = event_sched_in(event, cpuctx, ctx);
969 perf_enable();
970 } 1020 }
971 1021
972 if (err) { 1022 if (err) {
@@ -982,7 +1032,7 @@ static void __perf_event_enable(void *info)
982 } 1032 }
983 } 1033 }
984 1034
985 unlock: 1035unlock:
986 raw_spin_unlock(&ctx->lock); 1036 raw_spin_unlock(&ctx->lock);
987} 1037}
988 1038
@@ -1023,7 +1073,7 @@ void perf_event_enable(struct perf_event *event)
1023 if (event->state == PERF_EVENT_STATE_ERROR) 1073 if (event->state == PERF_EVENT_STATE_ERROR)
1024 event->state = PERF_EVENT_STATE_OFF; 1074 event->state = PERF_EVENT_STATE_OFF;
1025 1075
1026 retry: 1076retry:
1027 raw_spin_unlock_irq(&ctx->lock); 1077 raw_spin_unlock_irq(&ctx->lock);
1028 task_oncpu_function_call(task, __perf_event_enable, event); 1078 task_oncpu_function_call(task, __perf_event_enable, event);
1029 1079
@@ -1043,7 +1093,7 @@ void perf_event_enable(struct perf_event *event)
1043 if (event->state == PERF_EVENT_STATE_OFF) 1093 if (event->state == PERF_EVENT_STATE_OFF)
1044 __perf_event_mark_enabled(event, ctx); 1094 __perf_event_mark_enabled(event, ctx);
1045 1095
1046 out: 1096out:
1047 raw_spin_unlock_irq(&ctx->lock); 1097 raw_spin_unlock_irq(&ctx->lock);
1048} 1098}
1049 1099
@@ -1074,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1074 struct perf_event *event; 1124 struct perf_event *event;
1075 1125
1076 raw_spin_lock(&ctx->lock); 1126 raw_spin_lock(&ctx->lock);
1127 perf_pmu_disable(ctx->pmu);
1077 ctx->is_active = 0; 1128 ctx->is_active = 0;
1078 if (likely(!ctx->nr_events)) 1129 if (likely(!ctx->nr_events))
1079 goto out; 1130 goto out;
1080 update_context_time(ctx); 1131 update_context_time(ctx);
1081 1132
1082 perf_disable();
1083 if (!ctx->nr_active) 1133 if (!ctx->nr_active)
1084 goto out_enable; 1134 goto out;
1085 1135
1086 if (event_type & EVENT_PINNED) 1136 if (event_type & EVENT_PINNED) {
1087 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1137 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1088 group_sched_out(event, cpuctx, ctx); 1138 group_sched_out(event, cpuctx, ctx);
1139 }
1089 1140
1090 if (event_type & EVENT_FLEXIBLE) 1141 if (event_type & EVENT_FLEXIBLE) {
1091 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1142 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1092 group_sched_out(event, cpuctx, ctx); 1143 group_sched_out(event, cpuctx, ctx);
1093 1144 }
1094 out_enable: 1145out:
1095 perf_enable(); 1146 perf_pmu_enable(ctx->pmu);
1096 out:
1097 raw_spin_unlock(&ctx->lock); 1147 raw_spin_unlock(&ctx->lock);
1098} 1148}
1099 1149
@@ -1191,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1191 } 1241 }
1192} 1242}
1193 1243
1194/* 1244void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1195 * Called from scheduler to remove the events of the current task, 1245 struct task_struct *next)
1196 * with interrupts disabled.
1197 *
1198 * We stop each event and update the event value in event->count.
1199 *
1200 * This does not protect us against NMI, but disable()
1201 * sets the disabled bit in the control field of event _before_
1202 * accessing the event control register. If a NMI hits, then it will
1203 * not restart the event.
1204 */
1205void perf_event_task_sched_out(struct task_struct *task,
1206 struct task_struct *next)
1207{ 1246{
1208 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1247 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1209 struct perf_event_context *ctx = task->perf_event_ctxp;
1210 struct perf_event_context *next_ctx; 1248 struct perf_event_context *next_ctx;
1211 struct perf_event_context *parent; 1249 struct perf_event_context *parent;
1250 struct perf_cpu_context *cpuctx;
1212 int do_switch = 1; 1251 int do_switch = 1;
1213 1252
1214 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1253 if (likely(!ctx))
1254 return;
1215 1255
1216 if (likely(!ctx || !cpuctx->task_ctx)) 1256 cpuctx = __get_cpu_context(ctx);
1257 if (!cpuctx->task_ctx)
1217 return; 1258 return;
1218 1259
1219 rcu_read_lock(); 1260 rcu_read_lock();
1220 parent = rcu_dereference(ctx->parent_ctx); 1261 parent = rcu_dereference(ctx->parent_ctx);
1221 next_ctx = next->perf_event_ctxp; 1262 next_ctx = next->perf_event_ctxp[ctxn];
1222 if (parent && next_ctx && 1263 if (parent && next_ctx &&
1223 rcu_dereference(next_ctx->parent_ctx) == parent) { 1264 rcu_dereference(next_ctx->parent_ctx) == parent) {
1224 /* 1265 /*
@@ -1237,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1237 * XXX do we need a memory barrier of sorts 1278 * XXX do we need a memory barrier of sorts
1238 * wrt to rcu_dereference() of perf_event_ctxp 1279 * wrt to rcu_dereference() of perf_event_ctxp
1239 */ 1280 */
1240 task->perf_event_ctxp = next_ctx; 1281 task->perf_event_ctxp[ctxn] = next_ctx;
1241 next->perf_event_ctxp = ctx; 1282 next->perf_event_ctxp[ctxn] = ctx;
1242 ctx->task = next; 1283 ctx->task = next;
1243 next_ctx->task = task; 1284 next_ctx->task = task;
1244 do_switch = 0; 1285 do_switch = 0;
@@ -1256,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task,
1256 } 1297 }
1257} 1298}
1258 1299
1300#define for_each_task_context_nr(ctxn) \
1301 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1302
1303/*
1304 * Called from scheduler to remove the events of the current task,
1305 * with interrupts disabled.
1306 *
1307 * We stop each event and update the event value in event->count.
1308 *
1309 * This does not protect us against NMI, but disable()
1310 * sets the disabled bit in the control field of event _before_
1311 * accessing the event control register. If a NMI hits, then it will
1312 * not restart the event.
1313 */
1314void __perf_event_task_sched_out(struct task_struct *task,
1315 struct task_struct *next)
1316{
1317 int ctxn;
1318
1319 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1320
1321 for_each_task_context_nr(ctxn)
1322 perf_event_context_sched_out(task, ctxn, next);
1323}
1324
1259static void task_ctx_sched_out(struct perf_event_context *ctx, 1325static void task_ctx_sched_out(struct perf_event_context *ctx,
1260 enum event_type_t event_type) 1326 enum event_type_t event_type)
1261{ 1327{
1262 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1328 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1263 1329
1264 if (!cpuctx->task_ctx) 1330 if (!cpuctx->task_ctx)
1265 return; 1331 return;
@@ -1274,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1274/* 1340/*
1275 * Called with IRQs disabled 1341 * Called with IRQs disabled
1276 */ 1342 */
1277static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1278{
1279 task_ctx_sched_out(ctx, EVENT_ALL);
1280}
1281
1282/*
1283 * Called with IRQs disabled
1284 */
1285static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1343static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1286 enum event_type_t event_type) 1344 enum event_type_t event_type)
1287{ 1345{
@@ -1332,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1332 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1390 if (event->cpu != -1 && event->cpu != smp_processor_id())
1333 continue; 1391 continue;
1334 1392
1335 if (group_can_go_on(event, cpuctx, can_add_hw)) 1393 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1336 if (group_sched_in(event, cpuctx, ctx)) 1394 if (group_sched_in(event, cpuctx, ctx))
1337 can_add_hw = 0; 1395 can_add_hw = 0;
1396 }
1338 } 1397 }
1339} 1398}
1340 1399
@@ -1350,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1350 1409
1351 ctx->timestamp = perf_clock(); 1410 ctx->timestamp = perf_clock();
1352 1411
1353 perf_disable();
1354
1355 /* 1412 /*
1356 * First go through the list and put on any pinned groups 1413 * First go through the list and put on any pinned groups
1357 * in order to give them the best chance of going on. 1414 * in order to give them the best chance of going on.
@@ -1363,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx,
1363 if (event_type & EVENT_FLEXIBLE) 1420 if (event_type & EVENT_FLEXIBLE)
1364 ctx_flexible_sched_in(ctx, cpuctx); 1421 ctx_flexible_sched_in(ctx, cpuctx);
1365 1422
1366 perf_enable(); 1423out:
1367 out:
1368 raw_spin_unlock(&ctx->lock); 1424 raw_spin_unlock(&ctx->lock);
1369} 1425}
1370 1426
@@ -1376,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1376 ctx_sched_in(ctx, cpuctx, event_type); 1432 ctx_sched_in(ctx, cpuctx, event_type);
1377} 1433}
1378 1434
1379static void task_ctx_sched_in(struct task_struct *task, 1435static void task_ctx_sched_in(struct perf_event_context *ctx,
1380 enum event_type_t event_type) 1436 enum event_type_t event_type)
1381{ 1437{
1382 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1438 struct perf_cpu_context *cpuctx;
1383 struct perf_event_context *ctx = task->perf_event_ctxp;
1384 1439
1385 if (likely(!ctx)) 1440 cpuctx = __get_cpu_context(ctx);
1386 return;
1387 if (cpuctx->task_ctx == ctx) 1441 if (cpuctx->task_ctx == ctx)
1388 return; 1442 return;
1443
1389 ctx_sched_in(ctx, cpuctx, event_type); 1444 ctx_sched_in(ctx, cpuctx, event_type);
1390 cpuctx->task_ctx = ctx; 1445 cpuctx->task_ctx = ctx;
1391} 1446}
1392/*
1393 * Called from scheduler to add the events of the current task
1394 * with interrupts disabled.
1395 *
1396 * We restore the event value and then enable it.
1397 *
1398 * This does not protect us against NMI, but enable()
1399 * sets the enabled bit in the control field of event _before_
1400 * accessing the event control register. If a NMI hits, then it will
1401 * keep the event running.
1402 */
1403void perf_event_task_sched_in(struct task_struct *task)
1404{
1405 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1406 struct perf_event_context *ctx = task->perf_event_ctxp;
1407 1447
1408 if (likely(!ctx)) 1448void perf_event_context_sched_in(struct perf_event_context *ctx)
1409 return; 1449{
1450 struct perf_cpu_context *cpuctx;
1410 1451
1452 cpuctx = __get_cpu_context(ctx);
1411 if (cpuctx->task_ctx == ctx) 1453 if (cpuctx->task_ctx == ctx)
1412 return; 1454 return;
1413 1455
1414 perf_disable(); 1456 perf_pmu_disable(ctx->pmu);
1415
1416 /* 1457 /*
1417 * We want to keep the following priority order: 1458 * We want to keep the following priority order:
1418 * cpu pinned (that don't need to move), task pinned, 1459 * cpu pinned (that don't need to move), task pinned,
@@ -1426,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task)
1426 1467
1427 cpuctx->task_ctx = ctx; 1468 cpuctx->task_ctx = ctx;
1428 1469
1429 perf_enable(); 1470 /*
1471 * Since these rotations are per-cpu, we need to ensure the
1472 * cpu-context we got scheduled on is actually rotating.
1473 */
1474 perf_pmu_rotate_start(ctx->pmu);
1475 perf_pmu_enable(ctx->pmu);
1476}
1477
1478/*
1479 * Called from scheduler to add the events of the current task
1480 * with interrupts disabled.
1481 *
1482 * We restore the event value and then enable it.
1483 *
1484 * This does not protect us against NMI, but enable()
1485 * sets the enabled bit in the control field of event _before_
1486 * accessing the event control register. If a NMI hits, then it will
1487 * keep the event running.
1488 */
1489void __perf_event_task_sched_in(struct task_struct *task)
1490{
1491 struct perf_event_context *ctx;
1492 int ctxn;
1493
1494 for_each_task_context_nr(ctxn) {
1495 ctx = task->perf_event_ctxp[ctxn];
1496 if (likely(!ctx))
1497 continue;
1498
1499 perf_event_context_sched_in(ctx);
1500 }
1430} 1501}
1431 1502
1432#define MAX_INTERRUPTS (~0ULL) 1503#define MAX_INTERRUPTS (~0ULL)
@@ -1506,22 +1577,6 @@ do { \
1506 return div64_u64(dividend, divisor); 1577 return div64_u64(dividend, divisor);
1507} 1578}
1508 1579
1509static void perf_event_stop(struct perf_event *event)
1510{
1511 if (!event->pmu->stop)
1512 return event->pmu->disable(event);
1513
1514 return event->pmu->stop(event);
1515}
1516
1517static int perf_event_start(struct perf_event *event)
1518{
1519 if (!event->pmu->start)
1520 return event->pmu->enable(event);
1521
1522 return event->pmu->start(event);
1523}
1524
1525static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1580static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1526{ 1581{
1527 struct hw_perf_event *hwc = &event->hw; 1582 struct hw_perf_event *hwc = &event->hw;
@@ -1541,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1541 hwc->sample_period = sample_period; 1596 hwc->sample_period = sample_period;
1542 1597
1543 if (local64_read(&hwc->period_left) > 8*sample_period) { 1598 if (local64_read(&hwc->period_left) > 8*sample_period) {
1544 perf_disable(); 1599 event->pmu->stop(event, PERF_EF_UPDATE);
1545 perf_event_stop(event);
1546 local64_set(&hwc->period_left, 0); 1600 local64_set(&hwc->period_left, 0);
1547 perf_event_start(event); 1601 event->pmu->start(event, PERF_EF_RELOAD);
1548 perf_enable();
1549 } 1602 }
1550} 1603}
1551 1604
1552static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1605static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1553{ 1606{
1554 struct perf_event *event; 1607 struct perf_event *event;
1555 struct hw_perf_event *hwc; 1608 struct hw_perf_event *hwc;
@@ -1574,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1574 */ 1627 */
1575 if (interrupts == MAX_INTERRUPTS) { 1628 if (interrupts == MAX_INTERRUPTS) {
1576 perf_log_throttle(event, 1); 1629 perf_log_throttle(event, 1);
1577 perf_disable(); 1630 event->pmu->start(event, 0);
1578 event->pmu->unthrottle(event);
1579 perf_enable();
1580 } 1631 }
1581 1632
1582 if (!event->attr.freq || !event->attr.sample_freq) 1633 if (!event->attr.freq || !event->attr.sample_freq)
1583 continue; 1634 continue;
1584 1635
1585 perf_disable();
1586 event->pmu->read(event); 1636 event->pmu->read(event);
1587 now = local64_read(&event->count); 1637 now = local64_read(&event->count);
1588 delta = now - hwc->freq_count_stamp; 1638 delta = now - hwc->freq_count_stamp;
1589 hwc->freq_count_stamp = now; 1639 hwc->freq_count_stamp = now;
1590 1640
1591 if (delta > 0) 1641 if (delta > 0)
1592 perf_adjust_period(event, TICK_NSEC, delta); 1642 perf_adjust_period(event, period, delta);
1593 perf_enable();
1594 } 1643 }
1595 raw_spin_unlock(&ctx->lock); 1644 raw_spin_unlock(&ctx->lock);
1596} 1645}
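perf_ctx_adjust_freq() now receives the elapsed period explicitly, and perf_adjust_period() restarts the event through pmu->stop(PERF_EF_UPDATE)/pmu->start(PERF_EF_RELOAD) when period_left has drifted far from the new sample period. The essence of frequency mode is the arithmetic: from the counts observed over a known window, pick the period that would have produced attr.sample_freq samples per second. A back-of-the-envelope version of that computation (the kernel's perf_calculate_period() is more careful about overflow and rounding):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/*
 * Given 'count' raw events observed over 'nsec' nanoseconds, return the
 * sample period that would yield roughly 'freq' samples per second.
 */
static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
    if (!nsec || !freq)
        return 0;
    /* events per second divided by desired samples per second */
    return (count * NSEC_PER_SEC) / (nsec * freq);
}

int main(void)
{
    /* 50M events over 10 ms at a requested 1000 Hz -> sample every 5000000 */
    printf("period = %llu\n",
           (unsigned long long)calc_period(50000000, 10000000, 1000));
    return 0;
}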
@@ -1608,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
1608 raw_spin_unlock(&ctx->lock); 1657 raw_spin_unlock(&ctx->lock);
1609} 1658}
1610 1659
1611void perf_event_task_tick(struct task_struct *curr) 1660/*
1661 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1662 * because they're strictly cpu affine and rotate_start is called with IRQs
1663 * disabled, while rotate_context is called from IRQ context.
1664 */
1665static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1612{ 1666{
1613 struct perf_cpu_context *cpuctx; 1667 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1614 struct perf_event_context *ctx; 1668 struct perf_event_context *ctx = NULL;
1615 int rotate = 0; 1669 int rotate = 0, remove = 1;
1616
1617 if (!atomic_read(&nr_events))
1618 return;
1619 1670
1620 cpuctx = &__get_cpu_var(perf_cpu_context); 1671 if (cpuctx->ctx.nr_events) {
1621 if (cpuctx->ctx.nr_events && 1672 remove = 0;
1622 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 1673 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1623 rotate = 1; 1674 rotate = 1;
1675 }
1624 1676
1625 ctx = curr->perf_event_ctxp; 1677 ctx = cpuctx->task_ctx;
1626 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1678 if (ctx && ctx->nr_events) {
1627 rotate = 1; 1679 remove = 0;
1680 if (ctx->nr_events != ctx->nr_active)
1681 rotate = 1;
1682 }
1628 1683
1629 perf_ctx_adjust_freq(&cpuctx->ctx); 1684 perf_pmu_disable(cpuctx->ctx.pmu);
1685 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1630 if (ctx) 1686 if (ctx)
1631 perf_ctx_adjust_freq(ctx); 1687 perf_ctx_adjust_freq(ctx, interval);
1632 1688
1633 if (!rotate) 1689 if (!rotate)
1634 return; 1690 goto done;
1635 1691
1636 perf_disable();
1637 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1692 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1638 if (ctx) 1693 if (ctx)
1639 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1694 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1644,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr)
1644 1699
1645 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1700 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1646 if (ctx) 1701 if (ctx)
1647 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1702 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1648 perf_enable(); 1703
1704done:
1705 if (remove)
1706 list_del_init(&cpuctx->rotation_list);
1707
1708 perf_pmu_enable(cpuctx->ctx.pmu);
1709}
1710
1711void perf_event_task_tick(void)
1712{
1713 struct list_head *head = &__get_cpu_var(rotation_list);
1714 struct perf_cpu_context *cpuctx, *tmp;
1715
1716 WARN_ON(!irqs_disabled());
1717
1718 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1719 if (cpuctx->jiffies_interval == 1 ||
1720 !(jiffies % cpuctx->jiffies_interval))
1721 perf_rotate_context(cpuctx);
1722 }
1649} 1723}
1650 1724
1651static int event_enable_on_exec(struct perf_event *event, 1725static int event_enable_on_exec(struct perf_event *event,
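perf_event_task_tick() is now driven by the per-CPU rotation_list set up by perf_pmu_rotate_start(): only cpu contexts that actually have events are walked, and each is rotated only every cpuctx->jiffies_interval ticks. The gating test is a plain modulo on the tick counter, for example (made-up interval values):

#include <stdio.h>

/* Rotate contexts with different intervals off one shared tick counter. */
static int should_rotate(unsigned long ticks, unsigned int interval)
{
    /* interval == 1 means "every tick"; otherwise every Nth tick */
    return interval == 1 || (ticks % interval) == 0;
}

int main(void)
{
    unsigned long ticks;

    for (ticks = 1; ticks <= 8; ticks++)
        printf("tick %lu: every-tick=%d every-4th=%d\n",
               ticks, should_rotate(ticks, 1), should_rotate(ticks, 4));
    return 0;
}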
@@ -1667,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event,
1667 * Enable all of a task's events that have been marked enable-on-exec. 1741 * Enable all of a task's events that have been marked enable-on-exec.
1668 * This expects task == current. 1742 * This expects task == current.
1669 */ 1743 */
1670static void perf_event_enable_on_exec(struct task_struct *task) 1744static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1671{ 1745{
1672 struct perf_event_context *ctx;
1673 struct perf_event *event; 1746 struct perf_event *event;
1674 unsigned long flags; 1747 unsigned long flags;
1675 int enabled = 0; 1748 int enabled = 0;
1676 int ret; 1749 int ret;
1677 1750
1678 local_irq_save(flags); 1751 local_irq_save(flags);
1679 ctx = task->perf_event_ctxp;
1680 if (!ctx || !ctx->nr_events) 1752 if (!ctx || !ctx->nr_events)
1681 goto out; 1753 goto out;
1682 1754
1683 __perf_event_task_sched_out(ctx); 1755 task_ctx_sched_out(ctx, EVENT_ALL);
1684 1756
1685 raw_spin_lock(&ctx->lock); 1757 raw_spin_lock(&ctx->lock);
1686 1758
@@ -1704,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1704 1776
1705 raw_spin_unlock(&ctx->lock); 1777 raw_spin_unlock(&ctx->lock);
1706 1778
1707 perf_event_task_sched_in(task); 1779 perf_event_context_sched_in(ctx);
1708 out: 1780out:
1709 local_irq_restore(flags); 1781 local_irq_restore(flags);
1710} 1782}
1711 1783
@@ -1714,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1714 */ 1786 */
1715static void __perf_event_read(void *info) 1787static void __perf_event_read(void *info)
1716{ 1788{
1717 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1718 struct perf_event *event = info; 1789 struct perf_event *event = info;
1719 struct perf_event_context *ctx = event->ctx; 1790 struct perf_event_context *ctx = event->ctx;
1791 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1720 1792
1721 /* 1793 /*
1722 * If this is a task context, we need to check whether it is 1794 * If this is a task context, we need to check whether it is
@@ -1755,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event)
1755 unsigned long flags; 1827 unsigned long flags;
1756 1828
1757 raw_spin_lock_irqsave(&ctx->lock, flags); 1829 raw_spin_lock_irqsave(&ctx->lock, flags);
1758 update_context_time(ctx); 1830 /*
1831 * may read while context is not active
1832 * (e.g., thread is blocked), in that case
1833 * we cannot update context time
1834 */
1835 if (ctx->is_active)
1836 update_context_time(ctx);
1759 update_event_times(event); 1837 update_event_times(event);
1760 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1838 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1761 } 1839 }
@@ -1764,11 +1842,219 @@ static u64 perf_event_read(struct perf_event *event)
1764} 1842}
1765 1843
1766/* 1844/*
1767 * Initialize the perf_event context in a task_struct: 1845 * Callchain support
1768 */ 1846 */
1847
1848struct callchain_cpus_entries {
1849 struct rcu_head rcu_head;
1850 struct perf_callchain_entry *cpu_entries[0];
1851};
1852
1853static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1854static atomic_t nr_callchain_events;
1855static DEFINE_MUTEX(callchain_mutex);
1856struct callchain_cpus_entries *callchain_cpus_entries;
1857
1858
1859__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1860 struct pt_regs *regs)
1861{
1862}
1863
1864__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1865 struct pt_regs *regs)
1866{
1867}
1868
1869static void release_callchain_buffers_rcu(struct rcu_head *head)
1870{
1871 struct callchain_cpus_entries *entries;
1872 int cpu;
1873
1874 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1875
1876 for_each_possible_cpu(cpu)
1877 kfree(entries->cpu_entries[cpu]);
1878
1879 kfree(entries);
1880}
1881
1882static void release_callchain_buffers(void)
1883{
1884 struct callchain_cpus_entries *entries;
1885
1886 entries = callchain_cpus_entries;
1887 rcu_assign_pointer(callchain_cpus_entries, NULL);
1888 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1889}
1890
1891static int alloc_callchain_buffers(void)
1892{
1893 int cpu;
1894 int size;
1895 struct callchain_cpus_entries *entries;
1896
1897 /*
1898 * We can't use the percpu allocation API for data that can be
1899 * accessed from NMI. Use a temporary manual per cpu allocation
1900 * until that gets sorted out.
1901 */
1902 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1903 num_possible_cpus();
1904
1905 entries = kzalloc(size, GFP_KERNEL);
1906 if (!entries)
1907 return -ENOMEM;
1908
1909 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1910
1911 for_each_possible_cpu(cpu) {
1912 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1913 cpu_to_node(cpu));
1914 if (!entries->cpu_entries[cpu])
1915 goto fail;
1916 }
1917
1918 rcu_assign_pointer(callchain_cpus_entries, entries);
1919
1920 return 0;
1921
1922fail:
1923 for_each_possible_cpu(cpu)
1924 kfree(entries->cpu_entries[cpu]);
1925 kfree(entries);
1926
1927 return -ENOMEM;
1928}
1929
1930static int get_callchain_buffers(void)
1931{
1932 int err = 0;
1933 int count;
1934
1935 mutex_lock(&callchain_mutex);
1936
1937 count = atomic_inc_return(&nr_callchain_events);
1938 if (WARN_ON_ONCE(count < 1)) {
1939 err = -EINVAL;
1940 goto exit;
1941 }
1942
1943 if (count > 1) {
1944 /* If the allocation failed, give up */
1945 if (!callchain_cpus_entries)
1946 err = -ENOMEM;
1947 goto exit;
1948 }
1949
1950 err = alloc_callchain_buffers();
1951 if (err)
1952 release_callchain_buffers();
1953exit:
1954 mutex_unlock(&callchain_mutex);
1955
1956 return err;
1957}
1958
1959static void put_callchain_buffers(void)
1960{
1961 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1962 release_callchain_buffers();
1963 mutex_unlock(&callchain_mutex);
1964 }
1965}
1966
1967static int get_recursion_context(int *recursion)
1968{
1969 int rctx;
1970
1971 if (in_nmi())
1972 rctx = 3;
1973 else if (in_irq())
1974 rctx = 2;
1975 else if (in_softirq())
1976 rctx = 1;
1977 else
1978 rctx = 0;
1979
1980 if (recursion[rctx])
1981 return -1;
1982
1983 recursion[rctx]++;
1984 barrier();
1985
1986 return rctx;
1987}
1988
1989static inline void put_recursion_context(int *recursion, int rctx)
1990{
1991 barrier();
1992 recursion[rctx]--;
1993}
1994
1995static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1996{
1997 int cpu;
1998 struct callchain_cpus_entries *entries;
1999
2000 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2001 if (*rctx == -1)
2002 return NULL;
2003
2004 entries = rcu_dereference(callchain_cpus_entries);
2005 if (!entries)
2006 return NULL;
2007
2008 cpu = smp_processor_id();
2009
2010 return &entries->cpu_entries[cpu][*rctx];
2011}
2012
1769static void 2013static void
1770__perf_event_init_context(struct perf_event_context *ctx, 2014put_callchain_entry(int rctx)
1771 struct task_struct *task) 2015{
2016 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2017}
2018
2019static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2020{
2021 int rctx;
2022 struct perf_callchain_entry *entry;
2023
2024
2025 entry = get_callchain_entry(&rctx);
2026 if (rctx == -1)
2027 return NULL;
2028
2029 if (!entry)
2030 goto exit_put;
2031
2032 entry->nr = 0;
2033
2034 if (!user_mode(regs)) {
2035 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2036 perf_callchain_kernel(entry, regs);
2037 if (current->mm)
2038 regs = task_pt_regs(current);
2039 else
2040 regs = NULL;
2041 }
2042
2043 if (regs) {
2044 perf_callchain_store(entry, PERF_CONTEXT_USER);
2045 perf_callchain_user(entry, regs);
2046 }
2047
2048exit_put:
2049 put_callchain_entry(rctx);
2050
2051 return entry;
2052}
2053
2054/*
2055 * Initialize the perf_event context in a task_struct:
2056 */
2057static void __perf_event_init_context(struct perf_event_context *ctx)
1772{ 2058{
1773 raw_spin_lock_init(&ctx->lock); 2059 raw_spin_lock_init(&ctx->lock);
1774 mutex_init(&ctx->mutex); 2060 mutex_init(&ctx->mutex);
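The callchain support added above keeps one perf_callchain_entry per CPU for each of the four execution levels (task, softirq, hardirq, NMI) and uses get_recursion_context() to refuse re-entry at the same level, so an NMI landing in the middle of an interrupt's callchain walk still gets its own buffer. The guard reduces to a per-level flag; a sketch (the kernel picks the level with in_nmi()/in_irq()/in_softirq() and keeps the flags per CPU):

#include <stdio.h>

enum { CTX_TASK, CTX_SOFTIRQ, CTX_HARDIRQ, CTX_NMI, NR_CONTEXTS };

/* One recursion flag per context level (per CPU in the kernel). */
static int recursion[NR_CONTEXTS];

/* Returns the context index, or -1 if this level is already in use. */
static int get_recursion_context(int level)
{
    if (recursion[level])
        return -1;              /* nested hit at the same level */
    recursion[level]++;
    return level;
}

static void put_recursion_context(int level)
{
    recursion[level]--;
}

int main(void)
{
    int rctx = get_recursion_context(CTX_HARDIRQ);

    printf("first use: %d\n", rctx);                                /*  2 */
    printf("nested use: %d\n", get_recursion_context(CTX_HARDIRQ)); /* -1 */
    put_recursion_context(rctx);
    printf("after put: %d\n", get_recursion_context(CTX_HARDIRQ));  /*  2 */
    return 0;
}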
@@ -1776,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1776 INIT_LIST_HEAD(&ctx->flexible_groups); 2062 INIT_LIST_HEAD(&ctx->flexible_groups);
1777 INIT_LIST_HEAD(&ctx->event_list); 2063 INIT_LIST_HEAD(&ctx->event_list);
1778 atomic_set(&ctx->refcount, 1); 2064 atomic_set(&ctx->refcount, 1);
1779 ctx->task = task;
1780} 2065}
1781 2066
1782static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2067static struct perf_event_context *
2068alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1783{ 2069{
1784 struct perf_event_context *ctx; 2070 struct perf_event_context *ctx;
1785 struct perf_cpu_context *cpuctx;
1786 struct task_struct *task;
1787 unsigned long flags;
1788 int err;
1789 2071
1790 if (pid == -1 && cpu != -1) { 2072 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1791 /* Must be root to operate on a CPU event: */ 2073 if (!ctx)
1792 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2074 return NULL;
1793 return ERR_PTR(-EACCES);
1794
1795 if (cpu < 0 || cpu >= nr_cpumask_bits)
1796 return ERR_PTR(-EINVAL);
1797 2075
1798 /* 2076 __perf_event_init_context(ctx);
1799 * We could be clever and allow to attach a event to an 2077 if (task) {
1800 * offline CPU and activate it when the CPU comes up, but 2078 ctx->task = task;
1801 * that's for later. 2079 get_task_struct(task);
1802 */ 2080 }
1803 if (!cpu_online(cpu)) 2081 ctx->pmu = pmu;
1804 return ERR_PTR(-ENODEV);
1805 2082
1806 cpuctx = &per_cpu(perf_cpu_context, cpu); 2083 return ctx;
1807 ctx = &cpuctx->ctx; 2084}
1808 get_ctx(ctx);
1809 2085
1810 return ctx; 2086static struct task_struct *
1811 } 2087find_lively_task_by_vpid(pid_t vpid)
2088{
2089 struct task_struct *task;
2090 int err;
1812 2091
1813 rcu_read_lock(); 2092 rcu_read_lock();
1814 if (!pid) 2093 if (!vpid)
1815 task = current; 2094 task = current;
1816 else 2095 else
1817 task = find_task_by_vpid(pid); 2096 task = find_task_by_vpid(vpid);
1818 if (task) 2097 if (task)
1819 get_task_struct(task); 2098 get_task_struct(task);
1820 rcu_read_unlock(); 2099 rcu_read_unlock();
@@ -1834,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1834 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2113 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1835 goto errout; 2114 goto errout;
1836 2115
1837 retry: 2116 return task;
1838 ctx = perf_lock_task_context(task, &flags); 2117errout:
2118 put_task_struct(task);
2119 return ERR_PTR(err);
2120
2121}
2122
2123static struct perf_event_context *
2124find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2125{
2126 struct perf_event_context *ctx;
2127 struct perf_cpu_context *cpuctx;
2128 unsigned long flags;
2129 int ctxn, err;
2130
2131 if (!task && cpu != -1) {
2132 /* Must be root to operate on a CPU event: */
2133 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2134 return ERR_PTR(-EACCES);
2135
2136 if (cpu < 0 || cpu >= nr_cpumask_bits)
2137 return ERR_PTR(-EINVAL);
2138
2139 /*
2140 * We could be clever and allow to attach a event to an
2141 * offline CPU and activate it when the CPU comes up, but
2142 * that's for later.
2143 */
2144 if (!cpu_online(cpu))
2145 return ERR_PTR(-ENODEV);
2146
2147 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2148 ctx = &cpuctx->ctx;
2149 get_ctx(ctx);
2150
2151 return ctx;
2152 }
2153
2154 err = -EINVAL;
2155 ctxn = pmu->task_ctx_nr;
2156 if (ctxn < 0)
2157 goto errout;
2158
2159retry:
2160 ctx = perf_lock_task_context(task, ctxn, &flags);
1839 if (ctx) { 2161 if (ctx) {
1840 unclone_ctx(ctx); 2162 unclone_ctx(ctx);
1841 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2163 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1842 } 2164 }
1843 2165
1844 if (!ctx) { 2166 if (!ctx) {
1845 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2167 ctx = alloc_perf_context(pmu, task);
1846 err = -ENOMEM; 2168 err = -ENOMEM;
1847 if (!ctx) 2169 if (!ctx)
1848 goto errout; 2170 goto errout;
1849 __perf_event_init_context(ctx, task); 2171
1850 get_ctx(ctx); 2172 get_ctx(ctx);
1851 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2173
2174 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1852 /* 2175 /*
1853 * We raced with some other task; use 2176 * We raced with some other task; use
1854 * the context they set. 2177 * the context they set.
1855 */ 2178 */
2179 put_task_struct(task);
1856 kfree(ctx); 2180 kfree(ctx);
1857 goto retry; 2181 goto retry;
1858 } 2182 }
1859 get_task_struct(task);
1860 } 2183 }
1861 2184
1862 put_task_struct(task);
1863 return ctx; 2185 return ctx;
1864 2186
1865 errout: 2187errout:
1866 put_task_struct(task);
1867 return ERR_PTR(err); 2188 return ERR_PTR(err);
1868} 2189}
1869 2190
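find_get_context() installs a freshly allocated context with cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx); losing that race means some other task got there first, so the local allocation is freed and the lookup retried. The same install-once idiom with C11 atomics (illustrative only, without the refcounting and RCU of the real code):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx { int id; };

static _Atomic(struct ctx *) slot;      /* stand-in for perf_event_ctxp[n] */

static struct ctx *get_or_create_ctx(int id)
{
    struct ctx *cur, *fresh;

    cur = atomic_load(&slot);
    if (cur)
        return cur;                     /* already installed */

    fresh = malloc(sizeof(*fresh));
    if (!fresh)
        return NULL;
    fresh->id = id;

    /* Install only if still empty; otherwise someone beat us to it. */
    cur = NULL;
    if (atomic_compare_exchange_strong(&slot, &cur, fresh))
        return fresh;

    free(fresh);                        /* we raced: use theirs instead  */
    return cur;                         /* CAS wrote the winner into cur */
}

int main(void)
{
    struct ctx *a = get_or_create_ctx(1);
    struct ctx *b = get_or_create_ctx(2);

    if (!a || !b)
        return 1;
    printf("same context: %s (id=%d)\n", a == b ? "yes" : "no", a->id);
    return 0;
}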
@@ -1880,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head)
1880 kfree(event); 2201 kfree(event);
1881} 2202}
1882 2203
1883static void perf_pending_sync(struct perf_event *event);
1884static void perf_buffer_put(struct perf_buffer *buffer); 2204static void perf_buffer_put(struct perf_buffer *buffer);
1885 2205
1886static void free_event(struct perf_event *event) 2206static void free_event(struct perf_event *event)
1887{ 2207{
1888 perf_pending_sync(event); 2208 irq_work_sync(&event->pending);
1889 2209
1890 if (!event->parent) { 2210 if (!event->parent) {
1891 atomic_dec(&nr_events); 2211 if (event->attach_state & PERF_ATTACH_TASK)
2212 jump_label_dec(&perf_task_events);
1892 if (event->attr.mmap || event->attr.mmap_data) 2213 if (event->attr.mmap || event->attr.mmap_data)
1893 atomic_dec(&nr_mmap_events); 2214 atomic_dec(&nr_mmap_events);
1894 if (event->attr.comm) 2215 if (event->attr.comm)
1895 atomic_dec(&nr_comm_events); 2216 atomic_dec(&nr_comm_events);
1896 if (event->attr.task) 2217 if (event->attr.task)
1897 atomic_dec(&nr_task_events); 2218 atomic_dec(&nr_task_events);
2219 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2220 put_callchain_buffers();
1898 } 2221 }
1899 2222
1900 if (event->buffer) { 2223 if (event->buffer) {
@@ -1905,7 +2228,9 @@ static void free_event(struct perf_event *event)
1905 if (event->destroy) 2228 if (event->destroy)
1906 event->destroy(event); 2229 event->destroy(event);
1907 2230
1908 put_ctx(event->ctx); 2231 if (event->ctx)
2232 put_ctx(event->ctx);
2233
1909 call_rcu(&event->rcu_head, free_event_rcu); 2234 call_rcu(&event->rcu_head, free_event_rcu);
1910} 2235}
1911 2236
@@ -2184,15 +2509,13 @@ static void perf_event_for_each(struct perf_event *event,
2184static int perf_event_period(struct perf_event *event, u64 __user *arg) 2509static int perf_event_period(struct perf_event *event, u64 __user *arg)
2185{ 2510{
2186 struct perf_event_context *ctx = event->ctx; 2511 struct perf_event_context *ctx = event->ctx;
2187 unsigned long size;
2188 int ret = 0; 2512 int ret = 0;
2189 u64 value; 2513 u64 value;
2190 2514
2191 if (!event->attr.sample_period) 2515 if (!event->attr.sample_period)
2192 return -EINVAL; 2516 return -EINVAL;
2193 2517
2194 size = copy_from_user(&value, arg, sizeof(value)); 2518 if (copy_from_user(&value, arg, sizeof(value)))
2195 if (size != sizeof(value))
2196 return -EFAULT; 2519 return -EFAULT;
2197 2520
2198 if (!value) 2521 if (!value)
@@ -2326,6 +2649,9 @@ int perf_event_task_disable(void)
2326 2649
2327static int perf_event_index(struct perf_event *event) 2650static int perf_event_index(struct perf_event *event)
2328{ 2651{
2652 if (event->hw.state & PERF_HES_STOPPED)
2653 return 0;
2654
2329 if (event->state != PERF_EVENT_STATE_ACTIVE) 2655 if (event->state != PERF_EVENT_STATE_ACTIVE)
2330 return 0; 2656 return 0;
2331 2657
@@ -2829,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event)
2829 } 3155 }
2830} 3156}
2831 3157
2832/* 3158static void perf_pending_event(struct irq_work *entry)
2833 * Pending wakeups
2834 *
2835 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2836 *
2837 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2838 * single linked list and use cmpxchg() to add entries lockless.
2839 */
2840
2841static void perf_pending_event(struct perf_pending_entry *entry)
2842{ 3159{
2843 struct perf_event *event = container_of(entry, 3160 struct perf_event *event = container_of(entry,
2844 struct perf_event, pending); 3161 struct perf_event, pending);
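The "Pending wakeups" comment removed above described a per-CPU, singly linked list that NMI context could push onto with nothing but cmpxchg(); the code deleted in the next hunk implemented it, and the patch replaces the whole mechanism with the generic irq_work_queue()/irq_work_sync() infrastructure, which provides the same lock-free deferral. A user-space sketch of that CAS push and the xchg-style drain (C11 atomics, not the kernel code):

#include <stdatomic.h>
#include <stdio.h>

struct pending_entry {
    struct pending_entry *next;
    const char *name;
};

static _Atomic(struct pending_entry *) pending_head;

/* Push with a CAS loop -- safe even against reentrant (NMI-like) callers. */
static void pending_push(struct pending_entry *e)
{
    struct pending_entry *old = atomic_load(&pending_head);

    do {
        e->next = old;
    } while (!atomic_compare_exchange_weak(&pending_head, &old, e));
}

/* Grab the whole list at once, like the xchg() in the old __perf_pending_run(). */
static struct pending_entry *pending_take_all(void)
{
    return atomic_exchange(&pending_head, NULL);
}

int main(void)
{
    struct pending_entry a = { .name = "a" }, b = { .name = "b" };
    struct pending_entry *e;

    pending_push(&a);
    pending_push(&b);
    for (e = pending_take_all(); e; e = e->next)
        printf("pending: %s\n", e->name);       /* b then a */
    return 0;
}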
@@ -2854,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2854 } 3171 }
2855} 3172}
2856 3173
2857#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2858
2859static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2860 PENDING_TAIL,
2861};
2862
2863static void perf_pending_queue(struct perf_pending_entry *entry,
2864 void (*func)(struct perf_pending_entry *))
2865{
2866 struct perf_pending_entry **head;
2867
2868 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2869 return;
2870
2871 entry->func = func;
2872
2873 head = &get_cpu_var(perf_pending_head);
2874
2875 do {
2876 entry->next = *head;
2877 } while (cmpxchg(head, entry->next, entry) != entry->next);
2878
2879 set_perf_event_pending();
2880
2881 put_cpu_var(perf_pending_head);
2882}
2883
2884static int __perf_pending_run(void)
2885{
2886 struct perf_pending_entry *list;
2887 int nr = 0;
2888
2889 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2890 while (list != PENDING_TAIL) {
2891 void (*func)(struct perf_pending_entry *);
2892 struct perf_pending_entry *entry = list;
2893
2894 list = list->next;
2895
2896 func = entry->func;
2897 entry->next = NULL;
2898 /*
2899 * Ensure we observe the unqueue before we issue the wakeup,
2900 * so that we won't be waiting forever.
2901 * -- see perf_not_pending().
2902 */
2903 smp_wmb();
2904
2905 func(entry);
2906 nr++;
2907 }
2908
2909 return nr;
2910}
2911
2912static inline int perf_not_pending(struct perf_event *event)
2913{
2914 /*
2915 * If we flush on whatever cpu we run, there is a chance we don't
2916 * need to wait.
2917 */
2918 get_cpu();
2919 __perf_pending_run();
2920 put_cpu();
2921
2922 /*
2923 * Ensure we see the proper queue state before going to sleep
2924 * so that we do not miss the wakeup. -- see perf_pending_handle()
2925 */
2926 smp_rmb();
2927 return event->pending.next == NULL;
2928}
2929
2930static void perf_pending_sync(struct perf_event *event)
2931{
2932 wait_event(event->waitq, perf_not_pending(event));
2933}
2934
2935void perf_event_do_pending(void)
2936{
2937 __perf_pending_run();
2938}
2939
2940/*
2941 * Callchain support -- arch specific
2942 */
2943
2944__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2945{
2946 return NULL;
2947}
2948
2949
2950/* 3174/*
2951 * We assume there is only KVM supporting the callbacks. 3175 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is 3176 * Later on, we might change it to a list if there is
@@ -2996,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2996 3220
2997 if (handle->nmi) { 3221 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 3222 handle->event->pending_wakeup = 1;
2999 perf_pending_queue(&handle->event->pending, 3223 irq_work_queue(&handle->event->pending);
3000 perf_pending_event);
3001 } else 3224 } else
3002 perf_event_wakeup(handle->event); 3225 perf_event_wakeup(handle->event);
3003} 3226}
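The removed block above was a hand-rolled, lockless pending list (a cmpxchg()-built single-linked list drained via set_perf_event_pending()); the new code delegates that job to the generic irq_work facility: the event embeds a struct irq_work, NMI-safe paths call irq_work_queue(), and the callback runs shortly afterwards from a safe interrupt context. A hedged sketch of the same pattern with hypothetical names (my_event, my_event_wakeup):

#include <linux/irq_work.h>
#include <linux/kernel.h>

struct my_event {				/* hypothetical structure */
	struct irq_work	pending;
	int		woken;
};

static struct my_event sample_event;

/* Runs from a safe (non-NMI) interrupt context shortly after queueing. */
static void my_event_wakeup(struct irq_work *work)
{
	struct my_event *e = container_of(work, struct my_event, pending);

	e->woken = 1;
}

static void my_event_setup(void)
{
	init_irq_work(&sample_event.pending, my_event_wakeup);
}

/*
 * Safe to call from NMI context: no locks are taken, matching the
 * guarantee the removed cmpxchg() list used to provide.
 */
static void my_event_raise(void)
{
	irq_work_queue(&sample_event.pending);
}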
@@ -3053,7 +3276,7 @@ again:
3053 if (handle->wakeup != local_read(&buffer->wakeup)) 3276 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3277 perf_output_wakeup(handle);
3055 3278
3056 out: 3279out:
3057 preempt_enable(); 3280 preempt_enable();
3058} 3281}
3059 3282
@@ -3441,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3441 struct perf_output_handle handle; 3664 struct perf_output_handle handle;
3442 struct perf_event_header header; 3665 struct perf_event_header header;
3443 3666
3667 /* protect the callchain buffers */
3668 rcu_read_lock();
3669
3444 perf_prepare_sample(&header, data, event, regs); 3670 perf_prepare_sample(&header, data, event, regs);
3445 3671
3446 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3672 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3447 return; 3673 goto exit;
3448 3674
3449 perf_output_sample(&handle, &header, data, event); 3675 perf_output_sample(&handle, &header, data, event);
3450 3676
3451 perf_output_end(&handle); 3677 perf_output_end(&handle);
3678
3679exit:
3680 rcu_read_unlock();
3452} 3681}
3453 3682
3454/* 3683/*
@@ -3562,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3562static void perf_event_task_event(struct perf_task_event *task_event) 3791static void perf_event_task_event(struct perf_task_event *task_event)
3563{ 3792{
3564 struct perf_cpu_context *cpuctx; 3793 struct perf_cpu_context *cpuctx;
3565 struct perf_event_context *ctx = task_event->task_ctx; 3794 struct perf_event_context *ctx;
3795 struct pmu *pmu;
3796 int ctxn;
3566 3797
3567 rcu_read_lock(); 3798 rcu_read_lock();
3568 cpuctx = &get_cpu_var(perf_cpu_context); 3799 list_for_each_entry_rcu(pmu, &pmus, entry) {
3569 perf_event_task_ctx(&cpuctx->ctx, task_event); 3800 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3570 if (!ctx) 3801 perf_event_task_ctx(&cpuctx->ctx, task_event);
3571 ctx = rcu_dereference(current->perf_event_ctxp); 3802
3572 if (ctx) 3803 ctx = task_event->task_ctx;
3573 perf_event_task_ctx(ctx, task_event); 3804 if (!ctx) {
3574 put_cpu_var(perf_cpu_context); 3805 ctxn = pmu->task_ctx_nr;
3806 if (ctxn < 0)
3807 goto next;
3808 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3809 }
3810 if (ctx)
3811 perf_event_task_ctx(ctx, task_event);
3812next:
3813 put_cpu_ptr(pmu->pmu_cpu_context);
3814 }
3575 rcu_read_unlock(); 3815 rcu_read_unlock();
3576} 3816}
3577 3817
@@ -3676,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3676{ 3916{
3677 struct perf_cpu_context *cpuctx; 3917 struct perf_cpu_context *cpuctx;
3678 struct perf_event_context *ctx; 3918 struct perf_event_context *ctx;
3679 unsigned int size;
3680 char comm[TASK_COMM_LEN]; 3919 char comm[TASK_COMM_LEN];
3920 unsigned int size;
3921 struct pmu *pmu;
3922 int ctxn;
3681 3923
3682 memset(comm, 0, sizeof(comm)); 3924 memset(comm, 0, sizeof(comm));
3683 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3925 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3689,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3689 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3931 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3690 3932
3691 rcu_read_lock(); 3933 rcu_read_lock();
3692 cpuctx = &get_cpu_var(perf_cpu_context); 3934 list_for_each_entry_rcu(pmu, &pmus, entry) {
3693 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3935 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3694 ctx = rcu_dereference(current->perf_event_ctxp); 3936 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3695 if (ctx) 3937
3696 perf_event_comm_ctx(ctx, comm_event); 3938 ctxn = pmu->task_ctx_nr;
3697 put_cpu_var(perf_cpu_context); 3939 if (ctxn < 0)
3940 goto next;
3941
3942 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3943 if (ctx)
3944 perf_event_comm_ctx(ctx, comm_event);
3945next:
3946 put_cpu_ptr(pmu->pmu_cpu_context);
3947 }
3698 rcu_read_unlock(); 3948 rcu_read_unlock();
3699} 3949}
3700 3950
3701void perf_event_comm(struct task_struct *task) 3951void perf_event_comm(struct task_struct *task)
3702{ 3952{
3703 struct perf_comm_event comm_event; 3953 struct perf_comm_event comm_event;
3954 struct perf_event_context *ctx;
3955 int ctxn;
3704 3956
3705 if (task->perf_event_ctxp) 3957 for_each_task_context_nr(ctxn) {
3706 perf_event_enable_on_exec(task); 3958 ctx = task->perf_event_ctxp[ctxn];
3959 if (!ctx)
3960 continue;
3961
3962 perf_event_enable_on_exec(ctx);
3963 }
3707 3964
3708 if (!atomic_read(&nr_comm_events)) 3965 if (!atomic_read(&nr_comm_events))
3709 return; 3966 return;
@@ -3805,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3805 char tmp[16]; 4062 char tmp[16];
3806 char *buf = NULL; 4063 char *buf = NULL;
3807 const char *name; 4064 const char *name;
4065 struct pmu *pmu;
4066 int ctxn;
3808 4067
3809 memset(tmp, 0, sizeof(tmp)); 4068 memset(tmp, 0, sizeof(tmp));
3810 4069
@@ -3857,12 +4116,23 @@ got_name:
3857 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4116 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3858 4117
3859 rcu_read_lock(); 4118 rcu_read_lock();
3860 cpuctx = &get_cpu_var(perf_cpu_context); 4119 list_for_each_entry_rcu(pmu, &pmus, entry) {
3861 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4120 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3862 ctx = rcu_dereference(current->perf_event_ctxp); 4121 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3863 if (ctx) 4122 vma->vm_flags & VM_EXEC);
3864 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4123
3865 put_cpu_var(perf_cpu_context); 4124 ctxn = pmu->task_ctx_nr;
4125 if (ctxn < 0)
4126 goto next;
4127
4128 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4129 if (ctx) {
4130 perf_event_mmap_ctx(ctx, mmap_event,
4131 vma->vm_flags & VM_EXEC);
4132 }
4133next:
4134 put_cpu_ptr(pmu->pmu_cpu_context);
4135 }
3866 rcu_read_unlock(); 4136 rcu_read_unlock();
3867 4137
3868 kfree(buf); 4138 kfree(buf);
@@ -3944,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3944 struct hw_perf_event *hwc = &event->hw; 4214 struct hw_perf_event *hwc = &event->hw;
3945 int ret = 0; 4215 int ret = 0;
3946 4216
3947 throttle = (throttle && event->pmu->unthrottle != NULL);
3948
3949 if (!throttle) { 4217 if (!throttle) {
3950 hwc->interrupts++; 4218 hwc->interrupts++;
3951 } else { 4219 } else {
@@ -3988,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3988 event->pending_kill = POLL_HUP; 4256 event->pending_kill = POLL_HUP;
3989 if (nmi) { 4257 if (nmi) {
3990 event->pending_disable = 1; 4258 event->pending_disable = 1;
3991 perf_pending_queue(&event->pending, 4259 irq_work_queue(&event->pending);
3992 perf_pending_event);
3993 } else 4260 } else
3994 perf_event_disable(event); 4261 perf_event_disable(event);
3995 } 4262 }
@@ -4013,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4013 * Generic software event infrastructure 4280 * Generic software event infrastructure
4014 */ 4281 */
4015 4282
4283struct swevent_htable {
4284 struct swevent_hlist *swevent_hlist;
4285 struct mutex hlist_mutex;
4286 int hlist_refcount;
4287
4288 /* Recursion avoidance in each contexts */
4289 int recursion[PERF_NR_CONTEXTS];
4290};
4291
4292static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4293
4016/* 4294/*
4017 * We directly increment event->count and keep a second value in 4295 * We directly increment event->count and keep a second value in
4018 * event->hw.period_left to count intervals. This period event 4296 * event->hw.period_left to count intervals. This period event
@@ -4070,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4070 } 4348 }
4071} 4349}
4072 4350
4073static void perf_swevent_add(struct perf_event *event, u64 nr, 4351static void perf_swevent_event(struct perf_event *event, u64 nr,
4074 int nmi, struct perf_sample_data *data, 4352 int nmi, struct perf_sample_data *data,
4075 struct pt_regs *regs) 4353 struct pt_regs *regs)
4076{ 4354{
@@ -4096,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4096static int perf_exclude_event(struct perf_event *event, 4374static int perf_exclude_event(struct perf_event *event,
4097 struct pt_regs *regs) 4375 struct pt_regs *regs)
4098{ 4376{
4377 if (event->hw.state & PERF_HES_STOPPED)
4378 return 0;
4379
4099 if (regs) { 4380 if (regs) {
4100 if (event->attr.exclude_user && user_mode(regs)) 4381 if (event->attr.exclude_user && user_mode(regs))
4101 return 1; 4382 return 1;
@@ -4142,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4142 4423
4143/* For the read side: events when they trigger */ 4424/* For the read side: events when they trigger */
4144static inline struct hlist_head * 4425static inline struct hlist_head *
4145find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4426find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4146{ 4427{
4147 struct swevent_hlist *hlist; 4428 struct swevent_hlist *hlist;
4148 4429
4149 hlist = rcu_dereference(ctx->swevent_hlist); 4430 hlist = rcu_dereference(swhash->swevent_hlist);
4150 if (!hlist) 4431 if (!hlist)
4151 return NULL; 4432 return NULL;
4152 4433
@@ -4155,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4155 4436
4156/* For the event head insertion and removal in the hlist */ 4437/* For the event head insertion and removal in the hlist */
4157static inline struct hlist_head * 4438static inline struct hlist_head *
4158find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4439find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4159{ 4440{
4160 struct swevent_hlist *hlist; 4441 struct swevent_hlist *hlist;
4161 u32 event_id = event->attr.config; 4442 u32 event_id = event->attr.config;
@@ -4166,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4166 * and release. Which makes the protected version suitable here. 4447 * and release. Which makes the protected version suitable here.
4167 * The context lock guarantees that. 4448 * The context lock guarantees that.
4168 */ 4449 */
4169 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4450 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4170 lockdep_is_held(&event->ctx->lock)); 4451 lockdep_is_held(&event->ctx->lock));
4171 if (!hlist) 4452 if (!hlist)
4172 return NULL; 4453 return NULL;
@@ -4179,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4179 struct perf_sample_data *data, 4460 struct perf_sample_data *data,
4180 struct pt_regs *regs) 4461 struct pt_regs *regs)
4181{ 4462{
4182 struct perf_cpu_context *cpuctx; 4463 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4183 struct perf_event *event; 4464 struct perf_event *event;
4184 struct hlist_node *node; 4465 struct hlist_node *node;
4185 struct hlist_head *head; 4466 struct hlist_head *head;
4186 4467
4187 cpuctx = &__get_cpu_var(perf_cpu_context);
4188
4189 rcu_read_lock(); 4468 rcu_read_lock();
4190 4469 head = find_swevent_head_rcu(swhash, type, event_id);
4191 head = find_swevent_head_rcu(cpuctx, type, event_id);
4192
4193 if (!head) 4470 if (!head)
4194 goto end; 4471 goto end;
4195 4472
4196 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4473 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4197 if (perf_swevent_match(event, type, event_id, data, regs)) 4474 if (perf_swevent_match(event, type, event_id, data, regs))
4198 perf_swevent_add(event, nr, nmi, data, regs); 4475 perf_swevent_event(event, nr, nmi, data, regs);
4199 } 4476 }
4200end: 4477end:
4201 rcu_read_unlock(); 4478 rcu_read_unlock();
@@ -4203,33 +4480,17 @@ end:
4203 4480
4204int perf_swevent_get_recursion_context(void) 4481int perf_swevent_get_recursion_context(void)
4205{ 4482{
4206 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4483 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4207 int rctx;
4208
4209 if (in_nmi())
4210 rctx = 3;
4211 else if (in_irq())
4212 rctx = 2;
4213 else if (in_softirq())
4214 rctx = 1;
4215 else
4216 rctx = 0;
4217 4484
4218 if (cpuctx->recursion[rctx]) 4485 return get_recursion_context(swhash->recursion);
4219 return -1;
4220
4221 cpuctx->recursion[rctx]++;
4222 barrier();
4223
4224 return rctx;
4225} 4486}
4226EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4487EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4227 4488
4228void inline perf_swevent_put_recursion_context(int rctx) 4489void inline perf_swevent_put_recursion_context(int rctx)
4229{ 4490{
4230 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4491 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4231 barrier(); 4492
4232 cpuctx->recursion[rctx]--; 4493 put_recursion_context(swhash->recursion, rctx);
4233} 4494}
4234 4495
4235void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4496void __perf_sw_event(u32 event_id, u64 nr, int nmi,
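get_recursion_context() and put_recursion_context() are not defined in this hunk (they move out with the new callchain code); a plausible reconstruction from the open-coded logic removed above, which may differ in detail from the real helpers:

/* Hedged reconstruction based on the removed open-coded version. */
static inline int get_recursion_context(int *recursion)
{
	int rctx;

	if (in_nmi())
		rctx = 3;
	else if (in_irq())
		rctx = 2;
	else if (in_softirq())
		rctx = 1;
	else
		rctx = 0;

	if (recursion[rctx])
		return -1;

	recursion[rctx]++;
	barrier();

	return rctx;
}

static inline void put_recursion_context(int *recursion, int rctx)
{
	barrier();
	recursion[rctx]--;
}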
@@ -4255,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event)
4255{ 4516{
4256} 4517}
4257 4518
4258static int perf_swevent_enable(struct perf_event *event) 4519static int perf_swevent_add(struct perf_event *event, int flags)
4259{ 4520{
4521 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4260 struct hw_perf_event *hwc = &event->hw; 4522 struct hw_perf_event *hwc = &event->hw;
4261 struct perf_cpu_context *cpuctx;
4262 struct hlist_head *head; 4523 struct hlist_head *head;
4263 4524
4264 cpuctx = &__get_cpu_var(perf_cpu_context);
4265
4266 if (hwc->sample_period) { 4525 if (hwc->sample_period) {
4267 hwc->last_period = hwc->sample_period; 4526 hwc->last_period = hwc->sample_period;
4268 perf_swevent_set_period(event); 4527 perf_swevent_set_period(event);
4269 } 4528 }
4270 4529
4271 head = find_swevent_head(cpuctx, event); 4530 hwc->state = !(flags & PERF_EF_START);
4531
4532 head = find_swevent_head(swhash, event);
4272 if (WARN_ON_ONCE(!head)) 4533 if (WARN_ON_ONCE(!head))
4273 return -EINVAL; 4534 return -EINVAL;
4274 4535
@@ -4277,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event)
4277 return 0; 4538 return 0;
4278} 4539}
4279 4540
4280static void perf_swevent_disable(struct perf_event *event) 4541static void perf_swevent_del(struct perf_event *event, int flags)
4281{ 4542{
4282 hlist_del_rcu(&event->hlist_entry); 4543 hlist_del_rcu(&event->hlist_entry);
4283} 4544}
4284 4545
4285static void perf_swevent_void(struct perf_event *event) 4546static void perf_swevent_start(struct perf_event *event, int flags)
4286{
4287}
4288
4289static int perf_swevent_int(struct perf_event *event)
4290{
4291 return 0;
4292}
4293
4294static const struct pmu perf_ops_generic = {
4295 .enable = perf_swevent_enable,
4296 .disable = perf_swevent_disable,
4297 .start = perf_swevent_int,
4298 .stop = perf_swevent_void,
4299 .read = perf_swevent_read,
4300 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4301};
4302
4303/*
4304 * hrtimer based swevent callback
4305 */
4306
4307static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4308{
4309 enum hrtimer_restart ret = HRTIMER_RESTART;
4310 struct perf_sample_data data;
4311 struct pt_regs *regs;
4312 struct perf_event *event;
4313 u64 period;
4314
4315 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4316 event->pmu->read(event);
4317
4318 perf_sample_data_init(&data, 0);
4319 data.period = event->hw.last_period;
4320 regs = get_irq_regs();
4321
4322 if (regs && !perf_exclude_event(event, regs)) {
4323 if (!(event->attr.exclude_idle && current->pid == 0))
4324 if (perf_event_overflow(event, 0, &data, regs))
4325 ret = HRTIMER_NORESTART;
4326 }
4327
4328 period = max_t(u64, 10000, event->hw.sample_period);
4329 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4330
4331 return ret;
4332}
4333
4334static void perf_swevent_start_hrtimer(struct perf_event *event)
4335{
4336 struct hw_perf_event *hwc = &event->hw;
4337
4338 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4339 hwc->hrtimer.function = perf_swevent_hrtimer;
4340 if (hwc->sample_period) {
4341 u64 period;
4342
4343 if (hwc->remaining) {
4344 if (hwc->remaining < 0)
4345 period = 10000;
4346 else
4347 period = hwc->remaining;
4348 hwc->remaining = 0;
4349 } else {
4350 period = max_t(u64, 10000, hwc->sample_period);
4351 }
4352 __hrtimer_start_range_ns(&hwc->hrtimer,
4353 ns_to_ktime(period), 0,
4354 HRTIMER_MODE_REL, 0);
4355 }
4356}
4357
4358static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4359{
4360 struct hw_perf_event *hwc = &event->hw;
4361
4362 if (hwc->sample_period) {
4363 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4364 hwc->remaining = ktime_to_ns(remaining);
4365
4366 hrtimer_cancel(&hwc->hrtimer);
4367 }
4368}
4369
4370/*
4371 * Software event: cpu wall time clock
4372 */
4373
4374static void cpu_clock_perf_event_update(struct perf_event *event)
4375{
4376 int cpu = raw_smp_processor_id();
4377 s64 prev;
4378 u64 now;
4379
4380 now = cpu_clock(cpu);
4381 prev = local64_xchg(&event->hw.prev_count, now);
4382 local64_add(now - prev, &event->count);
4383}
4384
4385static int cpu_clock_perf_event_enable(struct perf_event *event)
4386{
4387 struct hw_perf_event *hwc = &event->hw;
4388 int cpu = raw_smp_processor_id();
4389
4390 local64_set(&hwc->prev_count, cpu_clock(cpu));
4391 perf_swevent_start_hrtimer(event);
4392
4393 return 0;
4394}
4395
4396static void cpu_clock_perf_event_disable(struct perf_event *event)
4397{ 4547{
4398 perf_swevent_cancel_hrtimer(event); 4548 event->hw.state = 0;
4399 cpu_clock_perf_event_update(event);
4400}
4401
4402static void cpu_clock_perf_event_read(struct perf_event *event)
4403{
4404 cpu_clock_perf_event_update(event);
4405}
4406
4407static const struct pmu perf_ops_cpu_clock = {
4408 .enable = cpu_clock_perf_event_enable,
4409 .disable = cpu_clock_perf_event_disable,
4410 .read = cpu_clock_perf_event_read,
4411};
4412
4413/*
4414 * Software event: task time clock
4415 */
4416
4417static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4418{
4419 u64 prev;
4420 s64 delta;
4421
4422 prev = local64_xchg(&event->hw.prev_count, now);
4423 delta = now - prev;
4424 local64_add(delta, &event->count);
4425}
4426
4427static int task_clock_perf_event_enable(struct perf_event *event)
4428{
4429 struct hw_perf_event *hwc = &event->hw;
4430 u64 now;
4431
4432 now = event->ctx->time;
4433
4434 local64_set(&hwc->prev_count, now);
4435
4436 perf_swevent_start_hrtimer(event);
4437
4438 return 0;
4439} 4549}
4440 4550
4441static void task_clock_perf_event_disable(struct perf_event *event) 4551static void perf_swevent_stop(struct perf_event *event, int flags)
4442{ 4552{
4443 perf_swevent_cancel_hrtimer(event); 4553 event->hw.state = PERF_HES_STOPPED;
4444 task_clock_perf_event_update(event, event->ctx->time);
4445
4446} 4554}
4447 4555
4448static void task_clock_perf_event_read(struct perf_event *event)
4449{
4450 u64 time;
4451
4452 if (!in_nmi()) {
4453 update_context_time(event->ctx);
4454 time = event->ctx->time;
4455 } else {
4456 u64 now = perf_clock();
4457 u64 delta = now - event->ctx->timestamp;
4458 time = event->ctx->time + delta;
4459 }
4460
4461 task_clock_perf_event_update(event, time);
4462}
4463
4464static const struct pmu perf_ops_task_clock = {
4465 .enable = task_clock_perf_event_enable,
4466 .disable = task_clock_perf_event_disable,
4467 .read = task_clock_perf_event_read,
4468};
4469
4470/* Deref the hlist from the update side */ 4556/* Deref the hlist from the update side */
4471static inline struct swevent_hlist * 4557static inline struct swevent_hlist *
4472swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4558swevent_hlist_deref(struct swevent_htable *swhash)
4473{ 4559{
4474 return rcu_dereference_protected(cpuctx->swevent_hlist, 4560 return rcu_dereference_protected(swhash->swevent_hlist,
4475 lockdep_is_held(&cpuctx->hlist_mutex)); 4561 lockdep_is_held(&swhash->hlist_mutex));
4476} 4562}
4477 4563
4478static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4564static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4483,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4483 kfree(hlist); 4569 kfree(hlist);
4484} 4570}
4485 4571
4486static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4572static void swevent_hlist_release(struct swevent_htable *swhash)
4487{ 4573{
4488 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4574 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4489 4575
4490 if (!hlist) 4576 if (!hlist)
4491 return; 4577 return;
4492 4578
4493 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4579 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4494 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4580 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4495} 4581}
4496 4582
4497static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4583static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4498{ 4584{
4499 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4585 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4500 4586
4501 mutex_lock(&cpuctx->hlist_mutex); 4587 mutex_lock(&swhash->hlist_mutex);
4502 4588
4503 if (!--cpuctx->hlist_refcount) 4589 if (!--swhash->hlist_refcount)
4504 swevent_hlist_release(cpuctx); 4590 swevent_hlist_release(swhash);
4505 4591
4506 mutex_unlock(&cpuctx->hlist_mutex); 4592 mutex_unlock(&swhash->hlist_mutex);
4507} 4593}
4508 4594
4509static void swevent_hlist_put(struct perf_event *event) 4595static void swevent_hlist_put(struct perf_event *event)
@@ -4521,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event)
4521 4607
4522static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4608static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4523{ 4609{
4524 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4610 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4525 int err = 0; 4611 int err = 0;
4526 4612
4527 mutex_lock(&cpuctx->hlist_mutex); 4613 mutex_lock(&swhash->hlist_mutex);
4528 4614
4529 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4615 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4530 struct swevent_hlist *hlist; 4616 struct swevent_hlist *hlist;
4531 4617
4532 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4618 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4534,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4534 err = -ENOMEM; 4620 err = -ENOMEM;
4535 goto exit; 4621 goto exit;
4536 } 4622 }
4537 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4623 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4538 } 4624 }
4539 cpuctx->hlist_refcount++; 4625 swhash->hlist_refcount++;
4540 exit: 4626exit:
4541 mutex_unlock(&cpuctx->hlist_mutex); 4627 mutex_unlock(&swhash->hlist_mutex);
4542 4628
4543 return err; 4629 return err;
4544} 4630}
@@ -4562,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event)
4562 put_online_cpus(); 4648 put_online_cpus();
4563 4649
4564 return 0; 4650 return 0;
4565 fail: 4651fail:
4566 for_each_possible_cpu(cpu) { 4652 for_each_possible_cpu(cpu) {
4567 if (cpu == failed_cpu) 4653 if (cpu == failed_cpu)
4568 break; 4654 break;
@@ -4573,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event)
4573 return err; 4659 return err;
4574} 4660}
4575 4661
4576#ifdef CONFIG_EVENT_TRACING 4662atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4663
4664static void sw_perf_event_destroy(struct perf_event *event)
4665{
4666 u64 event_id = event->attr.config;
4667
4668 WARN_ON(event->parent);
4577 4669
4578static const struct pmu perf_ops_tracepoint = { 4670 jump_label_dec(&perf_swevent_enabled[event_id]);
4579 .enable = perf_trace_enable, 4671 swevent_hlist_put(event);
4580 .disable = perf_trace_disable, 4672}
4581 .start = perf_swevent_int, 4673
4582 .stop = perf_swevent_void, 4674static int perf_swevent_init(struct perf_event *event)
4675{
4676 int event_id = event->attr.config;
4677
4678 if (event->attr.type != PERF_TYPE_SOFTWARE)
4679 return -ENOENT;
4680
4681 switch (event_id) {
4682 case PERF_COUNT_SW_CPU_CLOCK:
4683 case PERF_COUNT_SW_TASK_CLOCK:
4684 return -ENOENT;
4685
4686 default:
4687 break;
4688 }
4689
4690 if (event_id > PERF_COUNT_SW_MAX)
4691 return -ENOENT;
4692
4693 if (!event->parent) {
4694 int err;
4695
4696 err = swevent_hlist_get(event);
4697 if (err)
4698 return err;
4699
4700 jump_label_inc(&perf_swevent_enabled[event_id]);
4701 event->destroy = sw_perf_event_destroy;
4702 }
4703
4704 return 0;
4705}
4706
4707static struct pmu perf_swevent = {
4708 .task_ctx_nr = perf_sw_context,
4709
4710 .event_init = perf_swevent_init,
4711 .add = perf_swevent_add,
4712 .del = perf_swevent_del,
4713 .start = perf_swevent_start,
4714 .stop = perf_swevent_stop,
4583 .read = perf_swevent_read, 4715 .read = perf_swevent_read,
4584 .unthrottle = perf_swevent_void,
4585}; 4716};
4586 4717
4718#ifdef CONFIG_EVENT_TRACING
4719
4587static int perf_tp_filter_match(struct perf_event *event, 4720static int perf_tp_filter_match(struct perf_event *event,
4588 struct perf_sample_data *data) 4721 struct perf_sample_data *data)
4589{ 4722{
@@ -4627,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4627 4760
4628 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4761 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4629 if (perf_tp_event_match(event, &data, regs)) 4762 if (perf_tp_event_match(event, &data, regs))
4630 perf_swevent_add(event, count, 1, &data, regs); 4763 perf_swevent_event(event, count, 1, &data, regs);
4631 } 4764 }
4632 4765
4633 perf_swevent_put_recursion_context(rctx); 4766 perf_swevent_put_recursion_context(rctx);
@@ -4639,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4639 perf_trace_destroy(event); 4772 perf_trace_destroy(event);
4640} 4773}
4641 4774
4642static const struct pmu *tp_perf_event_init(struct perf_event *event) 4775static int perf_tp_event_init(struct perf_event *event)
4643{ 4776{
4644 int err; 4777 int err;
4645 4778
4779 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4780 return -ENOENT;
4781
4646 /* 4782 /*
4647 * Raw tracepoint data is a severe data leak, only allow root to 4783 * Raw tracepoint data is a severe data leak, only allow root to
4648 * have these. 4784 * have these.
@@ -4650,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4650 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4786 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4651 perf_paranoid_tracepoint_raw() && 4787 perf_paranoid_tracepoint_raw() &&
4652 !capable(CAP_SYS_ADMIN)) 4788 !capable(CAP_SYS_ADMIN))
4653 return ERR_PTR(-EPERM); 4789 return -EPERM;
4654 4790
4655 err = perf_trace_init(event); 4791 err = perf_trace_init(event);
4656 if (err) 4792 if (err)
4657 return NULL; 4793 return err;
4658 4794
4659 event->destroy = tp_perf_event_destroy; 4795 event->destroy = tp_perf_event_destroy;
4660 4796
4661 return &perf_ops_tracepoint; 4797 return 0;
4798}
4799
4800static struct pmu perf_tracepoint = {
4801 .task_ctx_nr = perf_sw_context,
4802
4803 .event_init = perf_tp_event_init,
4804 .add = perf_trace_add,
4805 .del = perf_trace_del,
4806 .start = perf_swevent_start,
4807 .stop = perf_swevent_stop,
4808 .read = perf_swevent_read,
4809};
4810
4811static inline void perf_tp_register(void)
4812{
4813 perf_pmu_register(&perf_tracepoint);
4662} 4814}
4663 4815
4664static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4816static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4686,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event)
4686 4838
4687#else 4839#else
4688 4840
4689static const struct pmu *tp_perf_event_init(struct perf_event *event) 4841static inline void perf_tp_register(void)
4690{ 4842{
4691 return NULL;
4692} 4843}
4693 4844
4694static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4845static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4703,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event)
4703#endif /* CONFIG_EVENT_TRACING */ 4854#endif /* CONFIG_EVENT_TRACING */
4704 4855
4705#ifdef CONFIG_HAVE_HW_BREAKPOINT 4856#ifdef CONFIG_HAVE_HW_BREAKPOINT
4706static void bp_perf_event_destroy(struct perf_event *event) 4857void perf_bp_event(struct perf_event *bp, void *data)
4707{ 4858{
4708 release_bp_slot(event); 4859 struct perf_sample_data sample;
4860 struct pt_regs *regs = data;
4861
4862 perf_sample_data_init(&sample, bp->attr.bp_addr);
4863
4864 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4865 perf_swevent_event(bp, 1, 1, &sample, regs);
4709} 4866}
4867#endif
4868
4869/*
4870 * hrtimer based swevent callback
4871 */
4710 4872
4711static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4873static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4712{ 4874{
4713 int err; 4875 enum hrtimer_restart ret = HRTIMER_RESTART;
4876 struct perf_sample_data data;
4877 struct pt_regs *regs;
4878 struct perf_event *event;
4879 u64 period;
4714 4880
4715 err = register_perf_hw_breakpoint(bp); 4881 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4716 if (err) 4882 event->pmu->read(event);
4717 return ERR_PTR(err);
4718 4883
4719 bp->destroy = bp_perf_event_destroy; 4884 perf_sample_data_init(&data, 0);
4885 data.period = event->hw.last_period;
4886 regs = get_irq_regs();
4720 4887
4721 return &perf_ops_bp; 4888 if (regs && !perf_exclude_event(event, regs)) {
4889 if (!(event->attr.exclude_idle && current->pid == 0))
4890 if (perf_event_overflow(event, 0, &data, regs))
4891 ret = HRTIMER_NORESTART;
4892 }
4893
4894 period = max_t(u64, 10000, event->hw.sample_period);
4895 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4896
4897 return ret;
4722} 4898}
4723 4899
4724void perf_bp_event(struct perf_event *bp, void *data) 4900static void perf_swevent_start_hrtimer(struct perf_event *event)
4725{ 4901{
4726 struct perf_sample_data sample; 4902 struct hw_perf_event *hwc = &event->hw;
4727 struct pt_regs *regs = data;
4728 4903
4729 perf_sample_data_init(&sample, bp->attr.bp_addr); 4904 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4905 hwc->hrtimer.function = perf_swevent_hrtimer;
4906 if (hwc->sample_period) {
4907 s64 period = local64_read(&hwc->period_left);
4908
4909 if (period) {
4910 if (period < 0)
4911 period = 10000;
4912
4913 local64_set(&hwc->period_left, 0);
4914 } else {
4915 period = max_t(u64, 10000, hwc->sample_period);
4916 }
4917 __hrtimer_start_range_ns(&hwc->hrtimer,
4918 ns_to_ktime(period), 0,
4919 HRTIMER_MODE_REL_PINNED, 0);
4920 }
4921}
4730 4922
4731 if (!perf_exclude_event(bp, regs)) 4923static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4732 perf_swevent_add(bp, 1, 1, &sample, regs); 4924{
4925 struct hw_perf_event *hwc = &event->hw;
4926
4927 if (hwc->sample_period) {
4928 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4929 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4930
4931 hrtimer_cancel(&hwc->hrtimer);
4932 }
4733} 4933}
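The hrtimer-based emulation for the software clock events is carried over essentially unchanged: the callback re-arms itself with hrtimer_forward_now() using a 10 µs floor, and cancelling preserves the remaining time (now in hw.period_left rather than hw.remaining). A hedged, generic sketch of the same self-rearming pattern with hypothetical names:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/kernel.h>

static struct hrtimer sample_timer;		/* hypothetical sampler */
static u64 sample_period_ns = 1000000;		/* assumed 1 ms period */

static enum hrtimer_restart sample_timer_fn(struct hrtimer *timer)
{
	u64 period = max_t(u64, 10000, sample_period_ns);	/* 10 us floor */

	/* ... take the sample here ... */

	hrtimer_forward_now(timer, ns_to_ktime(period));
	return HRTIMER_RESTART;
}

static void sample_timer_start(void)
{
	hrtimer_init(&sample_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sample_timer.function = sample_timer_fn;
	hrtimer_start(&sample_timer, ns_to_ktime(sample_period_ns),
		      HRTIMER_MODE_REL);
}

static void sample_timer_stop(void)
{
	hrtimer_cancel(&sample_timer);
}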
4734#else 4934
4735static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4935/*
4936 * Software event: cpu wall time clock
4937 */
4938
4939static void cpu_clock_event_update(struct perf_event *event)
4736{ 4940{
4737 return NULL; 4941 s64 prev;
4942 u64 now;
4943
4944 now = local_clock();
4945 prev = local64_xchg(&event->hw.prev_count, now);
4946 local64_add(now - prev, &event->count);
4738} 4947}
4739 4948
4740void perf_bp_event(struct perf_event *bp, void *regs) 4949static void cpu_clock_event_start(struct perf_event *event, int flags)
4741{ 4950{
4951 local64_set(&event->hw.prev_count, local_clock());
4952 perf_swevent_start_hrtimer(event);
4742} 4953}
4743#endif
4744 4954
4745atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4955static void cpu_clock_event_stop(struct perf_event *event, int flags)
4956{
4957 perf_swevent_cancel_hrtimer(event);
4958 cpu_clock_event_update(event);
4959}
4746 4960
4747static void sw_perf_event_destroy(struct perf_event *event) 4961static int cpu_clock_event_add(struct perf_event *event, int flags)
4748{ 4962{
4749 u64 event_id = event->attr.config; 4963 if (flags & PERF_EF_START)
4964 cpu_clock_event_start(event, flags);
4750 4965
4751 WARN_ON(event->parent); 4966 return 0;
4967}
4752 4968
4753 atomic_dec(&perf_swevent_enabled[event_id]); 4969static void cpu_clock_event_del(struct perf_event *event, int flags)
4754 swevent_hlist_put(event); 4970{
4971 cpu_clock_event_stop(event, flags);
4755} 4972}
4756 4973
4757static const struct pmu *sw_perf_event_init(struct perf_event *event) 4974static void cpu_clock_event_read(struct perf_event *event)
4758{ 4975{
4759 const struct pmu *pmu = NULL; 4976 cpu_clock_event_update(event);
4760 u64 event_id = event->attr.config; 4977}
4978
4979static int cpu_clock_event_init(struct perf_event *event)
4980{
4981 if (event->attr.type != PERF_TYPE_SOFTWARE)
4982 return -ENOENT;
4983
4984 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
4985 return -ENOENT;
4761 4986
4987 return 0;
4988}
4989
4990static struct pmu perf_cpu_clock = {
4991 .task_ctx_nr = perf_sw_context,
4992
4993 .event_init = cpu_clock_event_init,
4994 .add = cpu_clock_event_add,
4995 .del = cpu_clock_event_del,
4996 .start = cpu_clock_event_start,
4997 .stop = cpu_clock_event_stop,
4998 .read = cpu_clock_event_read,
4999};
5000
5001/*
5002 * Software event: task time clock
5003 */
5004
5005static void task_clock_event_update(struct perf_event *event, u64 now)
5006{
5007 u64 prev;
5008 s64 delta;
5009
5010 prev = local64_xchg(&event->hw.prev_count, now);
5011 delta = now - prev;
5012 local64_add(delta, &event->count);
5013}
5014
5015static void task_clock_event_start(struct perf_event *event, int flags)
5016{
5017 local64_set(&event->hw.prev_count, event->ctx->time);
5018 perf_swevent_start_hrtimer(event);
5019}
5020
5021static void task_clock_event_stop(struct perf_event *event, int flags)
5022{
5023 perf_swevent_cancel_hrtimer(event);
5024 task_clock_event_update(event, event->ctx->time);
5025}
5026
5027static int task_clock_event_add(struct perf_event *event, int flags)
5028{
5029 if (flags & PERF_EF_START)
5030 task_clock_event_start(event, flags);
5031
5032 return 0;
5033}
5034
5035static void task_clock_event_del(struct perf_event *event, int flags)
5036{
5037 task_clock_event_stop(event, PERF_EF_UPDATE);
5038}
5039
5040static void task_clock_event_read(struct perf_event *event)
5041{
5042 u64 time;
5043
5044 if (!in_nmi()) {
5045 update_context_time(event->ctx);
5046 time = event->ctx->time;
5047 } else {
5048 u64 now = perf_clock();
5049 u64 delta = now - event->ctx->timestamp;
5050 time = event->ctx->time + delta;
5051 }
5052
5053 task_clock_event_update(event, time);
5054}
5055
5056static int task_clock_event_init(struct perf_event *event)
5057{
5058 if (event->attr.type != PERF_TYPE_SOFTWARE)
5059 return -ENOENT;
5060
5061 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5062 return -ENOENT;
5063
5064 return 0;
5065}
5066
5067static struct pmu perf_task_clock = {
5068 .task_ctx_nr = perf_sw_context,
5069
5070 .event_init = task_clock_event_init,
5071 .add = task_clock_event_add,
5072 .del = task_clock_event_del,
5073 .start = task_clock_event_start,
5074 .stop = task_clock_event_stop,
5075 .read = task_clock_event_read,
5076};
5077
5078static void perf_pmu_nop_void(struct pmu *pmu)
5079{
5080}
5081
5082static int perf_pmu_nop_int(struct pmu *pmu)
5083{
5084 return 0;
5085}
5086
5087static void perf_pmu_start_txn(struct pmu *pmu)
5088{
5089 perf_pmu_disable(pmu);
5090}
5091
5092static int perf_pmu_commit_txn(struct pmu *pmu)
5093{
5094 perf_pmu_enable(pmu);
5095 return 0;
5096}
5097
5098static void perf_pmu_cancel_txn(struct pmu *pmu)
5099{
5100 perf_pmu_enable(pmu);
5101}
5102
5103/*
5104 * Ensures all contexts with the same task_ctx_nr have the same
5105 * pmu_cpu_context too.
5106 */
5107static void *find_pmu_context(int ctxn)
5108{
5109 struct pmu *pmu;
5110
5111 if (ctxn < 0)
5112 return NULL;
5113
5114 list_for_each_entry(pmu, &pmus, entry) {
5115 if (pmu->task_ctx_nr == ctxn)
5116 return pmu->pmu_cpu_context;
5117 }
5118
5119 return NULL;
5120}
5121
5122static void free_pmu_context(void * __percpu cpu_context)
5123{
5124 struct pmu *pmu;
5125
5126 mutex_lock(&pmus_lock);
4762 /* 5127 /*
4763 * Software events (currently) can't in general distinguish 5128 * Like a real lame refcount.
4764 * between user, kernel and hypervisor events.
4765 * However, context switches and cpu migrations are considered
4766 * to be kernel events, and page faults are never hypervisor
4767 * events.
4768 */ 5129 */
4769 switch (event_id) { 5130 list_for_each_entry(pmu, &pmus, entry) {
4770 case PERF_COUNT_SW_CPU_CLOCK: 5131 if (pmu->pmu_cpu_context == cpu_context)
4771 pmu = &perf_ops_cpu_clock; 5132 goto out;
5133 }
4772 5134
4773 break; 5135 free_percpu(cpu_context);
4774 case PERF_COUNT_SW_TASK_CLOCK: 5136out:
4775 /* 5137 mutex_unlock(&pmus_lock);
4776 * If the user instantiates this as a per-cpu event, 5138}
4777 * use the cpu_clock event instead.
4778 */
4779 if (event->ctx->task)
4780 pmu = &perf_ops_task_clock;
4781 else
4782 pmu = &perf_ops_cpu_clock;
4783 5139
4784 break; 5140int perf_pmu_register(struct pmu *pmu)
4785 case PERF_COUNT_SW_PAGE_FAULTS: 5141{
4786 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5142 int cpu, ret;
4787 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 5143
4788 case PERF_COUNT_SW_CONTEXT_SWITCHES: 5144 mutex_lock(&pmus_lock);
4789 case PERF_COUNT_SW_CPU_MIGRATIONS: 5145 ret = -ENOMEM;
4790 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 5146 pmu->pmu_disable_count = alloc_percpu(int);
4791 case PERF_COUNT_SW_EMULATION_FAULTS: 5147 if (!pmu->pmu_disable_count)
4792 if (!event->parent) { 5148 goto unlock;
4793 int err; 5149
4794 5150 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
4795 err = swevent_hlist_get(event); 5151 if (pmu->pmu_cpu_context)
4796 if (err) 5152 goto got_cpu_context;
4797 return ERR_PTR(err); 5153
5154 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5155 if (!pmu->pmu_cpu_context)
5156 goto free_pdc;
4798 5157
4799 atomic_inc(&perf_swevent_enabled[event_id]); 5158 for_each_possible_cpu(cpu) {
4800 event->destroy = sw_perf_event_destroy; 5159 struct perf_cpu_context *cpuctx;
5160
5161 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5162 __perf_event_init_context(&cpuctx->ctx);
5163 cpuctx->ctx.type = cpu_context;
5164 cpuctx->ctx.pmu = pmu;
5165 cpuctx->jiffies_interval = 1;
5166 INIT_LIST_HEAD(&cpuctx->rotation_list);
5167 }
5168
5169got_cpu_context:
5170 if (!pmu->start_txn) {
5171 if (pmu->pmu_enable) {
5172 /*
5173 * If we have pmu_enable/pmu_disable calls, install
5174 * transaction stubs that use that to try and batch
5175 * hardware accesses.
5176 */
5177 pmu->start_txn = perf_pmu_start_txn;
5178 pmu->commit_txn = perf_pmu_commit_txn;
5179 pmu->cancel_txn = perf_pmu_cancel_txn;
5180 } else {
5181 pmu->start_txn = perf_pmu_nop_void;
5182 pmu->commit_txn = perf_pmu_nop_int;
5183 pmu->cancel_txn = perf_pmu_nop_void;
5184 }
5185 }
5186
5187 if (!pmu->pmu_enable) {
5188 pmu->pmu_enable = perf_pmu_nop_void;
5189 pmu->pmu_disable = perf_pmu_nop_void;
5190 }
5191
5192 list_add_rcu(&pmu->entry, &pmus);
5193 ret = 0;
5194unlock:
5195 mutex_unlock(&pmus_lock);
5196
5197 return ret;
5198
5199free_pdc:
5200 free_percpu(pmu->pmu_disable_count);
5201 goto unlock;
5202}
5203
5204void perf_pmu_unregister(struct pmu *pmu)
5205{
5206 mutex_lock(&pmus_lock);
5207 list_del_rcu(&pmu->entry);
5208 mutex_unlock(&pmus_lock);
5209
5210 /*
5211 * We dereference the pmu list under both SRCU and regular RCU, so
5212 * synchronize against both of those.
5213 */
5214 synchronize_srcu(&pmus_srcu);
5215 synchronize_rcu();
5216
5217 free_percpu(pmu->pmu_disable_count);
5218 free_pmu_context(pmu->pmu_cpu_context);
5219}
5220
5221struct pmu *perf_init_event(struct perf_event *event)
5222{
5223 struct pmu *pmu = NULL;
5224 int idx;
5225
5226 idx = srcu_read_lock(&pmus_srcu);
5227 list_for_each_entry_rcu(pmu, &pmus, entry) {
5228 int ret = pmu->event_init(event);
5229 if (!ret)
5230 goto unlock;
5231
5232 if (ret != -ENOENT) {
5233 pmu = ERR_PTR(ret);
5234 goto unlock;
4801 } 5235 }
4802 pmu = &perf_ops_generic;
4803 break;
4804 } 5236 }
5237 pmu = ERR_PTR(-ENOENT);
5238unlock:
5239 srcu_read_unlock(&pmus_srcu, idx);
4805 5240
4806 return pmu; 5241 return pmu;
4807} 5242}
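perf_pmu_register() together with the -ENOENT probing in perf_init_event() is what replaces the hard-coded type switch deleted further down: a pmu claims an event by returning 0 from event_init(), declines foreign events with -ENOENT so the walk continues, and any other error aborts the lookup. A hedged skeleton of a do-nothing pmu written against the new callbacks; all demo_* names and the config value are hypothetical, and the counting logic is elided:

#define DEMO_CONFIG	0x1234			/* made-up config value */

static int demo_event_init(struct perf_event *event)
{
	/* Decline events that belong to another pmu with -ENOENT. */
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;
	if (event->attr.config != DEMO_CONFIG)
		return -ENOENT;

	return 0;
}

static int demo_add(struct perf_event *event, int flags)
{
	event->hw.state = (flags & PERF_EF_START) ? 0 : PERF_HES_STOPPED;
	return 0;
}

static void demo_del(struct perf_event *event, int flags)
{
	event->hw.state = PERF_HES_STOPPED;
}

static void demo_start(struct perf_event *event, int flags)
{
	event->hw.state = 0;
}

static void demo_stop(struct perf_event *event, int flags)
{
	event->hw.state = PERF_HES_STOPPED;
}

static void demo_read(struct perf_event *event)
{
}

static struct pmu demo_pmu = {
	.task_ctx_nr	= perf_sw_context,

	.event_init	= demo_event_init,
	.add		= demo_add,
	.del		= demo_del,
	.start		= demo_start,
	.stop		= demo_stop,
	.read		= demo_read,
};

static int __init demo_pmu_setup(void)
{
	/* Single-argument form as introduced by this patch. */
	return perf_pmu_register(&demo_pmu);
}

Because start_txn/commit_txn/cancel_txn and pmu_enable/pmu_disable are left NULL here, registration fills them in with the nop or batching stubs shown above.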
@@ -4810,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4810 * Allocate and initialize a event structure 5245 * Allocate and initialize a event structure
4811 */ 5246 */
4812static struct perf_event * 5247static struct perf_event *
4813perf_event_alloc(struct perf_event_attr *attr, 5248perf_event_alloc(struct perf_event_attr *attr, int cpu,
4814 int cpu, 5249 struct task_struct *task,
4815 struct perf_event_context *ctx, 5250 struct perf_event *group_leader,
4816 struct perf_event *group_leader, 5251 struct perf_event *parent_event,
4817 struct perf_event *parent_event, 5252 perf_overflow_handler_t overflow_handler)
4818 perf_overflow_handler_t overflow_handler, 5253{
4819 gfp_t gfpflags) 5254 struct pmu *pmu;
4820{
4821 const struct pmu *pmu;
4822 struct perf_event *event; 5255 struct perf_event *event;
4823 struct hw_perf_event *hwc; 5256 struct hw_perf_event *hwc;
4824 long err; 5257 long err;
4825 5258
4826 event = kzalloc(sizeof(*event), gfpflags); 5259 event = kzalloc(sizeof(*event), GFP_KERNEL);
4827 if (!event) 5260 if (!event)
4828 return ERR_PTR(-ENOMEM); 5261 return ERR_PTR(-ENOMEM);
4829 5262
@@ -4841,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4841 INIT_LIST_HEAD(&event->event_entry); 5274 INIT_LIST_HEAD(&event->event_entry);
4842 INIT_LIST_HEAD(&event->sibling_list); 5275 INIT_LIST_HEAD(&event->sibling_list);
4843 init_waitqueue_head(&event->waitq); 5276 init_waitqueue_head(&event->waitq);
5277 init_irq_work(&event->pending, perf_pending_event);
4844 5278
4845 mutex_init(&event->mmap_mutex); 5279 mutex_init(&event->mmap_mutex);
4846 5280
@@ -4848,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4848 event->attr = *attr; 5282 event->attr = *attr;
4849 event->group_leader = group_leader; 5283 event->group_leader = group_leader;
4850 event->pmu = NULL; 5284 event->pmu = NULL;
4851 event->ctx = ctx;
4852 event->oncpu = -1; 5285 event->oncpu = -1;
4853 5286
4854 event->parent = parent_event; 5287 event->parent = parent_event;
@@ -4858,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr,
4858 5291
4859 event->state = PERF_EVENT_STATE_INACTIVE; 5292 event->state = PERF_EVENT_STATE_INACTIVE;
4860 5293
5294 if (task) {
5295 event->attach_state = PERF_ATTACH_TASK;
5296#ifdef CONFIG_HAVE_HW_BREAKPOINT
5297 /*
5298 * hw_breakpoint is a bit difficult here..
5299 */
5300 if (attr->type == PERF_TYPE_BREAKPOINT)
5301 event->hw.bp_target = task;
5302#endif
5303 }
5304
4861 if (!overflow_handler && parent_event) 5305 if (!overflow_handler && parent_event)
4862 overflow_handler = parent_event->overflow_handler; 5306 overflow_handler = parent_event->overflow_handler;
4863 5307
@@ -4882,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4882 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5326 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4883 goto done; 5327 goto done;
4884 5328
4885 switch (attr->type) { 5329 pmu = perf_init_event(event);
4886 case PERF_TYPE_RAW:
4887 case PERF_TYPE_HARDWARE:
4888 case PERF_TYPE_HW_CACHE:
4889 pmu = hw_perf_event_init(event);
4890 break;
4891
4892 case PERF_TYPE_SOFTWARE:
4893 pmu = sw_perf_event_init(event);
4894 break;
4895
4896 case PERF_TYPE_TRACEPOINT:
4897 pmu = tp_perf_event_init(event);
4898 break;
4899
4900 case PERF_TYPE_BREAKPOINT:
4901 pmu = bp_perf_event_init(event);
4902 break;
4903
4904 5330
4905 default:
4906 break;
4907 }
4908done: 5331done:
4909 err = 0; 5332 err = 0;
4910 if (!pmu) 5333 if (!pmu)
@@ -4922,13 +5345,21 @@ done:
4922 event->pmu = pmu; 5345 event->pmu = pmu;
4923 5346
4924 if (!event->parent) { 5347 if (!event->parent) {
4925 atomic_inc(&nr_events); 5348 if (event->attach_state & PERF_ATTACH_TASK)
5349 jump_label_inc(&perf_task_events);
4926 if (event->attr.mmap || event->attr.mmap_data) 5350 if (event->attr.mmap || event->attr.mmap_data)
4927 atomic_inc(&nr_mmap_events); 5351 atomic_inc(&nr_mmap_events);
4928 if (event->attr.comm) 5352 if (event->attr.comm)
4929 atomic_inc(&nr_comm_events); 5353 atomic_inc(&nr_comm_events);
4930 if (event->attr.task) 5354 if (event->attr.task)
4931 atomic_inc(&nr_task_events); 5355 atomic_inc(&nr_task_events);
5356 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5357 err = get_callchain_buffers();
5358 if (err) {
5359 free_event(event);
5360 return ERR_PTR(err);
5361 }
5362 }
4932 } 5363 }
4933 5364
4934 return event; 5365 return event;
@@ -5076,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open,
5076 struct perf_event_attr __user *, attr_uptr, 5507 struct perf_event_attr __user *, attr_uptr,
5077 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5508 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5078{ 5509{
5079 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5510 struct perf_event *group_leader = NULL, *output_event = NULL;
5511 struct perf_event *event, *sibling;
5080 struct perf_event_attr attr; 5512 struct perf_event_attr attr;
5081 struct perf_event_context *ctx; 5513 struct perf_event_context *ctx;
5082 struct file *event_file = NULL; 5514 struct file *event_file = NULL;
5083 struct file *group_file = NULL; 5515 struct file *group_file = NULL;
5516 struct task_struct *task = NULL;
5517 struct pmu *pmu;
5084 int event_fd; 5518 int event_fd;
5519 int move_group = 0;
5085 int fput_needed = 0; 5520 int fput_needed = 0;
5086 int err; 5521 int err;
5087 5522
@@ -5107,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open,
5107 if (event_fd < 0) 5542 if (event_fd < 0)
5108 return event_fd; 5543 return event_fd;
5109 5544
5110 /*
5111 * Get the target context (task or percpu):
5112 */
5113 ctx = find_get_context(pid, cpu);
5114 if (IS_ERR(ctx)) {
5115 err = PTR_ERR(ctx);
5116 goto err_fd;
5117 }
5118
5119 if (group_fd != -1) { 5545 if (group_fd != -1) {
5120 group_leader = perf_fget_light(group_fd, &fput_needed); 5546 group_leader = perf_fget_light(group_fd, &fput_needed);
5121 if (IS_ERR(group_leader)) { 5547 if (IS_ERR(group_leader)) {
5122 err = PTR_ERR(group_leader); 5548 err = PTR_ERR(group_leader);
5123 goto err_put_context; 5549 goto err_fd;
5124 } 5550 }
5125 group_file = group_leader->filp; 5551 group_file = group_leader->filp;
5126 if (flags & PERF_FLAG_FD_OUTPUT) 5552 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5129,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open,
5129 group_leader = NULL; 5555 group_leader = NULL;
5130 } 5556 }
5131 5557
5558 if (pid != -1) {
5559 task = find_lively_task_by_vpid(pid);
5560 if (IS_ERR(task)) {
5561 err = PTR_ERR(task);
5562 goto err_group_fd;
5563 }
5564 }
5565
5566 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
5567 if (IS_ERR(event)) {
5568 err = PTR_ERR(event);
5569 goto err_task;
5570 }
5571
5572 /*
5573 * Special case software events and allow them to be part of
5574 * any hardware group.
5575 */
5576 pmu = event->pmu;
5577
5578 if (group_leader &&
5579 (is_software_event(event) != is_software_event(group_leader))) {
5580 if (is_software_event(event)) {
5581 /*
5582 * If event and group_leader are not both a software
5583 * event, and event is, then group leader is not.
5584 *
5585 * Allow the addition of software events to !software
5586 * groups, this is safe because software events never
5587 * fail to schedule.
5588 */
5589 pmu = group_leader->pmu;
5590 } else if (is_software_event(group_leader) &&
5591 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5592 /*
5593 * In case the group is a pure software group, and we
5594 * try to add a hardware event, move the whole group to
5595 * the hardware context.
5596 */
5597 move_group = 1;
5598 }
5599 }
5600
5601 /*
5602 * Get the target context (task or percpu):
5603 */
5604 ctx = find_get_context(pmu, task, cpu);
5605 if (IS_ERR(ctx)) {
5606 err = PTR_ERR(ctx);
5607 goto err_alloc;
5608 }
5609
5132 /* 5610 /*
5133 * Look up the group leader (we will attach this event to it): 5611 * Look up the group leader (we will attach this event to it):
5134 */ 5612 */
@@ -5140,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open,
5140 * becoming part of another group-sibling): 5618 * becoming part of another group-sibling):
5141 */ 5619 */
5142 if (group_leader->group_leader != group_leader) 5620 if (group_leader->group_leader != group_leader)
5143 goto err_put_context; 5621 goto err_context;
5144 /* 5622 /*
5145 * Do not allow to attach to a group in a different 5623 * Do not allow to attach to a group in a different
5146 * task or CPU context: 5624 * task or CPU context:
5147 */ 5625 */
5148 if (group_leader->ctx != ctx) 5626 if (move_group) {
5149 goto err_put_context; 5627 if (group_leader->ctx->type != ctx->type)
5628 goto err_context;
5629 } else {
5630 if (group_leader->ctx != ctx)
5631 goto err_context;
5632 }
5633
5150 /* 5634 /*
5151 * Only a group leader can be exclusive or pinned 5635 * Only a group leader can be exclusive or pinned
5152 */ 5636 */
5153 if (attr.exclusive || attr.pinned) 5637 if (attr.exclusive || attr.pinned)
5154 goto err_put_context; 5638 goto err_context;
5155 }
5156
5157 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5158 NULL, NULL, GFP_KERNEL);
5159 if (IS_ERR(event)) {
5160 err = PTR_ERR(event);
5161 goto err_put_context;
5162 } 5639 }
5163 5640
5164 if (output_event) { 5641 if (output_event) {
5165 err = perf_event_set_output(event, output_event); 5642 err = perf_event_set_output(event, output_event);
5166 if (err) 5643 if (err)
5167 goto err_free_put_context; 5644 goto err_context;
5168 } 5645 }
5169 5646
5170 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5647 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5171 if (IS_ERR(event_file)) { 5648 if (IS_ERR(event_file)) {
5172 err = PTR_ERR(event_file); 5649 err = PTR_ERR(event_file);
5173 goto err_free_put_context; 5650 goto err_context;
5651 }
5652
5653 if (move_group) {
5654 struct perf_event_context *gctx = group_leader->ctx;
5655
5656 mutex_lock(&gctx->mutex);
5657 perf_event_remove_from_context(group_leader);
5658 list_for_each_entry(sibling, &group_leader->sibling_list,
5659 group_entry) {
5660 perf_event_remove_from_context(sibling);
5661 put_ctx(gctx);
5662 }
5663 mutex_unlock(&gctx->mutex);
5664 put_ctx(gctx);
5174 } 5665 }
5175 5666
5176 event->filp = event_file; 5667 event->filp = event_file;
5177 WARN_ON_ONCE(ctx->parent_ctx); 5668 WARN_ON_ONCE(ctx->parent_ctx);
5178 mutex_lock(&ctx->mutex); 5669 mutex_lock(&ctx->mutex);
5670
5671 if (move_group) {
5672 perf_install_in_context(ctx, group_leader, cpu);
5673 get_ctx(ctx);
5674 list_for_each_entry(sibling, &group_leader->sibling_list,
5675 group_entry) {
5676 perf_install_in_context(ctx, sibling, cpu);
5677 get_ctx(ctx);
5678 }
5679 }
5680
5179 perf_install_in_context(ctx, event, cpu); 5681 perf_install_in_context(ctx, event, cpu);
5180 ++ctx->generation; 5682 ++ctx->generation;
5181 mutex_unlock(&ctx->mutex); 5683 mutex_unlock(&ctx->mutex);
@@ -5196,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open,
5196 fd_install(event_fd, event_file); 5698 fd_install(event_fd, event_file);
5197 return event_fd; 5699 return event_fd;
5198 5700
5199err_free_put_context: 5701err_context:
5702 put_ctx(ctx);
5703err_alloc:
5200 free_event(event); 5704 free_event(event);
5201err_put_context: 5705err_task:
5706 if (task)
5707 put_task_struct(task);
5708err_group_fd:
5202 fput_light(group_file, fput_needed); 5709 fput_light(group_file, fput_needed);
5203 put_ctx(ctx);
5204err_fd: 5710err_fd:
5205 put_unused_fd(event_fd); 5711 put_unused_fd(event_fd);
5206 return err; 5712 return err;
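The move_group handling above is what makes mixed groups usable from userspace: a software event opened into a hardware-led group is placed on the leader's (hardware) context, and a previously pure-software group is migrated wholesale when a hardware sibling arrives. A hedged userspace sketch of the first case, calling perf_event_open(2) directly; error handling and the enable/read steps are trimmed:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr hw, sw;
	int leader, sibling;

	memset(&hw, 0, sizeof(hw));
	hw.size = sizeof(hw);
	hw.type = PERF_TYPE_HARDWARE;
	hw.config = PERF_COUNT_HW_CPU_CYCLES;
	hw.disabled = 1;

	memset(&sw, 0, sizeof(sw));
	sw.size = sizeof(sw);
	sw.type = PERF_TYPE_SOFTWARE;
	sw.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

	/* Hardware group leader on the current task, any cpu. */
	leader = sys_perf_event_open(&hw, 0, -1, -1, 0);

	/* Software sibling joining the hardware group: it ends up on the
	 * leader's context thanks to the special-casing above. */
	sibling = sys_perf_event_open(&sw, 0, -1, leader, 0);

	/* ... enable the group, run the workload, read() the counts ... */

	close(sibling);
	close(leader);
	return 0;
}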
@@ -5211,32 +5717,31 @@ err_fd:
5211 * 5717 *
5212 * @attr: attributes of the counter to create 5718 * @attr: attributes of the counter to create
5213 * @cpu: cpu in which the counter is bound 5719 * @cpu: cpu in which the counter is bound
5214 * @pid: task to profile 5720 * @task: task to profile (NULL for percpu)
5215 */ 5721 */
5216struct perf_event * 5722struct perf_event *
5217perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5723perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5218 pid_t pid, 5724 struct task_struct *task,
5219 perf_overflow_handler_t overflow_handler) 5725 perf_overflow_handler_t overflow_handler)
5220{ 5726{
5221 struct perf_event *event;
5222 struct perf_event_context *ctx; 5727 struct perf_event_context *ctx;
5728 struct perf_event *event;
5223 int err; 5729 int err;
5224 5730
5225 /* 5731 /*
5226 * Get the target context (task or percpu): 5732 * Get the target context (task or percpu):
5227 */ 5733 */
5228 5734
5229 ctx = find_get_context(pid, cpu); 5735 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5230 if (IS_ERR(ctx)) {
5231 err = PTR_ERR(ctx);
5232 goto err_exit;
5233 }
5234
5235 event = perf_event_alloc(attr, cpu, ctx, NULL,
5236 NULL, overflow_handler, GFP_KERNEL);
5237 if (IS_ERR(event)) { 5736 if (IS_ERR(event)) {
5238 err = PTR_ERR(event); 5737 err = PTR_ERR(event);
5239 goto err_put_context; 5738 goto err;
5739 }
5740
5741 ctx = find_get_context(event->pmu, task, cpu);
5742 if (IS_ERR(ctx)) {
5743 err = PTR_ERR(ctx);
5744 goto err_free;
5240 } 5745 }
5241 5746
5242 event->filp = NULL; 5747 event->filp = NULL;
@@ -5254,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5254 5759
5255 return event; 5760 return event;
5256 5761
5257 err_put_context: 5762err_free:
5258 put_ctx(ctx); 5763 free_event(event);
5259 err_exit: 5764err:
5260 return ERR_PTR(err); 5765 return ERR_PTR(err);
5261} 5766}
5262EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5767EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5263 5768
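perf_event_create_kernel_counter() changes in step with the syscall: in-kernel users now pass a struct task_struct * (or NULL for a per-cpu counter) instead of a pid, and the context is looked up only after the event has been allocated, so it lands on the right pmu's context. A hedged in-kernel sketch; the names are hypothetical, the attribute values illustrative, and the handler prototype follows perf_overflow_handler_t as defined in this kernel version:

#include <linux/perf_event.h>
#include <linux/sched.h>

static void demo_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	/* Runs on every overflow; must be NMI-safe. */
}

static struct perf_event *demo_counter_create(struct task_struct *task)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_CPU_CLOCK,
		.size		= sizeof(attr),
		.sample_period	= 1000000,	/* illustrative: 1 ms */
	};

	/* With task == NULL, a real cpu number (rather than -1) would
	 * select a per-cpu counter instead. */
	return perf_event_create_kernel_counter(&attr, -1, task, demo_overflow);
}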
5264/*
5265 * inherit a event from parent task to child task:
5266 */
5267static struct perf_event *
5268inherit_event(struct perf_event *parent_event,
5269 struct task_struct *parent,
5270 struct perf_event_context *parent_ctx,
5271 struct task_struct *child,
5272 struct perf_event *group_leader,
5273 struct perf_event_context *child_ctx)
5274{
5275 struct perf_event *child_event;
5276
5277 /*
5278 * Instead of creating recursive hierarchies of events,
5279 * we link inherited events back to the original parent,
5280 * which has a filp for sure, which we use as the reference
5281 * count:
5282 */
5283 if (parent_event->parent)
5284 parent_event = parent_event->parent;
5285
5286 child_event = perf_event_alloc(&parent_event->attr,
5287 parent_event->cpu, child_ctx,
5288 group_leader, parent_event,
5289 NULL, GFP_KERNEL);
5290 if (IS_ERR(child_event))
5291 return child_event;
5292 get_ctx(child_ctx);
5293
5294 /*
5295 * Make the child state follow the state of the parent event,
5296 * not its attr.disabled bit. We hold the parent's mutex,
5297 * so we won't race with perf_event_{en, dis}able_family.
5298 */
5299 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5300 child_event->state = PERF_EVENT_STATE_INACTIVE;
5301 else
5302 child_event->state = PERF_EVENT_STATE_OFF;
5303
5304 if (parent_event->attr.freq) {
5305 u64 sample_period = parent_event->hw.sample_period;
5306 struct hw_perf_event *hwc = &child_event->hw;
5307
5308 hwc->sample_period = sample_period;
5309 hwc->last_period = sample_period;
5310
5311 local64_set(&hwc->period_left, sample_period);
5312 }
5313
5314 child_event->overflow_handler = parent_event->overflow_handler;
5315
5316 /*
5317 * Link it up in the child's context:
5318 */
5319 add_event_to_ctx(child_event, child_ctx);
5320
5321 /*
5322 * Get a reference to the parent filp - we will fput it
5323 * when the child event exits. This is safe to do because
5324 * we are in the parent and we know that the filp still
5325 * exists and has a nonzero count:
5326 */
5327 atomic_long_inc(&parent_event->filp->f_count);
5328
5329 /*
5330 * Link this into the parent event's child list
5331 */
5332 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5333 mutex_lock(&parent_event->child_mutex);
5334 list_add_tail(&child_event->child_list, &parent_event->child_list);
5335 mutex_unlock(&parent_event->child_mutex);
5336
5337 return child_event;
5338}
5339
5340static int inherit_group(struct perf_event *parent_event,
5341 struct task_struct *parent,
5342 struct perf_event_context *parent_ctx,
5343 struct task_struct *child,
5344 struct perf_event_context *child_ctx)
5345{
5346 struct perf_event *leader;
5347 struct perf_event *sub;
5348 struct perf_event *child_ctr;
5349
5350 leader = inherit_event(parent_event, parent, parent_ctx,
5351 child, NULL, child_ctx);
5352 if (IS_ERR(leader))
5353 return PTR_ERR(leader);
5354 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5355 child_ctr = inherit_event(sub, parent, parent_ctx,
5356 child, leader, child_ctx);
5357 if (IS_ERR(child_ctr))
5358 return PTR_ERR(child_ctr);
5359 }
5360 return 0;
5361}
5362
5363static void sync_child_event(struct perf_event *child_event, 5769static void sync_child_event(struct perf_event *child_event,
5364 struct task_struct *child) 5770 struct task_struct *child)
5365{ 5771{
@@ -5416,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5416 } 5822 }
5417} 5823}
5418 5824
5419/* 5825static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5420 * When a child task exits, feed back event values to parent events.
5421 */
5422void perf_event_exit_task(struct task_struct *child)
5423{ 5826{
5424 struct perf_event *child_event, *tmp; 5827 struct perf_event *child_event, *tmp;
5425 struct perf_event_context *child_ctx; 5828 struct perf_event_context *child_ctx;
5426 unsigned long flags; 5829 unsigned long flags;
5427 5830
5428 if (likely(!child->perf_event_ctxp)) { 5831 if (likely(!child->perf_event_ctxp[ctxn])) {
5429 perf_event_task(child, NULL, 0); 5832 perf_event_task(child, NULL, 0);
5430 return; 5833 return;
5431 } 5834 }
@@ -5437,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child)
5437 * scheduled, so we are now safe from rescheduling changing 5840 * scheduled, so we are now safe from rescheduling changing
5438 * our context. 5841 * our context.
5439 */ 5842 */
5440 child_ctx = child->perf_event_ctxp; 5843 child_ctx = child->perf_event_ctxp[ctxn];
5441 __perf_event_task_sched_out(child_ctx); 5844 task_ctx_sched_out(child_ctx, EVENT_ALL);
5442 5845
5443 /* 5846 /*
5444 * Take the context lock here so that if find_get_context is 5847 * Take the context lock here so that if find_get_context is
@@ -5446,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child)
5446 * incremented the context's refcount before we do put_ctx below. 5849 * incremented the context's refcount before we do put_ctx below.
5447 */ 5850 */
5448 raw_spin_lock(&child_ctx->lock); 5851 raw_spin_lock(&child_ctx->lock);
5449 child->perf_event_ctxp = NULL; 5852 child->perf_event_ctxp[ctxn] = NULL;
5450 /* 5853 /*
5451 * If this context is a clone; unclone it so it can't get 5854 * If this context is a clone; unclone it so it can't get
5452 * swapped to another process while we're removing all 5855 * swapped to another process while we're removing all
@@ -5499,6 +5902,17 @@ again:
5499 put_ctx(child_ctx); 5902 put_ctx(child_ctx);
5500} 5903}
5501 5904
5905/*
5906 * When a child task exits, feed back event values to parent events.
5907 */
5908void perf_event_exit_task(struct task_struct *child)
5909{
5910 int ctxn;
5911
5912 for_each_task_context_nr(ctxn)
5913 perf_event_exit_task_context(child, ctxn);
5914}
5915
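With per-PMU contexts a task no longer carries a single perf_event_ctxp pointer but a small fixed array of them, indexed by context number, and both the exit path above and the init path further down walk that array with for_each_task_context_nr(). The exact definitions live in the perf headers and elsewhere in this patch; the sketch below shows the assumed shape only:

/*
 * Assumed definitions backing the loops above and below; treat the
 * names and values as an approximation, not the authoritative header.
 */
enum {
	perf_hw_context,	/* hardware PMUs share this task context  */
	perf_sw_context,	/* software PMUs share this one           */
	perf_nr_task_contexts,	/* size of task_struct::perf_event_ctxp[] */
};

#define for_each_task_context_nr(ctxn)				\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)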
5502static void perf_free_event(struct perf_event *event, 5916static void perf_free_event(struct perf_event *event,
5503 struct perf_event_context *ctx) 5917 struct perf_event_context *ctx)
5504{ 5918{
@@ -5520,48 +5934,166 @@ static void perf_free_event(struct perf_event *event,
5520 5934
5521/* 5935/*
5522 * free an unexposed, unused context as created by inheritance by 5936 * free an unexposed, unused context as created by inheritance by
5523 * init_task below, used by fork() in case of fail. 5937 * perf_event_init_task below, used by fork() in case of fail.
5524 */ 5938 */
5525void perf_event_free_task(struct task_struct *task) 5939void perf_event_free_task(struct task_struct *task)
5526{ 5940{
5527 struct perf_event_context *ctx = task->perf_event_ctxp; 5941 struct perf_event_context *ctx;
5528 struct perf_event *event, *tmp; 5942 struct perf_event *event, *tmp;
5943 int ctxn;
5529 5944
5530 if (!ctx) 5945 for_each_task_context_nr(ctxn) {
5531 return; 5946 ctx = task->perf_event_ctxp[ctxn];
5947 if (!ctx)
5948 continue;
5532 5949
5533 mutex_lock(&ctx->mutex); 5950 mutex_lock(&ctx->mutex);
5534again: 5951again:
5535 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5952 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5536 perf_free_event(event, ctx); 5953 group_entry)
5954 perf_free_event(event, ctx);
5537 5955
5538 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5956 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5539 group_entry) 5957 group_entry)
5540 perf_free_event(event, ctx); 5958 perf_free_event(event, ctx);
5541 5959
5542 if (!list_empty(&ctx->pinned_groups) || 5960 if (!list_empty(&ctx->pinned_groups) ||
5543 !list_empty(&ctx->flexible_groups)) 5961 !list_empty(&ctx->flexible_groups))
5544 goto again; 5962 goto again;
5545 5963
5546 mutex_unlock(&ctx->mutex); 5964 mutex_unlock(&ctx->mutex);
5547 5965
5548 put_ctx(ctx); 5966 put_ctx(ctx);
5967 }
5968}
5969
5970void perf_event_delayed_put(struct task_struct *task)
5971{
5972 int ctxn;
5973
5974 for_each_task_context_nr(ctxn)
5975 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
5976}
5977
5978/*
5979 * inherit an event from parent task to child task:
5980 */
5981static struct perf_event *
5982inherit_event(struct perf_event *parent_event,
5983 struct task_struct *parent,
5984 struct perf_event_context *parent_ctx,
5985 struct task_struct *child,
5986 struct perf_event *group_leader,
5987 struct perf_event_context *child_ctx)
5988{
5989 struct perf_event *child_event;
5990 unsigned long flags;
5991
5992 /*
5993 * Instead of creating recursive hierarchies of events,
5994 * we link inherited events back to the original parent,
5995 * which has a filp for sure, which we use as the reference
5996 * count:
5997 */
5998 if (parent_event->parent)
5999 parent_event = parent_event->parent;
6000
6001 child_event = perf_event_alloc(&parent_event->attr,
6002 parent_event->cpu,
6003 child,
6004 group_leader, parent_event,
6005 NULL);
6006 if (IS_ERR(child_event))
6007 return child_event;
6008 get_ctx(child_ctx);
6009
6010 /*
6011 * Make the child state follow the state of the parent event,
6012 * not its attr.disabled bit. We hold the parent's mutex,
6013 * so we won't race with perf_event_{en, dis}able_family.
6014 */
6015 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6016 child_event->state = PERF_EVENT_STATE_INACTIVE;
6017 else
6018 child_event->state = PERF_EVENT_STATE_OFF;
6019
6020 if (parent_event->attr.freq) {
6021 u64 sample_period = parent_event->hw.sample_period;
6022 struct hw_perf_event *hwc = &child_event->hw;
6023
6024 hwc->sample_period = sample_period;
6025 hwc->last_period = sample_period;
6026
6027 local64_set(&hwc->period_left, sample_period);
6028 }
6029
6030 child_event->ctx = child_ctx;
6031 child_event->overflow_handler = parent_event->overflow_handler;
6032
6033 /*
6034 * Link it up in the child's context:
6035 */
6036 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6037 add_event_to_ctx(child_event, child_ctx);
6038 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6039
6040 /*
6041 * Get a reference to the parent filp - we will fput it
6042 * when the child event exits. This is safe to do because
6043 * we are in the parent and we know that the filp still
6044 * exists and has a nonzero count:
6045 */
6046 atomic_long_inc(&parent_event->filp->f_count);
6047
6048 /*
6049 * Link this into the parent event's child list
6050 */
6051 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6052 mutex_lock(&parent_event->child_mutex);
6053 list_add_tail(&child_event->child_list, &parent_event->child_list);
6054 mutex_unlock(&parent_event->child_mutex);
6055
6056 return child_event;
6057}
6058
6059static int inherit_group(struct perf_event *parent_event,
6060 struct task_struct *parent,
6061 struct perf_event_context *parent_ctx,
6062 struct task_struct *child,
6063 struct perf_event_context *child_ctx)
6064{
6065 struct perf_event *leader;
6066 struct perf_event *sub;
6067 struct perf_event *child_ctr;
6068
6069 leader = inherit_event(parent_event, parent, parent_ctx,
6070 child, NULL, child_ctx);
6071 if (IS_ERR(leader))
6072 return PTR_ERR(leader);
6073 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6074 child_ctr = inherit_event(sub, parent, parent_ctx,
6075 child, leader, child_ctx);
6076 if (IS_ERR(child_ctr))
6077 return PTR_ERR(child_ctr);
6078 }
6079 return 0;
5549} 6080}
5550 6081
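Inheritance is driven by the inherit bit in the event attributes: a counter opened on a task with attr.inherit set is duplicated into each child's context by inherit_group()/inherit_event() above whenever that task forks. A hedged userspace sketch of opening such a counter through the raw syscall (error handling kept minimal):

#include <string.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>

/*
 * Open a task-wide instruction counter that is inherited by children.
 * pid selects the task to profile; cpu == -1 means "any CPU".
 */
static int open_inherited_counter(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.size = sizeof(attr);
	attr.inherit = 1;	/* fork()ed children are counted too */
	attr.disabled = 1;	/* enable later via PERF_EVENT_IOC_ENABLE */

	return syscall(__NR_perf_event_open, &attr, pid, -1 /* cpu */,
		       -1 /* group_fd */, 0 /* flags */);
}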
5551static int 6082static int
5552inherit_task_group(struct perf_event *event, struct task_struct *parent, 6083inherit_task_group(struct perf_event *event, struct task_struct *parent,
5553 struct perf_event_context *parent_ctx, 6084 struct perf_event_context *parent_ctx,
5554 struct task_struct *child, 6085 struct task_struct *child, int ctxn,
5555 int *inherited_all) 6086 int *inherited_all)
5556{ 6087{
5557 int ret; 6088 int ret;
5558 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6089 struct perf_event_context *child_ctx;
5559 6090
5560 if (!event->attr.inherit) { 6091 if (!event->attr.inherit) {
5561 *inherited_all = 0; 6092 *inherited_all = 0;
5562 return 0; 6093 return 0;
5563 } 6094 }
5564 6095
6096 child_ctx = child->perf_event_ctxp[ctxn];
5565 if (!child_ctx) { 6097 if (!child_ctx) {
5566 /* 6098 /*
5567 * This is executed from the parent task context, so 6099 * This is executed from the parent task context, so
@@ -5570,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5570 * child. 6102 * child.
5571 */ 6103 */
5572 6104
5573 child_ctx = kzalloc(sizeof(struct perf_event_context), 6105 child_ctx = alloc_perf_context(event->pmu, child);
5574 GFP_KERNEL);
5575 if (!child_ctx) 6106 if (!child_ctx)
5576 return -ENOMEM; 6107 return -ENOMEM;
5577 6108
5578 __perf_event_init_context(child_ctx, child); 6109 child->perf_event_ctxp[ctxn] = child_ctx;
5579 child->perf_event_ctxp = child_ctx;
5580 get_task_struct(child);
5581 } 6110 }
5582 6111
5583 ret = inherit_group(event, parent, parent_ctx, 6112 ret = inherit_group(event, parent, parent_ctx,
@@ -5589,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5589 return ret; 6118 return ret;
5590} 6119}
5591 6120
5592
5593/* 6121/*
5594 * Initialize the perf_event context in task_struct 6122 * Initialize the perf_event context in task_struct
5595 */ 6123 */
5596int perf_event_init_task(struct task_struct *child) 6124int perf_event_init_context(struct task_struct *child, int ctxn)
5597{ 6125{
5598 struct perf_event_context *child_ctx, *parent_ctx; 6126 struct perf_event_context *child_ctx, *parent_ctx;
5599 struct perf_event_context *cloned_ctx; 6127 struct perf_event_context *cloned_ctx;
@@ -5602,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child)
5602 int inherited_all = 1; 6130 int inherited_all = 1;
5603 int ret = 0; 6131 int ret = 0;
5604 6132
5605 child->perf_event_ctxp = NULL; 6133 child->perf_event_ctxp[ctxn] = NULL;
5606 6134
5607 mutex_init(&child->perf_event_mutex); 6135 mutex_init(&child->perf_event_mutex);
5608 INIT_LIST_HEAD(&child->perf_event_list); 6136 INIT_LIST_HEAD(&child->perf_event_list);
5609 6137
5610 if (likely(!parent->perf_event_ctxp)) 6138 if (likely(!parent->perf_event_ctxp[ctxn]))
5611 return 0; 6139 return 0;
5612 6140
5613 /* 6141 /*
5614 * If the parent's context is a clone, pin it so it won't get 6142 * If the parent's context is a clone, pin it so it won't get
5615 * swapped under us. 6143 * swapped under us.
5616 */ 6144 */
5617 parent_ctx = perf_pin_task_context(parent); 6145 parent_ctx = perf_pin_task_context(parent, ctxn);
5618 6146
5619 /* 6147 /*
5620 * No need to check if parent_ctx != NULL here; since we saw 6148 * No need to check if parent_ctx != NULL here; since we saw
@@ -5634,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child)
5634 * the list, not manipulating it: 6162 * the list, not manipulating it:
5635 */ 6163 */
5636 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6164 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5637 ret = inherit_task_group(event, parent, parent_ctx, child, 6165 ret = inherit_task_group(event, parent, parent_ctx,
5638 &inherited_all); 6166 child, ctxn, &inherited_all);
5639 if (ret) 6167 if (ret)
5640 break; 6168 break;
5641 } 6169 }
5642 6170
5643 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6171 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5644 ret = inherit_task_group(event, parent, parent_ctx, child, 6172 ret = inherit_task_group(event, parent, parent_ctx,
5645 &inherited_all); 6173 child, ctxn, &inherited_all);
5646 if (ret) 6174 if (ret)
5647 break; 6175 break;
5648 } 6176 }
5649 6177
5650 child_ctx = child->perf_event_ctxp; 6178 child_ctx = child->perf_event_ctxp[ctxn];
5651 6179
5652 if (child_ctx && inherited_all) { 6180 if (child_ctx && inherited_all) {
5653 /* 6181 /*
@@ -5676,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child)
5676 return ret; 6204 return ret;
5677} 6205}
5678 6206
6207/*
6208 * Initialize the perf_event context in task_struct
6209 */
6210int perf_event_init_task(struct task_struct *child)
6211{
6212 int ctxn, ret;
6213
6214 for_each_task_context_nr(ctxn) {
6215 ret = perf_event_init_context(child, ctxn);
6216 if (ret)
6217 return ret;
6218 }
6219
6220 return 0;
6221}
6222
5679static void __init perf_event_init_all_cpus(void) 6223static void __init perf_event_init_all_cpus(void)
5680{ 6224{
6225 struct swevent_htable *swhash;
5681 int cpu; 6226 int cpu;
5682 struct perf_cpu_context *cpuctx;
5683 6227
5684 for_each_possible_cpu(cpu) { 6228 for_each_possible_cpu(cpu) {
5685 cpuctx = &per_cpu(perf_cpu_context, cpu); 6229 swhash = &per_cpu(swevent_htable, cpu);
5686 mutex_init(&cpuctx->hlist_mutex); 6230 mutex_init(&swhash->hlist_mutex);
5687 __perf_event_init_context(&cpuctx->ctx, NULL); 6231 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5688 } 6232 }
5689} 6233}
5690 6234
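perf_event_init_all_cpus() now only prepares the per-CPU software-event hash table and the rotation list; the heavyweight per-CPU contexts are created when each PMU registers. From its uses in this section, the per-CPU structure looks roughly like the sketch below (the field set is inferred from the accesses here, so treat it as an approximation):

/*
 * Approximate shape of the per-CPU state referenced as swevent_htable
 * above; inferred from swhash->hlist_mutex, ->hlist_refcount and
 * ->swevent_hlist.
 */
struct swevent_htable {
	struct swevent_hlist	*swevent_hlist;	/* RCU-published hash buckets  */
	struct mutex		hlist_mutex;	/* guards allocation / release */
	int			hlist_refcount;	/* users that need the buckets */
};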
5691static void __cpuinit perf_event_init_cpu(int cpu) 6235static void __cpuinit perf_event_init_cpu(int cpu)
5692{ 6236{
5693 struct perf_cpu_context *cpuctx; 6237 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5694 6238
5695 cpuctx = &per_cpu(perf_cpu_context, cpu); 6239 mutex_lock(&swhash->hlist_mutex);
5696 6240 if (swhash->hlist_refcount > 0) {
5697 spin_lock(&perf_resource_lock);
5698 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5699 spin_unlock(&perf_resource_lock);
5700
5701 mutex_lock(&cpuctx->hlist_mutex);
5702 if (cpuctx->hlist_refcount > 0) {
5703 struct swevent_hlist *hlist; 6241 struct swevent_hlist *hlist;
5704 6242
5705 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6243 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5706 WARN_ON_ONCE(!hlist); 6244 WARN_ON(!hlist);
5707 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6245 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5708 } 6246 }
5709 mutex_unlock(&cpuctx->hlist_mutex); 6247 mutex_unlock(&swhash->hlist_mutex);
5710} 6248}
5711 6249
5712#ifdef CONFIG_HOTPLUG_CPU 6250#ifdef CONFIG_HOTPLUG_CPU
5713static void __perf_event_exit_cpu(void *info) 6251static void perf_pmu_rotate_stop(struct pmu *pmu)
5714{ 6252{
5715 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6253 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5716 struct perf_event_context *ctx = &cpuctx->ctx; 6254
6255 WARN_ON(!irqs_disabled());
6256
6257 list_del_init(&cpuctx->rotation_list);
6258}
6259
6260static void __perf_event_exit_context(void *__info)
6261{
6262 struct perf_event_context *ctx = __info;
5717 struct perf_event *event, *tmp; 6263 struct perf_event *event, *tmp;
5718 6264
6265 perf_pmu_rotate_stop(ctx->pmu);
6266
5719 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6267 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5720 __perf_event_remove_from_context(event); 6268 __perf_event_remove_from_context(event);
5721 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6269 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5722 __perf_event_remove_from_context(event); 6270 __perf_event_remove_from_context(event);
5723} 6271}
6272
6273static void perf_event_exit_cpu_context(int cpu)
6274{
6275 struct perf_event_context *ctx;
6276 struct pmu *pmu;
6277 int idx;
6278
6279 idx = srcu_read_lock(&pmus_srcu);
6280 list_for_each_entry_rcu(pmu, &pmus, entry) {
6281 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6282
6283 mutex_lock(&ctx->mutex);
6284 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6285 mutex_unlock(&ctx->mutex);
6286 }
6287 srcu_read_unlock(&pmus_srcu, idx);
6288}
6289
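perf_event_exit_cpu_context() walks the global pmus list under SRCU (so PMU unregistration can wait out readers while the readers themselves are allowed to sleep) and then runs the actual teardown on the dying CPU via smp_call_function_single(). A self-contained sketch of that SRCU-protected list-walk pattern, with placeholder item and callback names:

#include <linux/rculist.h>
#include <linux/srcu.h>

struct my_item {
	struct list_head entry;
};

/*
 * Placeholder sketch: visit every item on an SRCU-protected list.
 * Writers add/remove items with list_add_rcu()/list_del_rcu() and
 * call synchronize_srcu() before freeing, mirroring the pmus list.
 */
static void visit_items(struct list_head *items, struct srcu_struct *sp,
			void (*visit)(struct my_item *item))
{
	struct my_item *item;
	int idx;

	idx = srcu_read_lock(sp);	/* read side may sleep, unlike plain RCU */
	list_for_each_entry_rcu(item, items, entry)
		visit(item);
	srcu_read_unlock(sp, idx);
}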
5724static void perf_event_exit_cpu(int cpu) 6290static void perf_event_exit_cpu(int cpu)
5725{ 6291{
5726 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6292 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5727 struct perf_event_context *ctx = &cpuctx->ctx;
5728 6293
5729 mutex_lock(&cpuctx->hlist_mutex); 6294 mutex_lock(&swhash->hlist_mutex);
5730 swevent_hlist_release(cpuctx); 6295 swevent_hlist_release(swhash);
5731 mutex_unlock(&cpuctx->hlist_mutex); 6296 mutex_unlock(&swhash->hlist_mutex);
5732 6297
5733 mutex_lock(&ctx->mutex); 6298 perf_event_exit_cpu_context(cpu);
5734 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5735 mutex_unlock(&ctx->mutex);
5736} 6299}
5737#else 6300#else
5738static inline void perf_event_exit_cpu(int cpu) { } 6301static inline void perf_event_exit_cpu(int cpu) { }
@@ -5743,15 +6306,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5743{ 6306{
5744 unsigned int cpu = (long)hcpu; 6307 unsigned int cpu = (long)hcpu;
5745 6308
5746 switch (action) { 6309 switch (action & ~CPU_TASKS_FROZEN) {
5747 6310
5748 case CPU_UP_PREPARE: 6311 case CPU_UP_PREPARE:
5749 case CPU_UP_PREPARE_FROZEN: 6312 case CPU_DOWN_FAILED:
5750 perf_event_init_cpu(cpu); 6313 perf_event_init_cpu(cpu);
5751 break; 6314 break;
5752 6315
6316 case CPU_UP_CANCELED:
5753 case CPU_DOWN_PREPARE: 6317 case CPU_DOWN_PREPARE:
5754 case CPU_DOWN_PREPARE_FROZEN:
5755 perf_event_exit_cpu(cpu); 6318 perf_event_exit_cpu(cpu);
5756 break; 6319 break;
5757 6320
@@ -5762,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5762 return NOTIFY_OK; 6325 return NOTIFY_OK;
5763} 6326}
5764 6327
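Masking the action with ~CPU_TASKS_FROZEN lets one case statement cover both the normal and the suspend/resume (_FROZEN) variants of each notification, and treating CPU_DOWN_FAILED as a bring-up (and CPU_UP_CANCELED as a teardown) keeps per-CPU state consistent when a hotplug transition aborts halfway. A stripped-down sketch of the same notifier idiom, with hypothetical setup/teardown hooks standing in for the perf-specific ones:

#include <linux/cpu.h>
#include <linux/notifier.h>

/* Placeholder hooks standing in for the real per-CPU setup/teardown. */
static void my_cpu_up(unsigned int cpu) { }
static void my_cpu_down(unsigned int cpu) { }

static int my_cpu_notify(struct notifier_block *self, unsigned long action,
			 void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {	/* fold _FROZEN variants in */
	case CPU_UP_PREPARE:
	case CPU_DOWN_FAILED:			/* CPU stays up: re-init    */
		my_cpu_up(cpu);
		break;
	case CPU_UP_CANCELED:			/* CPU never came up: undo  */
	case CPU_DOWN_PREPARE:
		my_cpu_down(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}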
5765/*
5766 * This has to have a higher priority than migration_notifier in sched.c.
5767 */
5768static struct notifier_block __cpuinitdata perf_cpu_nb = {
5769 .notifier_call = perf_cpu_notify,
5770 .priority = 20,
5771};
5772
5773void __init perf_event_init(void) 6328void __init perf_event_init(void)
5774{ 6329{
5775 perf_event_init_all_cpus(); 6330 perf_event_init_all_cpus();
5776 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6331 init_srcu_struct(&pmus_srcu);
5777 (void *)(long)smp_processor_id()); 6332 perf_pmu_register(&perf_swevent);
5778 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6333 perf_pmu_register(&perf_cpu_clock);
5779 (void *)(long)smp_processor_id()); 6334 perf_pmu_register(&perf_task_clock);
5780 register_cpu_notifier(&perf_cpu_nb); 6335 perf_tp_register();
5781} 6336 perf_cpu_notifier(perf_cpu_notify);
5782
5783static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5784 struct sysdev_class_attribute *attr,
5785 char *buf)
5786{
5787 return sprintf(buf, "%d\n", perf_reserved_percpu);
5788}
5789
5790static ssize_t
5791perf_set_reserve_percpu(struct sysdev_class *class,
5792 struct sysdev_class_attribute *attr,
5793 const char *buf,
5794 size_t count)
5795{
5796 struct perf_cpu_context *cpuctx;
5797 unsigned long val;
5798 int err, cpu, mpt;
5799
5800 err = strict_strtoul(buf, 10, &val);
5801 if (err)
5802 return err;
5803 if (val > perf_max_events)
5804 return -EINVAL;
5805
5806 spin_lock(&perf_resource_lock);
5807 perf_reserved_percpu = val;
5808 for_each_online_cpu(cpu) {
5809 cpuctx = &per_cpu(perf_cpu_context, cpu);
5810 raw_spin_lock_irq(&cpuctx->ctx.lock);
5811 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5812 perf_max_events - perf_reserved_percpu);
5813 cpuctx->max_pertask = mpt;
5814 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5815 }
5816 spin_unlock(&perf_resource_lock);
5817
5818 return count;
5819}
5820
5821static ssize_t perf_show_overcommit(struct sysdev_class *class,
5822 struct sysdev_class_attribute *attr,
5823 char *buf)
5824{
5825 return sprintf(buf, "%d\n", perf_overcommit);
5826}
5827
5828static ssize_t
5829perf_set_overcommit(struct sysdev_class *class,
5830 struct sysdev_class_attribute *attr,
5831 const char *buf, size_t count)
5832{
5833 unsigned long val;
5834 int err;
5835
5836 err = strict_strtoul(buf, 10, &val);
5837 if (err)
5838 return err;
5839 if (val > 1)
5840 return -EINVAL;
5841
5842 spin_lock(&perf_resource_lock);
5843 perf_overcommit = val;
5844 spin_unlock(&perf_resource_lock);
5845
5846 return count;
5847}
5848
5849static SYSDEV_CLASS_ATTR(
5850 reserve_percpu,
5851 0644,
5852 perf_show_reserve_percpu,
5853 perf_set_reserve_percpu
5854 );
5855
5856static SYSDEV_CLASS_ATTR(
5857 overcommit,
5858 0644,
5859 perf_show_overcommit,
5860 perf_set_overcommit
5861 );
5862
5863static struct attribute *perfclass_attrs[] = {
5864 &attr_reserve_percpu.attr,
5865 &attr_overcommit.attr,
5866 NULL
5867};
5868
5869static struct attribute_group perfclass_attr_group = {
5870 .attrs = perfclass_attrs,
5871 .name = "perf_events",
5872};
5873
5874static int __init perf_event_sysfs_init(void)
5875{
5876 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5877 &perfclass_attr_group);
5878} 6337}
5879device_initcall(perf_event_sysfs_init);