author     Linus Torvalds <torvalds@linux-foundation.org>  2010-10-21 15:54:49 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2010-10-21 15:54:49 -0400
commit     5d70f79b5ef6ea2de4f72a37b2d96e2601e40a22 (patch)
tree       a0d6de0930ba83ecf4629c2e2e261f5eaa2d8f33 /kernel
parent     888a6f77e0418b049f83d37547c209b904d30af4 (diff)
parent     750ed158bf6c782d2813da1bca2c824365a0b777 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (163 commits)
  tracing: Fix compile issue for trace_sched_wakeup.c
  [S390] hardirq: remove pointless header file includes
  [IA64] Move local_softirq_pending() definition
  perf, powerpc: Fix power_pmu_event_init to not use event->ctx
  ftrace: Remove recursion between recordmcount and scripts/mod/empty
  jump_label: Add COND_STMT(), reducer wrappery
  perf: Optimize sw events
  perf: Use jump_labels to optimize the scheduler hooks
  jump_label: Add atomic_t interface
  jump_label: Use more consistent naming
  perf, hw_breakpoint: Fix crash in hw_breakpoint creation
  perf: Find task before event alloc
  perf: Fix task refcount bugs
  perf: Fix group moving
  irq_work: Add generic hardirq context callbacks
  perf_events: Fix transaction recovery in group_sched_in()
  perf_events: Fix bogus AMD64 generic TLB events
  perf_events: Fix bogus context time tracking
  tracing: Remove parent recording in latency tracer graph options
  tracing: Use one prologue for the preempt irqs off tracer function tracers
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                          4
-rw-r--r--  kernel/exit.c                            4
-rw-r--r--  kernel/hw_breakpoint.c                  75
-rw-r--r--  kernel/irq_work.c                      164
-rw-r--r--  kernel/jump_label.c                    429
-rw-r--r--  kernel/kprobes.c                        26
-rw-r--r--  kernel/module.c                          6
-rw-r--r--  kernel/perf_event.c                   2592
-rw-r--r--  kernel/sched.c                           2
-rw-r--r--  kernel/test_kprobes.c                   12
-rw-r--r--  kernel/timer.c                           7
-rw-r--r--  kernel/trace/Kconfig                     5
-rw-r--r--  kernel/trace/ftrace.c                  127
-rw-r--r--  kernel/trace/ring_buffer.c              21
-rw-r--r--  kernel/trace/trace.c                     2
-rw-r--r--  kernel/trace/trace.h                     4
-rw-r--r--  kernel/trace/trace_event_perf.c         28
-rw-r--r--  kernel/trace/trace_events.c             55
-rw-r--r--  kernel/trace/trace_functions_graph.c   209
-rw-r--r--  kernel/trace/trace_irqsoff.c           152
-rw-r--r--  kernel/trace/trace_sched_wakeup.c      256
-rw-r--r--  kernel/trace/trace_workqueue.c          10
-rw-r--r--  kernel/tracepoint.c                     14
-rw-r--r--  kernel/watchdog.c                       41
24 files changed, 2876 insertions, 1369 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 17046b6e7c90..e2c9d52cfe9e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o 14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 15obj-y += groups.o
16 16
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 23CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 24CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 25CFLAGS_REMOVE_perf_event.o = -pg
26CFLAGS_REMOVE_irq_work.o = -pg
26endif 27endif
27 28
28obj-$(CONFIG_FREEZER) += freezer.o 29obj-$(CONFIG_FREEZER) += freezer.o
@@ -101,6 +102,7 @@ obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_X86_DS) += trace/ 102obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 103obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
105obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db28..e2bdf37f9fde 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
149{ 149{
150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
151 151
152#ifdef CONFIG_PERF_EVENTS 152 perf_event_delayed_put(tsk);
153 WARN_ON_ONCE(tsk->perf_event_ctxp);
154#endif
155 trace_sched_process_free(tsk); 153 trace_sched_process_free(tsk);
156 put_task_struct(tsk); 154 put_task_struct(tsk);
157} 155}
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c7c2aed9e2dc..2c9120f0afca 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct perf_event_context *ctx = bp->ctx; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->ctx == ctx && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
122 count += hw_breakpoint_weight(iter); 122 count += hw_breakpoint_weight(iter);
123 } 123 }
124 124
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
134 enum bp_type_idx type) 134 enum bp_type_idx type)
135{ 135{
136 int cpu = bp->cpu; 136 int cpu = bp->cpu;
137 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->hw.bp_target;
138 138
139 if (cpu >= 0) { 139 if (cpu >= 0) {
140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
213 int weight) 213 int weight)
214{ 214{
215 int cpu = bp->cpu; 215 int cpu = bp->cpu;
216 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->hw.bp_target;
217 217
218 /* Pinned counter cpu profiling */ 218 /* Pinned counter cpu profiling */
219 if (!tsk) { 219 if (!tsk) {
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 struct task_struct *tsk) 434 struct task_struct *tsk)
435{ 435{
436 return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
437 triggered);
438} 437}
439EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
440 439
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
516 get_online_cpus(); 515 get_online_cpus();
517 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
518 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
519 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
520 519
521 *pevent = bp; 520 *pevent = bp;
522 521
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
566 .priority = 0x7fffffff 565 .priority = 0x7fffffff
567}; 566};
568 567
568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
569static int __init init_hw_breakpoint(void) 623static int __init init_hw_breakpoint(void)
570{ 624{
571 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
587 641
588 constraints_initialized = 1; 642 constraints_initialized = 1;
589 643
644 perf_pmu_register(&perf_breakpoint);
645
590 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
591 647
592 err_alloc: 648 err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
602core_initcall(init_hw_breakpoint); 658core_initcall(init_hw_breakpoint);
603 659
604 660
605struct pmu perf_ops_bp = {
606 .enable = arch_install_hw_breakpoint,
607 .disable = arch_uninstall_hw_breakpoint,
608 .read = hw_breakpoint_pmu_read,
609};
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..f16763ff8481
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
50/*
51 * Claim the entry so that no one else will poke at it.
52 */
53static bool irq_work_claim(struct irq_work *entry)
54{
55 struct irq_work *next, *nflags;
56
57 do {
58 next = entry->next;
59 if ((unsigned long)next & IRQ_WORK_PENDING)
60 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS);
62 } while (cmpxchg(&entry->next, next, nflags) != next);
63
64 return true;
65}
66
67
68void __weak arch_irq_work_raise(void)
69{
70 /*
71 * Lame architectures will get the timer tick callback
72 */
73}
74
75/*
76 * Queue the entry and raise the IPI if needed.
77 */
78static void __irq_work_queue(struct irq_work *entry)
79{
80 struct irq_work **head, *next;
81
82 head = &get_cpu_var(irq_work_list);
83
84 do {
85 next = *head;
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry))
92 arch_irq_work_raise();
93
94 put_cpu_var(irq_work_list);
95}
96
97/*
98 * Enqueue the irq_work @entry, returns true on success, failure when the
99 * @entry was already enqueued by someone else.
100 *
101 * Can be re-enqueued while the callback is still in progress.
102 */
103bool irq_work_queue(struct irq_work *entry)
104{
105 if (!irq_work_claim(entry)) {
106 /*
107 * Already enqueued, can't do!
108 */
109 return false;
110 }
111
112 __irq_work_queue(entry);
113 return true;
114}
115EXPORT_SYMBOL_GPL(irq_work_queue);
116
117/*
118 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
119 * context with local IRQs disabled.
120 */
121void irq_work_run(void)
122{
123 struct irq_work *list, **head;
124
125 head = &__get_cpu_var(irq_work_list);
126 if (*head == NULL)
127 return;
128
129 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled());
131
132 list = xchg(head, NULL);
133 while (list != NULL) {
134 struct irq_work *entry = list;
135
136 list = irq_work_next(list);
137
138 /*
139 * Clear the PENDING bit, after this point the @entry
140 * can be re-used.
141 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
143 entry->func(entry);
144 /*
145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile.
147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
149 }
150}
151EXPORT_SYMBOL_GPL(irq_work_run);
152
153/*
154 * Synchronize against the irq_work @entry, ensures the entry is not
155 * currently in use.
156 */
157void irq_work_sync(struct irq_work *entry)
158{
159 WARN_ON_ONCE(irqs_disabled());
160
161 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
162 cpu_relax();
163}
164EXPORT_SYMBOL_GPL(irq_work_sync);
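
[Editor's sketch, not part of the patch: a minimal example of how a caller might use the irq_work API added in the new file above. The callback and caller names below are hypothetical; only struct irq_work, irq_work_queue(), irq_work_run() and irq_work_sync() come from the patch. The point is that enqueueing is NMI-safe and the callback later runs from hardirq context, raised via arch_irq_work_raise() or, on "lame" architectures, the timer tick.]

#include <linux/kernel.h>
#include <linux/irq_work.h>

/* Runs in hardirq context with IRQs disabled (see irq_work_run() above). */
static void my_irq_work_func(struct irq_work *work)
{
	pr_info("irq_work callback ran\n");
}

/* .next left NULL with flags 0: the "free" state from the table above. */
static struct irq_work my_irq_work = {
	.func = my_irq_work_func,
};

static void poke_from_nmi(void)
{
	/* NMI-safe; returns false if the entry is already pending. */
	if (!irq_work_queue(&my_irq_work))
		return;
}

static void teardown(void)
{
	/* Wait for a possibly still-running callback before reusing/freeing. */
	irq_work_sync(&my_irq_work);
}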
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..7be868bf25c6
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h>
14#include <linux/sort.h>
15#include <linux/err.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex);
25
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42static int jump_label_cmp(const void *a, const void *b)
43{
44 const struct jump_entry *jea = a;
45 const struct jump_entry *jeb = b;
46
47 if (jea->key < jeb->key)
48 return -1;
49
50 if (jea->key > jeb->key)
51 return 1;
52
53 return 0;
54}
55
56static void
57sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
58{
59 unsigned long size;
60
61 size = (((unsigned long)stop - (unsigned long)start)
62 / sizeof(struct jump_entry));
63 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
64}
65
66static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
67{
68 struct hlist_head *head;
69 struct hlist_node *node;
70 struct jump_label_entry *e;
71 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
72
73 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
74 hlist_for_each_entry(e, node, head, hlist) {
75 if (key == e->key)
76 return e;
77 }
78 return NULL;
79}
80
81static struct jump_label_entry *
82add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
83{
84 struct hlist_head *head;
85 struct jump_label_entry *e;
86 u32 hash;
87
88 e = get_jump_label_entry(key);
89 if (e)
90 return ERR_PTR(-EEXIST);
91
92 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
93 if (!e)
94 return ERR_PTR(-ENOMEM);
95
96 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
97 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
98 e->key = key;
99 e->table = table;
100 e->nr_entries = nr_entries;
101 INIT_HLIST_HEAD(&(e->modules));
102 hlist_add_head(&e->hlist, head);
103 return e;
104}
105
106static int
107build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
108{
109 struct jump_entry *iter, *iter_begin;
110 struct jump_label_entry *entry;
111 int count;
112
113 sort_jump_label_entries(start, stop);
114 iter = start;
115 while (iter < stop) {
116 entry = get_jump_label_entry(iter->key);
117 if (!entry) {
118 iter_begin = iter;
119 count = 0;
120 while ((iter < stop) &&
121 (iter->key == iter_begin->key)) {
122 iter++;
123 count++;
124 }
125 entry = add_jump_label_entry(iter_begin->key,
126 count, iter_begin);
127 if (IS_ERR(entry))
128 return PTR_ERR(entry);
129 } else {
130 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
131 return -1;
132 }
133 }
134 return 0;
135}
136
137/***
138 * jump_label_update - update jump label text
139 * @key - key value associated with a a jump label
140 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
141 *
142 * Will enable/disable the jump for jump label @key, depending on the
143 * value of @type.
144 *
145 */
146
147void jump_label_update(unsigned long key, enum jump_label_type type)
148{
149 struct jump_entry *iter;
150 struct jump_label_entry *entry;
151 struct hlist_node *module_node;
152 struct jump_label_module_entry *e_module;
153 int count;
154
155 mutex_lock(&jump_label_mutex);
156 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) {
158 count = entry->nr_entries;
159 iter = entry->table;
160 while (count--) {
161 if (kernel_text_address(iter->code))
162 arch_jump_label_transform(iter, type);
163 iter++;
164 }
165 /* eanble/disable jump labels in modules */
166 hlist_for_each_entry(e_module, module_node, &(entry->modules),
167 hlist) {
168 count = e_module->nr_entries;
169 iter = e_module->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 }
176 }
177 mutex_unlock(&jump_label_mutex);
178}
179
180static int addr_conflict(struct jump_entry *entry, void *start, void *end)
181{
182 if (entry->code <= (unsigned long)end &&
183 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
184 return 1;
185
186 return 0;
187}
188
189#ifdef CONFIG_MODULES
190
191static int module_conflict(void *start, void *end)
192{
193 struct hlist_head *head;
194 struct hlist_node *node, *node_next, *module_node, *module_node_next;
195 struct jump_label_entry *e;
196 struct jump_label_module_entry *e_module;
197 struct jump_entry *iter;
198 int i, count;
199 int conflict = 0;
200
201 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
202 head = &jump_label_table[i];
203 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
204 hlist_for_each_entry_safe(e_module, module_node,
205 module_node_next,
206 &(e->modules), hlist) {
207 count = e_module->nr_entries;
208 iter = e_module->table;
209 while (count--) {
210 if (addr_conflict(iter, start, end)) {
211 conflict = 1;
212 goto out;
213 }
214 iter++;
215 }
216 }
217 }
218 }
219out:
220 return conflict;
221}
222
223#endif
224
225/***
226 * jump_label_text_reserved - check if addr range is reserved
227 * @start: start text addr
228 * @end: end text addr
229 *
230 * checks if the text addr located between @start and @end
231 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses.
234 *
235 * returns 1 if there is an overlap, 0 otherwise
236 */
237int jump_label_text_reserved(void *start, void *end)
238{
239 struct jump_entry *iter;
240 struct jump_entry *iter_start = __start___jump_table;
241 struct jump_entry *iter_stop = __start___jump_table;
242 int conflict = 0;
243
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start;
246 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) {
248 conflict = 1;
249 goto out;
250 }
251 iter++;
252 }
253
254 /* now check modules */
255#ifdef CONFIG_MODULES
256 conflict = module_conflict(start, end);
257#endif
258out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict;
261}
262
263static __init int init_jump_label(void)
264{
265 int ret;
266 struct jump_entry *iter_start = __start___jump_table;
267 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter;
269
270 mutex_lock(&jump_label_mutex);
271 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table);
273 iter = iter_start;
274 while (iter < iter_stop) {
275 arch_jump_label_text_poke_early(iter->code);
276 iter++;
277 }
278 mutex_unlock(&jump_label_mutex);
279 return ret;
280}
281early_initcall(init_jump_label);
282
283#ifdef CONFIG_MODULES
284
285static struct jump_label_module_entry *
286add_jump_label_module_entry(struct jump_label_entry *entry,
287 struct jump_entry *iter_begin,
288 int count, struct module *mod)
289{
290 struct jump_label_module_entry *e;
291
292 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
293 if (!e)
294 return ERR_PTR(-ENOMEM);
295 e->mod = mod;
296 e->nr_entries = count;
297 e->table = iter_begin;
298 hlist_add_head(&e->hlist, &entry->modules);
299 return e;
300}
301
302static int add_jump_label_module(struct module *mod)
303{
304 struct jump_entry *iter, *iter_begin;
305 struct jump_label_entry *entry;
306 struct jump_label_module_entry *module_entry;
307 int count;
308
309 /* if the module doesn't have jump label entries, just return */
310 if (!mod->num_jump_entries)
311 return 0;
312
313 sort_jump_label_entries(mod->jump_entries,
314 mod->jump_entries + mod->num_jump_entries);
315 iter = mod->jump_entries;
316 while (iter < mod->jump_entries + mod->num_jump_entries) {
317 entry = get_jump_label_entry(iter->key);
318 iter_begin = iter;
319 count = 0;
320 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
321 (iter->key == iter_begin->key)) {
322 iter++;
323 count++;
324 }
325 if (!entry) {
326 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
327 if (IS_ERR(entry))
328 return PTR_ERR(entry);
329 }
330 module_entry = add_jump_label_module_entry(entry, iter_begin,
331 count, mod);
332 if (IS_ERR(module_entry))
333 return PTR_ERR(module_entry);
334 }
335 return 0;
336}
337
338static void remove_jump_label_module(struct module *mod)
339{
340 struct hlist_head *head;
341 struct hlist_node *node, *node_next, *module_node, *module_node_next;
342 struct jump_label_entry *e;
343 struct jump_label_module_entry *e_module;
344 int i;
345
346 /* if the module doesn't have jump label entries, just return */
347 if (!mod->num_jump_entries)
348 return;
349
350 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
351 head = &jump_label_table[i];
352 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
353 hlist_for_each_entry_safe(e_module, module_node,
354 module_node_next,
355 &(e->modules), hlist) {
356 if (e_module->mod == mod) {
357 hlist_del(&e_module->hlist);
358 kfree(e_module);
359 }
360 }
361 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
362 hlist_del(&e->hlist);
363 kfree(e);
364 }
365 }
366 }
367}
368
369static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data)
372{
373 struct module *mod = data;
374 int ret = 0;
375
376 switch (val) {
377 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex);
379 ret = add_jump_label_module(mod);
380 if (ret)
381 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex);
383 break;
384 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex);
386 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex);
388 break;
389 }
390 return ret;
391}
392
393/***
394 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
395 * @mod: module to patch
396 *
397 * Allow for run-time selection of the optimal nops. Before the module
398 * loads patch these with arch_get_jump_label_nop(), which is specified by
399 * the arch specific jump label code.
400 */
401void jump_label_apply_nops(struct module *mod)
402{
403 struct jump_entry *iter;
404
405 /* if the module doesn't have jump label entries, just return */
406 if (!mod->num_jump_entries)
407 return;
408
409 iter = mod->jump_entries;
410 while (iter < mod->jump_entries + mod->num_jump_entries) {
411 arch_jump_label_text_poke_early(iter->code);
412 iter++;
413 }
414}
415
416struct notifier_block jump_label_module_nb = {
417 .notifier_call = jump_label_module_notify,
418 .priority = 0,
419};
420
421static __init int init_jump_label_module(void)
422{
423 return register_module_notifier(&jump_label_module_nb);
424}
425early_initcall(init_jump_label_module);
426
427#endif /* CONFIG_MODULES */
428
429#endif
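
[Editor's sketch, not part of the patch: the jump_label_text_reserved() kerneldoc above says code that modifies kernel text should first verify it does not overlap a jump label site, which is exactly what the register_kprobe() hunk below starts doing. A hypothetical text-patching client would be expected to check it like this; my_patch_text() is an illustrative name, not a kernel function.]

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/jump_label.h>

/* Refuse to rewrite bytes that belong to a jump label patch site. */
static int my_patch_text(void *addr, size_t len)
{
	if (jump_label_text_reserved(addr, addr + len))
		return -EBUSY;

	/* ... otherwise safe to hand off to the arch text-poking primitive ... */
	return 0;
}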
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..ec4210c6501e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p)
399 * Return an optimized kprobe whose optimizing code replaces 400 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 401 * instructions including addr (exclude breakpoint).
401 */ 402 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 403static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 404{
404 int i; 405 int i;
405 struct kprobe *p = NULL; 406 struct kprobe *p = NULL;
@@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 832
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 833void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 834 struct hlist_head **head, unsigned long *flags)
835__acquires(hlist_lock)
834{ 836{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 837 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 838 spinlock_t *hlist_lock;
@@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 844
843static void __kprobes kretprobe_table_lock(unsigned long hash, 845static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 846 unsigned long *flags)
847__acquires(hlist_lock)
845{ 848{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 849 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 850 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 852
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 853void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 854 unsigned long *flags)
855__releases(hlist_lock)
852{ 856{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 857 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 858 spinlock_t *hlist_lock;
@@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 861 spin_unlock_irqrestore(hlist_lock, *flags);
858} 862}
859 863
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 864static void __kprobes kretprobe_table_unlock(unsigned long hash,
865 unsigned long *flags)
866__releases(hlist_lock)
861{ 867{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 868 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 869 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1141 preempt_disable(); 1147 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1148 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1149 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1150 ftrace_text_reserved(p->addr, p->addr) ||
1151 jump_label_text_reserved(p->addr, p->addr)) {
1145 preempt_enable(); 1152 preempt_enable();
1146 return -EINVAL; 1153 return -EINVAL;
1147 } 1154 }
@@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1346 if (num <= 0)
1340 return -EINVAL; 1347 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1348 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1349 unsigned long addr, offset;
1343 jp = jps[i]; 1350 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1351 addr = arch_deref_entry_point(jp->entry);
1345 1352
1346 if (!kernel_text_address(addr)) 1353 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1354 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1355 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1356 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1357 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1358 ret = register_kprobe(&jp->kp);
1353 } 1359 } else
1360 ret = -EINVAL;
1361
1354 if (ret < 0) { 1362 if (ret < 0) {
1355 if (i > 0) 1363 if (i > 0)
1356 unregister_jprobes(jps, i); 1364 unregister_jprobes(jps, i);
diff --git a/kernel/module.c b/kernel/module.c
index ccd641991842..2df46301a7a4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 61#include <trace/events/module.h>
@@ -2309,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2309 sizeof(*mod->tracepoints), 2310 sizeof(*mod->tracepoints),
2310 &mod->num_tracepoints); 2311 &mod->num_tracepoints);
2311#endif 2312#endif
2313#ifdef HAVE_JUMP_LABEL
2314 mod->jump_entries = section_objs(info, "__jump_table",
2315 sizeof(*mod->jump_entries),
2316 &mod->num_jump_entries);
2317#endif
2312#ifdef CONFIG_EVENT_TRACING 2318#ifdef CONFIG_EVENT_TRACING
2313 mod->trace_events = section_objs(info, "_ftrace_events", 2319 mod->trace_events = section_objs(info, "_ftrace_events",
2314 sizeof(*mod->trace_events), 2320 sizeof(*mod->trace_events),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b98bed3d8182..f309e8014c78 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
35 34
36#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
37 36
38/* 37atomic_t perf_task_events __read_mostly;
39 * Each CPU has a list of per CPU events:
40 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
42
43int perf_max_events __read_mostly = 1;
44static int perf_reserved_percpu __read_mostly;
45static int perf_overcommit __read_mostly = 1;
46
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 38static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 39static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 40static atomic_t nr_task_events __read_mostly;
51 41
42static LIST_HEAD(pmus);
43static DEFINE_MUTEX(pmus_lock);
44static struct srcu_struct pmus_srcu;
45
52/* 46/*
53 * perf event paranoia level: 47 * perf event paranoia level:
54 * -1 - not paranoid at all 48 * -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
67 61
68static atomic64_t perf_event_id; 62static atomic64_t perf_event_id;
69 63
70/* 64void __weak perf_event_print_debug(void) { }
71 * Lock for (sysadmin-configurable) event reservations:
72 */
73static DEFINE_SPINLOCK(perf_resource_lock);
74 65
75/* 66extern __weak const char *perf_pmu_name(void)
76 * Architecture provided APIs - weak aliases:
77 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
79{ 67{
80 return NULL; 68 return "pmu";
81} 69}
82 70
83void __weak hw_perf_disable(void) { barrier(); } 71void perf_pmu_disable(struct pmu *pmu)
84void __weak hw_perf_enable(void) { barrier(); } 72{
85 73 int *count = this_cpu_ptr(pmu->pmu_disable_count);
86void __weak perf_event_print_debug(void) { } 74 if (!(*count)++)
87 75 pmu->pmu_disable(pmu);
88static DEFINE_PER_CPU(int, perf_disable_count); 76}
89 77
90void perf_disable(void) 78void perf_pmu_enable(struct pmu *pmu)
91{ 79{
92 if (!__get_cpu_var(perf_disable_count)++) 80 int *count = this_cpu_ptr(pmu->pmu_disable_count);
93 hw_perf_disable(); 81 if (!--(*count))
82 pmu->pmu_enable(pmu);
94} 83}
95 84
96void perf_enable(void) 85static DEFINE_PER_CPU(struct list_head, rotation_list);
86
87/*
88 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
89 * because they're strictly cpu affine and rotate_start is called with IRQs
90 * disabled, while rotate_context is called from IRQ context.
91 */
92static void perf_pmu_rotate_start(struct pmu *pmu)
97{ 93{
98 if (!--__get_cpu_var(perf_disable_count)) 94 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99 hw_perf_enable(); 95 struct list_head *head = &__get_cpu_var(rotation_list);
96
97 WARN_ON(!irqs_disabled());
98
99 if (list_empty(&cpuctx->rotation_list))
100 list_add(&cpuctx->rotation_list, head);
100} 101}
101 102
102static void get_ctx(struct perf_event_context *ctx) 103static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 152 * the context could get moved to another task.
152 */ 153 */
153static struct perf_event_context * 154static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 155perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 156{
156 struct perf_event_context *ctx; 157 struct perf_event_context *ctx;
157 158
158 rcu_read_lock(); 159 rcu_read_lock();
159 retry: 160retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 161 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 162 if (ctx) {
162 /* 163 /*
163 * If this context is a clone of another, it might 164 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 171 * can't get swapped on us any more.
171 */ 172 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 173 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 174 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 175 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 176 goto retry;
176 } 177 }
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 190 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 191 * reference count so that the context can't get freed.
191 */ 192 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 193static struct perf_event_context *
194perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 195{
194 struct perf_event_context *ctx; 196 struct perf_event_context *ctx;
195 unsigned long flags; 197 unsigned long flags;
196 198
197 ctx = perf_lock_task_context(task, &flags); 199 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 200 if (ctx) {
199 ++ctx->pin_count; 201 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 202 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
302 } 304 }
303 305
304 list_add_rcu(&event->event_entry, &ctx->event_list); 306 list_add_rcu(&event->event_entry, &ctx->event_list);
307 if (!ctx->nr_events)
308 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 309 ctx->nr_events++;
306 if (event->attr.inherit_stat) 310 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 311 ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
311{ 315{
312 struct perf_event *group_leader = event->group_leader; 316 struct perf_event *group_leader = event->group_leader;
313 317
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); 318 /*
319 * We can have double attach due to group movement in perf_event_open.
320 */
321 if (event->attach_state & PERF_ATTACH_GROUP)
322 return;
323
315 event->attach_state |= PERF_ATTACH_GROUP; 324 event->attach_state |= PERF_ATTACH_GROUP;
316 325
317 if (group_leader == event) 326 if (group_leader == event)
@@ -408,8 +417,8 @@ event_filter_match(struct perf_event *event)
408 return event->cpu == -1 || event->cpu == smp_processor_id(); 417 return event->cpu == -1 || event->cpu == smp_processor_id();
409} 418}
410 419
411static void 420static int
412event_sched_out(struct perf_event *event, 421__event_sched_out(struct perf_event *event,
413 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
414 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
415{ 424{
@@ -428,15 +437,14 @@ event_sched_out(struct perf_event *event,
428 } 437 }
429 438
430 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
431 return; 440 return 0;
432 441
433 event->state = PERF_EVENT_STATE_INACTIVE; 442 event->state = PERF_EVENT_STATE_INACTIVE;
434 if (event->pending_disable) { 443 if (event->pending_disable) {
435 event->pending_disable = 0; 444 event->pending_disable = 0;
436 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
437 } 446 }
438 event->tstamp_stopped = ctx->time; 447 event->pmu->del(event, 0);
439 event->pmu->disable(event);
440 event->oncpu = -1; 448 event->oncpu = -1;
441 449
442 if (!is_software_event(event)) 450 if (!is_software_event(event))
@@ -444,6 +452,19 @@ event_sched_out(struct perf_event *event,
444 ctx->nr_active--; 452 ctx->nr_active--;
445 if (event->attr.exclusive || !cpuctx->active_oncpu) 453 if (event->attr.exclusive || !cpuctx->active_oncpu)
446 cpuctx->exclusive = 0; 454 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
447} 468}
448 469
449static void 470static void
@@ -466,6 +487,12 @@ group_sched_out(struct perf_event *group_event,
466 cpuctx->exclusive = 0; 487 cpuctx->exclusive = 0;
467} 488}
468 489
490static inline struct perf_cpu_context *
491__get_cpu_context(struct perf_event_context *ctx)
492{
493 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
494}
495
469/* 496/*
470 * Cross CPU call to remove a performance event 497 * Cross CPU call to remove a performance event
471 * 498 *
@@ -474,9 +501,9 @@ group_sched_out(struct perf_event *group_event,
474 */ 501 */
475static void __perf_event_remove_from_context(void *info) 502static void __perf_event_remove_from_context(void *info)
476{ 503{
477 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
478 struct perf_event *event = info; 504 struct perf_event *event = info;
479 struct perf_event_context *ctx = event->ctx; 505 struct perf_event_context *ctx = event->ctx;
506 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
480 507
481 /* 508 /*
482 * If this is a task context, we need to check whether it is 509 * If this is a task context, we need to check whether it is
@@ -487,27 +514,11 @@ static void __perf_event_remove_from_context(void *info)
487 return; 514 return;
488 515
489 raw_spin_lock(&ctx->lock); 516 raw_spin_lock(&ctx->lock);
490 /*
491 * Protect the list operation against NMI by disabling the
492 * events on a global level.
493 */
494 perf_disable();
495 517
496 event_sched_out(event, cpuctx, ctx); 518 event_sched_out(event, cpuctx, ctx);
497 519
498 list_del_event(event, ctx); 520 list_del_event(event, ctx);
499 521
500 if (!ctx->task) {
501 /*
502 * Allow more per task events with respect to the
503 * reservation:
504 */
505 cpuctx->max_pertask =
506 min(perf_max_events - ctx->nr_events,
507 perf_max_events - perf_reserved_percpu);
508 }
509
510 perf_enable();
511 raw_spin_unlock(&ctx->lock); 522 raw_spin_unlock(&ctx->lock);
512} 523}
513 524
@@ -572,8 +583,8 @@ retry:
572static void __perf_event_disable(void *info) 583static void __perf_event_disable(void *info)
573{ 584{
574 struct perf_event *event = info; 585 struct perf_event *event = info;
575 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
576 struct perf_event_context *ctx = event->ctx; 586 struct perf_event_context *ctx = event->ctx;
587 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
577 588
578 /* 589 /*
579 * If this is a per-task event, need to check whether this 590 * If this is a per-task event, need to check whether this
@@ -628,7 +639,7 @@ void perf_event_disable(struct perf_event *event)
628 return; 639 return;
629 } 640 }
630 641
631 retry: 642retry:
632 task_oncpu_function_call(task, __perf_event_disable, event); 643 task_oncpu_function_call(task, __perf_event_disable, event);
633 644
634 raw_spin_lock_irq(&ctx->lock); 645 raw_spin_lock_irq(&ctx->lock);
@@ -653,7 +664,7 @@ void perf_event_disable(struct perf_event *event)
653} 664}
654 665
655static int 666static int
656event_sched_in(struct perf_event *event, 667__event_sched_in(struct perf_event *event,
657 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
658 struct perf_event_context *ctx) 669 struct perf_event_context *ctx)
659{ 670{
@@ -667,14 +678,12 @@ event_sched_in(struct perf_event *event,
667 */ 678 */
668 smp_wmb(); 679 smp_wmb();
669 680
670 if (event->pmu->enable(event)) { 681 if (event->pmu->add(event, PERF_EF_START)) {
671 event->state = PERF_EVENT_STATE_INACTIVE; 682 event->state = PERF_EVENT_STATE_INACTIVE;
672 event->oncpu = -1; 683 event->oncpu = -1;
673 return -EAGAIN; 684 return -EAGAIN;
674 } 685 }
675 686
676 event->tstamp_running += ctx->time - event->tstamp_stopped;
677
678 if (!is_software_event(event)) 687 if (!is_software_event(event))
679 cpuctx->active_oncpu++; 688 cpuctx->active_oncpu++;
680 ctx->nr_active++; 689 ctx->nr_active++;
@@ -685,28 +694,56 @@ event_sched_in(struct perf_event *event,
685 return 0; 694 return 0;
686} 695}
687 696
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
688static int 726static int
689group_sched_in(struct perf_event *group_event, 727group_sched_in(struct perf_event *group_event,
690 struct perf_cpu_context *cpuctx, 728 struct perf_cpu_context *cpuctx,
691 struct perf_event_context *ctx) 729 struct perf_event_context *ctx)
692{ 730{
693 struct perf_event *event, *partial_group = NULL; 731 struct perf_event *event, *partial_group = NULL;
694 const struct pmu *pmu = group_event->pmu; 732 struct pmu *pmu = group_event->pmu;
695 bool txn = false;
696 733
697 if (group_event->state == PERF_EVENT_STATE_OFF) 734 if (group_event->state == PERF_EVENT_STATE_OFF)
698 return 0; 735 return 0;
699 736
700 /* Check if group transaction availabe */ 737 pmu->start_txn(pmu);
701 if (pmu->start_txn)
702 txn = true;
703 738
704 if (txn) 739 /*
705 pmu->start_txn(pmu); 740 * use __event_sched_in() to delay updating tstamp_running
706 741 * until the transaction is committed. In case of failure
707 if (event_sched_in(group_event, cpuctx, ctx)) { 742 * we will keep an unmodified tstamp_running which is a
708 if (txn) 743 * requirement to get correct timing information
709 pmu->cancel_txn(pmu); 744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu);
710 return -EAGAIN; 747 return -EAGAIN;
711 } 748 }
712 749
@@ -714,29 +751,33 @@ group_sched_in(struct perf_event *group_event,
714 * Schedule in siblings as one group (if any): 751 * Schedule in siblings as one group (if any):
715 */ 752 */
716 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 753 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
717 if (event_sched_in(event, cpuctx, ctx)) { 754 if (__event_sched_in(event, cpuctx, ctx)) {
718 partial_group = event; 755 partial_group = event;
719 goto group_error; 756 goto group_error;
720 } 757 }
721 } 758 }
722 759
723 if (!txn || !pmu->commit_txn(pmu)) 760 if (!pmu->commit_txn(pmu)) {
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
724 return 0; 763 return 0;
725 764 }
726group_error: 765group_error:
727 /* 766 /*
728 * Groups can be scheduled in as one unit only, so undo any 767 * Groups can be scheduled in as one unit only, so undo any
729 * partial group before returning: 768 * partial group before returning:
769 *
770 * use __event_sched_out() to avoid updating tstamp_stopped
771 * because the event never actually ran
730 */ 772 */
731 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 773 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
732 if (event == partial_group) 774 if (event == partial_group)
733 break; 775 break;
734 event_sched_out(event, cpuctx, ctx); 776 __event_sched_out(event, cpuctx, ctx);
735 } 777 }
736 event_sched_out(group_event, cpuctx, ctx); 778 __event_sched_out(group_event, cpuctx, ctx);
737 779
738 if (txn) 780 pmu->cancel_txn(pmu);
739 pmu->cancel_txn(pmu);
740 781
741 return -EAGAIN; 782 return -EAGAIN;
742} 783}
@@ -789,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event,
789 */ 830 */
790static void __perf_install_in_context(void *info) 831static void __perf_install_in_context(void *info)
791{ 832{
792 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
793 struct perf_event *event = info; 833 struct perf_event *event = info;
794 struct perf_event_context *ctx = event->ctx; 834 struct perf_event_context *ctx = event->ctx;
795 struct perf_event *leader = event->group_leader; 835 struct perf_event *leader = event->group_leader;
836 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
796 int err; 837 int err;
797 838
798 /* 839 /*
@@ -812,12 +853,6 @@ static void __perf_install_in_context(void *info)
812 ctx->is_active = 1; 853 ctx->is_active = 1;
813 update_context_time(ctx); 854 update_context_time(ctx);
814 855
815 /*
816 * Protect the list operation against NMI by disabling the
817 * events on a global level. NOP for non NMI based events.
818 */
819 perf_disable();
820
821 add_event_to_ctx(event, ctx); 856 add_event_to_ctx(event, ctx);
822 857
823 if (event->cpu != -1 && event->cpu != smp_processor_id()) 858 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -855,12 +890,7 @@ static void __perf_install_in_context(void *info)
855 } 890 }
856 } 891 }
857 892
858 if (!err && !ctx->task && cpuctx->max_pertask) 893unlock:
859 cpuctx->max_pertask--;
860
861 unlock:
862 perf_enable();
863
864 raw_spin_unlock(&ctx->lock); 894 raw_spin_unlock(&ctx->lock);
865} 895}
866 896
@@ -883,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx,
883{ 913{
884 struct task_struct *task = ctx->task; 914 struct task_struct *task = ctx->task;
885 915
916 event->ctx = ctx;
917
886 if (!task) { 918 if (!task) {
887 /* 919 /*
888 * Per cpu events are installed via an smp call and 920 * Per cpu events are installed via an smp call and
@@ -931,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
931 963
932 event->state = PERF_EVENT_STATE_INACTIVE; 964 event->state = PERF_EVENT_STATE_INACTIVE;
933 event->tstamp_enabled = ctx->time - event->total_time_enabled; 965 event->tstamp_enabled = ctx->time - event->total_time_enabled;
934 list_for_each_entry(sub, &event->sibling_list, group_entry) 966 list_for_each_entry(sub, &event->sibling_list, group_entry) {
935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 967 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
936 sub->tstamp_enabled = 968 sub->tstamp_enabled =
937 ctx->time - sub->total_time_enabled; 969 ctx->time - sub->total_time_enabled;
970 }
971 }
938} 972}
939 973
940/* 974/*
@@ -943,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
943static void __perf_event_enable(void *info) 977static void __perf_event_enable(void *info)
944{ 978{
945 struct perf_event *event = info; 979 struct perf_event *event = info;
946 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
947 struct perf_event_context *ctx = event->ctx; 980 struct perf_event_context *ctx = event->ctx;
948 struct perf_event *leader = event->group_leader; 981 struct perf_event *leader = event->group_leader;
982 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
949 int err; 983 int err;
950 984
951 /* 985 /*
@@ -979,12 +1013,10 @@ static void __perf_event_enable(void *info)
979 if (!group_can_go_on(event, cpuctx, 1)) { 1013 if (!group_can_go_on(event, cpuctx, 1)) {
980 err = -EEXIST; 1014 err = -EEXIST;
981 } else { 1015 } else {
982 perf_disable();
983 if (event == leader) 1016 if (event == leader)
984 err = group_sched_in(event, cpuctx, ctx); 1017 err = group_sched_in(event, cpuctx, ctx);
985 else 1018 else
986 err = event_sched_in(event, cpuctx, ctx); 1019 err = event_sched_in(event, cpuctx, ctx);
987 perf_enable();
988 } 1020 }
989 1021
990 if (err) { 1022 if (err) {
@@ -1000,7 +1032,7 @@ static void __perf_event_enable(void *info)
1000 } 1032 }
1001 } 1033 }
1002 1034
1003 unlock: 1035unlock:
1004 raw_spin_unlock(&ctx->lock); 1036 raw_spin_unlock(&ctx->lock);
1005} 1037}
1006 1038
@@ -1041,7 +1073,7 @@ void perf_event_enable(struct perf_event *event)
1041 if (event->state == PERF_EVENT_STATE_ERROR) 1073 if (event->state == PERF_EVENT_STATE_ERROR)
1042 event->state = PERF_EVENT_STATE_OFF; 1074 event->state = PERF_EVENT_STATE_OFF;
1043 1075
1044 retry: 1076retry:
1045 raw_spin_unlock_irq(&ctx->lock); 1077 raw_spin_unlock_irq(&ctx->lock);
1046 task_oncpu_function_call(task, __perf_event_enable, event); 1078 task_oncpu_function_call(task, __perf_event_enable, event);
1047 1079
@@ -1061,7 +1093,7 @@ void perf_event_enable(struct perf_event *event)
1061 if (event->state == PERF_EVENT_STATE_OFF) 1093 if (event->state == PERF_EVENT_STATE_OFF)
1062 __perf_event_mark_enabled(event, ctx); 1094 __perf_event_mark_enabled(event, ctx);
1063 1095
1064 out: 1096out:
1065 raw_spin_unlock_irq(&ctx->lock); 1097 raw_spin_unlock_irq(&ctx->lock);
1066} 1098}
1067 1099
@@ -1092,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1092 struct perf_event *event; 1124 struct perf_event *event;
1093 1125
1094 raw_spin_lock(&ctx->lock); 1126 raw_spin_lock(&ctx->lock);
1127 perf_pmu_disable(ctx->pmu);
1095 ctx->is_active = 0; 1128 ctx->is_active = 0;
1096 if (likely(!ctx->nr_events)) 1129 if (likely(!ctx->nr_events))
1097 goto out; 1130 goto out;
1098 update_context_time(ctx); 1131 update_context_time(ctx);
1099 1132
1100 perf_disable();
1101 if (!ctx->nr_active) 1133 if (!ctx->nr_active)
1102 goto out_enable; 1134 goto out;
1103 1135
1104 if (event_type & EVENT_PINNED) 1136 if (event_type & EVENT_PINNED) {
1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1137 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1106 group_sched_out(event, cpuctx, ctx); 1138 group_sched_out(event, cpuctx, ctx);
1139 }
1107 1140
1108 if (event_type & EVENT_FLEXIBLE) 1141 if (event_type & EVENT_FLEXIBLE) {
1109 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1142 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1110 group_sched_out(event, cpuctx, ctx); 1143 group_sched_out(event, cpuctx, ctx);
1111 1144 }
1112 out_enable: 1145out:
1113 perf_enable(); 1146 perf_pmu_enable(ctx->pmu);
1114 out:
1115 raw_spin_unlock(&ctx->lock); 1147 raw_spin_unlock(&ctx->lock);
1116} 1148}
1117 1149
@@ -1209,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1209 } 1241 }
1210} 1242}
1211 1243
1212/* 1244void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1213 * Called from scheduler to remove the events of the current task, 1245 struct task_struct *next)
1214 * with interrupts disabled.
1215 *
1216 * We stop each event and update the event value in event->count.
1217 *
1218 * This does not protect us against NMI, but disable()
1219 * sets the disabled bit in the control field of event _before_
1220 * accessing the event control register. If a NMI hits, then it will
1221 * not restart the event.
1222 */
1223void perf_event_task_sched_out(struct task_struct *task,
1224 struct task_struct *next)
1225{ 1246{
1226 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1247 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1227 struct perf_event_context *ctx = task->perf_event_ctxp;
1228 struct perf_event_context *next_ctx; 1248 struct perf_event_context *next_ctx;
1229 struct perf_event_context *parent; 1249 struct perf_event_context *parent;
1250 struct perf_cpu_context *cpuctx;
1230 int do_switch = 1; 1251 int do_switch = 1;
1231 1252
1232 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1253 if (likely(!ctx))
1254 return;
1233 1255
1234 if (likely(!ctx || !cpuctx->task_ctx)) 1256 cpuctx = __get_cpu_context(ctx);
1257 if (!cpuctx->task_ctx)
1235 return; 1258 return;
1236 1259
1237 rcu_read_lock(); 1260 rcu_read_lock();
1238 parent = rcu_dereference(ctx->parent_ctx); 1261 parent = rcu_dereference(ctx->parent_ctx);
1239 next_ctx = next->perf_event_ctxp; 1262 next_ctx = next->perf_event_ctxp[ctxn];
1240 if (parent && next_ctx && 1263 if (parent && next_ctx &&
1241 rcu_dereference(next_ctx->parent_ctx) == parent) { 1264 rcu_dereference(next_ctx->parent_ctx) == parent) {
1242 /* 1265 /*
@@ -1255,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1255 * XXX do we need a memory barrier of sorts 1278 * XXX do we need a memory barrier of sorts
1256 * wrt to rcu_dereference() of perf_event_ctxp 1279 * wrt to rcu_dereference() of perf_event_ctxp
1257 */ 1280 */
1258 task->perf_event_ctxp = next_ctx; 1281 task->perf_event_ctxp[ctxn] = next_ctx;
1259 next->perf_event_ctxp = ctx; 1282 next->perf_event_ctxp[ctxn] = ctx;
1260 ctx->task = next; 1283 ctx->task = next;
1261 next_ctx->task = task; 1284 next_ctx->task = task;
1262 do_switch = 0; 1285 do_switch = 0;
@@ -1274,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task,
1274 } 1297 }
1275} 1298}
1276 1299
1300#define for_each_task_context_nr(ctxn) \
1301 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1302
1303/*
1304 * Called from scheduler to remove the events of the current task,
1305 * with interrupts disabled.
1306 *
1307 * We stop each event and update the event value in event->count.
1308 *
1309 * This does not protect us against NMI, but disable()
1310 * sets the disabled bit in the control field of event _before_
1311 * accessing the event control register. If a NMI hits, then it will
1312 * not restart the event.
1313 */
1314void __perf_event_task_sched_out(struct task_struct *task,
1315 struct task_struct *next)
1316{
1317 int ctxn;
1318
1319 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1320
1321 for_each_task_context_nr(ctxn)
1322 perf_event_context_sched_out(task, ctxn, next);
1323}
1324
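Editor's note: the hunk above replaces the task's single perf_event_ctxp pointer with a small array indexed by a context number (one slot per class of PMU, e.g. the perf_sw_context used by the software PMUs further down), and the scheduler hook simply walks every slot. A minimal userspace model of that shape; the struct and function names here are invented, only the indexing idea is taken from the patch.

    /* Sketch only: models the per-task context array, not the real kernel API. */
    #include <stdio.h>

    enum { CTX_HW, CTX_SW, NR_TASK_CONTEXTS };   /* stand-ins for the kernel's context numbers */

    struct ctx  { const char *name; };
    struct task { struct ctx *ctxp[NR_TASK_CONTEXTS]; };

    static void ctx_sched_out(struct ctx *c) { printf("sched out %s context\n", c->name); }

    static void task_sched_out(struct task *t)
    {
        /* the hook no longer cares which PMU owns a context; it loops over all slots */
        for (int n = 0; n < NR_TASK_CONTEXTS; n++) {
            if (t->ctxp[n])
                ctx_sched_out(t->ctxp[n]);
        }
    }

    int main(void)
    {
        struct ctx hw = { "hw" }, sw = { "sw" };
        struct task t = { .ctxp = { &hw, &sw } };
        task_sched_out(&t);
        return 0;
    }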
1277static void task_ctx_sched_out(struct perf_event_context *ctx, 1325static void task_ctx_sched_out(struct perf_event_context *ctx,
1278 enum event_type_t event_type) 1326 enum event_type_t event_type)
1279{ 1327{
1280 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1328 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1281 1329
1282 if (!cpuctx->task_ctx) 1330 if (!cpuctx->task_ctx)
1283 return; 1331 return;
@@ -1292,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1292/* 1340/*
1293 * Called with IRQs disabled 1341 * Called with IRQs disabled
1294 */ 1342 */
1295static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1296{
1297 task_ctx_sched_out(ctx, EVENT_ALL);
1298}
1299
1300/*
1301 * Called with IRQs disabled
1302 */
1303static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1343static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1304 enum event_type_t event_type) 1344 enum event_type_t event_type)
1305{ 1345{
@@ -1350,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1350 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1390 if (event->cpu != -1 && event->cpu != smp_processor_id())
1351 continue; 1391 continue;
1352 1392
1353 if (group_can_go_on(event, cpuctx, can_add_hw)) 1393 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1354 if (group_sched_in(event, cpuctx, ctx)) 1394 if (group_sched_in(event, cpuctx, ctx))
1355 can_add_hw = 0; 1395 can_add_hw = 0;
1396 }
1356 } 1397 }
1357} 1398}
1358 1399
@@ -1368,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1368 1409
1369 ctx->timestamp = perf_clock(); 1410 ctx->timestamp = perf_clock();
1370 1411
1371 perf_disable();
1372
1373 /* 1412 /*
1374 * First go through the list and put on any pinned groups 1413 * First go through the list and put on any pinned groups
1375 * in order to give them the best chance of going on. 1414 * in order to give them the best chance of going on.
@@ -1381,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx,
1381 if (event_type & EVENT_FLEXIBLE) 1420 if (event_type & EVENT_FLEXIBLE)
1382 ctx_flexible_sched_in(ctx, cpuctx); 1421 ctx_flexible_sched_in(ctx, cpuctx);
1383 1422
1384 perf_enable(); 1423out:
1385 out:
1386 raw_spin_unlock(&ctx->lock); 1424 raw_spin_unlock(&ctx->lock);
1387} 1425}
1388 1426
@@ -1394,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1394 ctx_sched_in(ctx, cpuctx, event_type); 1432 ctx_sched_in(ctx, cpuctx, event_type);
1395} 1433}
1396 1434
1397static void task_ctx_sched_in(struct task_struct *task, 1435static void task_ctx_sched_in(struct perf_event_context *ctx,
1398 enum event_type_t event_type) 1436 enum event_type_t event_type)
1399{ 1437{
1400 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1438 struct perf_cpu_context *cpuctx;
1401 struct perf_event_context *ctx = task->perf_event_ctxp;
1402 1439
1403 if (likely(!ctx)) 1440 cpuctx = __get_cpu_context(ctx);
1404 return;
1405 if (cpuctx->task_ctx == ctx) 1441 if (cpuctx->task_ctx == ctx)
1406 return; 1442 return;
1443
1407 ctx_sched_in(ctx, cpuctx, event_type); 1444 ctx_sched_in(ctx, cpuctx, event_type);
1408 cpuctx->task_ctx = ctx; 1445 cpuctx->task_ctx = ctx;
1409} 1446}
1410/*
1411 * Called from scheduler to add the events of the current task
1412 * with interrupts disabled.
1413 *
1414 * We restore the event value and then enable it.
1415 *
1416 * This does not protect us against NMI, but enable()
1417 * sets the enabled bit in the control field of event _before_
1418 * accessing the event control register. If a NMI hits, then it will
1419 * keep the event running.
1420 */
1421void perf_event_task_sched_in(struct task_struct *task)
1422{
1423 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1424 struct perf_event_context *ctx = task->perf_event_ctxp;
1425 1447
1426 if (likely(!ctx)) 1448void perf_event_context_sched_in(struct perf_event_context *ctx)
1427 return; 1449{
1450 struct perf_cpu_context *cpuctx;
1428 1451
1452 cpuctx = __get_cpu_context(ctx);
1429 if (cpuctx->task_ctx == ctx) 1453 if (cpuctx->task_ctx == ctx)
1430 return; 1454 return;
1431 1455
1432 perf_disable(); 1456 perf_pmu_disable(ctx->pmu);
1433
1434 /* 1457 /*
1435 * We want to keep the following priority order: 1458 * We want to keep the following priority order:
1436 * cpu pinned (that don't need to move), task pinned, 1459 * cpu pinned (that don't need to move), task pinned,
@@ -1444,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task)
1444 1467
1445 cpuctx->task_ctx = ctx; 1468 cpuctx->task_ctx = ctx;
1446 1469
1447 perf_enable(); 1470 /*
1471 * Since these rotations are per-cpu, we need to ensure the
1472 * cpu-context we got scheduled on is actually rotating.
1473 */
1474 perf_pmu_rotate_start(ctx->pmu);
1475 perf_pmu_enable(ctx->pmu);
1476}
1477
1478/*
1479 * Called from scheduler to add the events of the current task
1480 * with interrupts disabled.
1481 *
1482 * We restore the event value and then enable it.
1483 *
1484 * This does not protect us against NMI, but enable()
1485 * sets the enabled bit in the control field of event _before_
1486 * accessing the event control register. If a NMI hits, then it will
1487 * keep the event running.
1488 */
1489void __perf_event_task_sched_in(struct task_struct *task)
1490{
1491 struct perf_event_context *ctx;
1492 int ctxn;
1493
1494 for_each_task_context_nr(ctxn) {
1495 ctx = task->perf_event_ctxp[ctxn];
1496 if (likely(!ctx))
1497 continue;
1498
1499 perf_event_context_sched_in(ctx);
1500 }
1448} 1501}
1449 1502
1450#define MAX_INTERRUPTS (~0ULL) 1503#define MAX_INTERRUPTS (~0ULL)
@@ -1524,22 +1577,6 @@ do { \
1524 return div64_u64(dividend, divisor); 1577 return div64_u64(dividend, divisor);
1525} 1578}
1526 1579
1527static void perf_event_stop(struct perf_event *event)
1528{
1529 if (!event->pmu->stop)
1530 return event->pmu->disable(event);
1531
1532 return event->pmu->stop(event);
1533}
1534
1535static int perf_event_start(struct perf_event *event)
1536{
1537 if (!event->pmu->start)
1538 return event->pmu->enable(event);
1539
1540 return event->pmu->start(event);
1541}
1542
1543static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1580static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1544{ 1581{
1545 struct hw_perf_event *hwc = &event->hw; 1582 struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1559 hwc->sample_period = sample_period; 1596 hwc->sample_period = sample_period;
1560 1597
1561 if (local64_read(&hwc->period_left) > 8*sample_period) { 1598 if (local64_read(&hwc->period_left) > 8*sample_period) {
1562 perf_disable(); 1599 event->pmu->stop(event, PERF_EF_UPDATE);
1563 perf_event_stop(event);
1564 local64_set(&hwc->period_left, 0); 1600 local64_set(&hwc->period_left, 0);
1565 perf_event_start(event); 1601 event->pmu->start(event, PERF_EF_RELOAD);
1566 perf_enable();
1567 } 1602 }
1568} 1603}
1569 1604
1570static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1605static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1571{ 1606{
1572 struct perf_event *event; 1607 struct perf_event *event;
1573 struct hw_perf_event *hwc; 1608 struct hw_perf_event *hwc;
@@ -1592,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1592 */ 1627 */
1593 if (interrupts == MAX_INTERRUPTS) { 1628 if (interrupts == MAX_INTERRUPTS) {
1594 perf_log_throttle(event, 1); 1629 perf_log_throttle(event, 1);
1595 perf_disable(); 1630 event->pmu->start(event, 0);
1596 event->pmu->unthrottle(event);
1597 perf_enable();
1598 } 1631 }
1599 1632
1600 if (!event->attr.freq || !event->attr.sample_freq) 1633 if (!event->attr.freq || !event->attr.sample_freq)
1601 continue; 1634 continue;
1602 1635
1603 perf_disable();
1604 event->pmu->read(event); 1636 event->pmu->read(event);
1605 now = local64_read(&event->count); 1637 now = local64_read(&event->count);
1606 delta = now - hwc->freq_count_stamp; 1638 delta = now - hwc->freq_count_stamp;
1607 hwc->freq_count_stamp = now; 1639 hwc->freq_count_stamp = now;
1608 1640
1609 if (delta > 0) 1641 if (delta > 0)
1610 perf_adjust_period(event, TICK_NSEC, delta); 1642 perf_adjust_period(event, period, delta);
1611 perf_enable();
1612 } 1643 }
1613 raw_spin_unlock(&ctx->lock); 1644 raw_spin_unlock(&ctx->lock);
1614} 1645}
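Editor's note: perf_ctx_adjust_freq() now receives the real rotation interval instead of assuming TICK_NSEC, and perf_adjust_period() turns the observed event count into a new sample_period. Ignoring rounding and the kernel's overflow-avoidance shifting, the calculation is essentially period = count * NSEC_PER_SEC / (nsec * sample_freq); if that recollection of perf_calculate_period() is right, a rough standalone sketch:

    /* Rough model of the period recalculation; the kernel's version guards
     * against 64-bit overflow, this sketch does not. */
    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t sample_freq)
    {
        /* observed rate: count events over nsec nanoseconds;
         * pick a period so the counter overflows ~sample_freq times per second */
        return (count * NSEC_PER_SEC) / (nsec * sample_freq);
    }

    int main(void)
    {
        /* 2,000,000 events in 10 ms at a requested 1000 samples/sec
         * gives a sample_period of 200,000 events */
        printf("%llu\n", (unsigned long long)calc_period(2000000, 10000000, 1000));
        return 0;
    }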
@@ -1626,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
1626 raw_spin_unlock(&ctx->lock); 1657 raw_spin_unlock(&ctx->lock);
1627} 1658}
1628 1659
1629void perf_event_task_tick(struct task_struct *curr) 1660/*
1661 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1662 * because they're strictly cpu affine and rotate_start is called with IRQs
1663 * disabled, while rotate_context is called from IRQ context.
1664 */
1665static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1630{ 1666{
1631 struct perf_cpu_context *cpuctx; 1667 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1632 struct perf_event_context *ctx; 1668 struct perf_event_context *ctx = NULL;
1633 int rotate = 0; 1669 int rotate = 0, remove = 1;
1634
1635 if (!atomic_read(&nr_events))
1636 return;
1637 1670
1638 cpuctx = &__get_cpu_var(perf_cpu_context); 1671 if (cpuctx->ctx.nr_events) {
1639 if (cpuctx->ctx.nr_events && 1672 remove = 0;
1640 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 1673 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1641 rotate = 1; 1674 rotate = 1;
1675 }
1642 1676
1643 ctx = curr->perf_event_ctxp; 1677 ctx = cpuctx->task_ctx;
1644 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1678 if (ctx && ctx->nr_events) {
1645 rotate = 1; 1679 remove = 0;
1680 if (ctx->nr_events != ctx->nr_active)
1681 rotate = 1;
1682 }
1646 1683
1647 perf_ctx_adjust_freq(&cpuctx->ctx); 1684 perf_pmu_disable(cpuctx->ctx.pmu);
1685 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1648 if (ctx) 1686 if (ctx)
1649 perf_ctx_adjust_freq(ctx); 1687 perf_ctx_adjust_freq(ctx, interval);
1650 1688
1651 if (!rotate) 1689 if (!rotate)
1652 return; 1690 goto done;
1653 1691
1654 perf_disable();
1655 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1692 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1656 if (ctx) 1693 if (ctx)
1657 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1694 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1662,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr)
1662 1699
1663 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1700 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1664 if (ctx) 1701 if (ctx)
1665 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1702 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1666 perf_enable(); 1703
1704done:
1705 if (remove)
1706 list_del_init(&cpuctx->rotation_list);
1707
1708 perf_pmu_enable(cpuctx->ctx.pmu);
1709}
1710
1711void perf_event_task_tick(void)
1712{
1713 struct list_head *head = &__get_cpu_var(rotation_list);
1714 struct perf_cpu_context *cpuctx, *tmp;
1715
1716 WARN_ON(!irqs_disabled());
1717
1718 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1719 if (cpuctx->jiffies_interval == 1 ||
1720 !(jiffies % cpuctx->jiffies_interval))
1721 perf_rotate_context(cpuctx);
1722 }
1667} 1723}
1668 1724
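Editor's note: perf_event_task_tick() above no longer takes a task argument: each CPU keeps a rotation_list of active cpu contexts and rotates one only when its jiffies_interval divides the current tick (interval 1 meaning every tick). A toy reproduction of just that interval check; the context names and intervals below are made up.

    /* Toy model of the per-tick interval check (jiffies_interval in the patch). */
    #include <stdio.h>

    struct cpu_ctx { const char *name; unsigned long interval; };

    static void rotate(struct cpu_ctx *c, unsigned long tick)
    {
        printf("tick %lu: rotate %s\n", tick, c->name);
    }

    int main(void)
    {
        struct cpu_ctx ctxs[] = { { "fast-pmu", 1 }, { "slow-pmu", 4 } };

        for (unsigned long tick = 1; tick <= 8; tick++) {
            for (int i = 0; i < 2; i++) {
                /* same condition as the patch: interval 1, or tick divisible by it */
                if (ctxs[i].interval == 1 || !(tick % ctxs[i].interval))
                    rotate(&ctxs[i], tick);
            }
        }
        return 0;
    }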
1669static int event_enable_on_exec(struct perf_event *event, 1725static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event,
1685 * Enable all of a task's events that have been marked enable-on-exec. 1741 * Enable all of a task's events that have been marked enable-on-exec.
1686 * This expects task == current. 1742 * This expects task == current.
1687 */ 1743 */
1688static void perf_event_enable_on_exec(struct task_struct *task) 1744static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1689{ 1745{
1690 struct perf_event_context *ctx;
1691 struct perf_event *event; 1746 struct perf_event *event;
1692 unsigned long flags; 1747 unsigned long flags;
1693 int enabled = 0; 1748 int enabled = 0;
1694 int ret; 1749 int ret;
1695 1750
1696 local_irq_save(flags); 1751 local_irq_save(flags);
1697 ctx = task->perf_event_ctxp;
1698 if (!ctx || !ctx->nr_events) 1752 if (!ctx || !ctx->nr_events)
1699 goto out; 1753 goto out;
1700 1754
1701 __perf_event_task_sched_out(ctx); 1755 task_ctx_sched_out(ctx, EVENT_ALL);
1702 1756
1703 raw_spin_lock(&ctx->lock); 1757 raw_spin_lock(&ctx->lock);
1704 1758
@@ -1722,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1722 1776
1723 raw_spin_unlock(&ctx->lock); 1777 raw_spin_unlock(&ctx->lock);
1724 1778
1725 perf_event_task_sched_in(task); 1779 perf_event_context_sched_in(ctx);
1726 out: 1780out:
1727 local_irq_restore(flags); 1781 local_irq_restore(flags);
1728} 1782}
1729 1783
@@ -1732,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1732 */ 1786 */
1733static void __perf_event_read(void *info) 1787static void __perf_event_read(void *info)
1734{ 1788{
1735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1736 struct perf_event *event = info; 1789 struct perf_event *event = info;
1737 struct perf_event_context *ctx = event->ctx; 1790 struct perf_event_context *ctx = event->ctx;
1791 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1738 1792
1739 /* 1793 /*
1740 * If this is a task context, we need to check whether it is 1794 * If this is a task context, we need to check whether it is
@@ -1773,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event)
1773 unsigned long flags; 1827 unsigned long flags;
1774 1828
1775 raw_spin_lock_irqsave(&ctx->lock, flags); 1829 raw_spin_lock_irqsave(&ctx->lock, flags);
1776 update_context_time(ctx); 1830 /*
1831 * may read while context is not active
1832 * (e.g., thread is blocked), in that case
1833 * we cannot update context time
1834 */
1835 if (ctx->is_active)
1836 update_context_time(ctx);
1777 update_event_times(event); 1837 update_event_times(event);
1778 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1838 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1779 } 1839 }
@@ -1782,11 +1842,219 @@ static u64 perf_event_read(struct perf_event *event)
1782} 1842}
1783 1843
1784/* 1844/*
1785 * Initialize the perf_event context in a task_struct: 1845 * Callchain support
1786 */ 1846 */
1847
1848struct callchain_cpus_entries {
1849 struct rcu_head rcu_head;
1850 struct perf_callchain_entry *cpu_entries[0];
1851};
1852
1853static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1854static atomic_t nr_callchain_events;
1855static DEFINE_MUTEX(callchain_mutex);
1856struct callchain_cpus_entries *callchain_cpus_entries;
1857
1858
1859__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1860 struct pt_regs *regs)
1861{
1862}
1863
1864__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1865 struct pt_regs *regs)
1866{
1867}
1868
1869static void release_callchain_buffers_rcu(struct rcu_head *head)
1870{
1871 struct callchain_cpus_entries *entries;
1872 int cpu;
1873
1874 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1875
1876 for_each_possible_cpu(cpu)
1877 kfree(entries->cpu_entries[cpu]);
1878
1879 kfree(entries);
1880}
1881
1882static void release_callchain_buffers(void)
1883{
1884 struct callchain_cpus_entries *entries;
1885
1886 entries = callchain_cpus_entries;
1887 rcu_assign_pointer(callchain_cpus_entries, NULL);
1888 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1889}
1890
1891static int alloc_callchain_buffers(void)
1892{
1893 int cpu;
1894 int size;
1895 struct callchain_cpus_entries *entries;
1896
1897 /*
1898 * We can't use the percpu allocation API for data that can be
1899 * accessed from NMI. Use a temporary manual per cpu allocation
1900 * until that gets sorted out.
1901 */
1902 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1903 num_possible_cpus();
1904
1905 entries = kzalloc(size, GFP_KERNEL);
1906 if (!entries)
1907 return -ENOMEM;
1908
1909 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1910
1911 for_each_possible_cpu(cpu) {
1912 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1913 cpu_to_node(cpu));
1914 if (!entries->cpu_entries[cpu])
1915 goto fail;
1916 }
1917
1918 rcu_assign_pointer(callchain_cpus_entries, entries);
1919
1920 return 0;
1921
1922fail:
1923 for_each_possible_cpu(cpu)
1924 kfree(entries->cpu_entries[cpu]);
1925 kfree(entries);
1926
1927 return -ENOMEM;
1928}
1929
1930static int get_callchain_buffers(void)
1931{
1932 int err = 0;
1933 int count;
1934
1935 mutex_lock(&callchain_mutex);
1936
1937 count = atomic_inc_return(&nr_callchain_events);
1938 if (WARN_ON_ONCE(count < 1)) {
1939 err = -EINVAL;
1940 goto exit;
1941 }
1942
1943 if (count > 1) {
1944 /* If the allocation failed, give up */
1945 if (!callchain_cpus_entries)
1946 err = -ENOMEM;
1947 goto exit;
1948 }
1949
1950 err = alloc_callchain_buffers();
1951 if (err)
1952 release_callchain_buffers();
1953exit:
1954 mutex_unlock(&callchain_mutex);
1955
1956 return err;
1957}
1958
1959static void put_callchain_buffers(void)
1960{
1961 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1962 release_callchain_buffers();
1963 mutex_unlock(&callchain_mutex);
1964 }
1965}
1966
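Editor's note: get_callchain_buffers()/put_callchain_buffers() above are a first-user-allocates, last-user-frees refcount: the buffers appear when the event count goes 0 to 1, are torn down through RCU when it returns to 0, and users arriving after a failed allocation get -ENOMEM. The same pattern in miniature, with a pthread mutex standing in for callchain_mutex and the RCU handoff omitted.

    /* Sketch of the shared-buffer refcounting; illustrative only. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int nr_users;
    static void *shared_buffers;

    static int get_buffers(void)
    {
        int err = 0;

        pthread_mutex_lock(&lock);
        if (++nr_users == 1) {
            shared_buffers = malloc(4096);        /* first user allocates */
            if (!shared_buffers)
                err = -1;
        } else if (!shared_buffers) {
            err = -1;                             /* an earlier allocation failed */
        }
        pthread_mutex_unlock(&lock);
        return err;
    }

    static void put_buffers(void)
    {
        pthread_mutex_lock(&lock);
        if (--nr_users == 0) {                    /* last user frees */
            free(shared_buffers);
            shared_buffers = NULL;
        }
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        if (get_buffers() == 0) {
            puts("buffers ready");
            put_buffers();
        }
        return 0;
    }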
1967static int get_recursion_context(int *recursion)
1968{
1969 int rctx;
1970
1971 if (in_nmi())
1972 rctx = 3;
1973 else if (in_irq())
1974 rctx = 2;
1975 else if (in_softirq())
1976 rctx = 1;
1977 else
1978 rctx = 0;
1979
1980 if (recursion[rctx])
1981 return -1;
1982
1983 recursion[rctx]++;
1984 barrier();
1985
1986 return rctx;
1987}
1988
1989static inline void put_recursion_context(int *recursion, int rctx)
1990{
1991 barrier();
1992 recursion[rctx]--;
1993}
1994
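Editor's note: the recursion guard above gives each execution level (task, softirq, hardirq, NMI) its own counter, so a callchain capture that interrupts another capture on the same CPU is refused rather than reusing the same buffer slot. A standalone model of the same idea; the kernel derives the level from in_nmi()/in_irq()/in_softirq(), here it is simply passed in.

    /* Userspace sketch of the 4-level recursion guard; not kernel code. */
    #include <stdio.h>

    enum { CTX_TASK, CTX_SOFTIRQ, CTX_HARDIRQ, CTX_NMI, NR_LEVELS };

    static int recursion[NR_LEVELS];        /* per-CPU in the kernel */

    static int get_recursion_context(int level)
    {
        if (recursion[level])
            return -1;                      /* already capturing at this level */
        recursion[level]++;
        return level;
    }

    static void put_recursion_context(int rctx)
    {
        recursion[rctx]--;
    }

    int main(void)
    {
        int a = get_recursion_context(CTX_TASK);     /* 0: granted                   */
        int b = get_recursion_context(CTX_TASK);     /* -1: refused, same level      */
        int c = get_recursion_context(CTX_HARDIRQ);  /* 2: a different level is fine */

        printf("%d %d %d\n", a, b, c);
        if (c >= 0) put_recursion_context(c);
        if (a >= 0) put_recursion_context(a);
        return 0;
    }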
1995static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1996{
1997 int cpu;
1998 struct callchain_cpus_entries *entries;
1999
2000 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2001 if (*rctx == -1)
2002 return NULL;
2003
2004 entries = rcu_dereference(callchain_cpus_entries);
2005 if (!entries)
2006 return NULL;
2007
2008 cpu = smp_processor_id();
2009
2010 return &entries->cpu_entries[cpu][*rctx];
2011}
2012
1787static void 2013static void
1788__perf_event_init_context(struct perf_event_context *ctx, 2014put_callchain_entry(int rctx)
1789 struct task_struct *task) 2015{
2016 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2017}
2018
2019static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2020{
2021 int rctx;
2022 struct perf_callchain_entry *entry;
2023
2024
2025 entry = get_callchain_entry(&rctx);
2026 if (rctx == -1)
2027 return NULL;
2028
2029 if (!entry)
2030 goto exit_put;
2031
2032 entry->nr = 0;
2033
2034 if (!user_mode(regs)) {
2035 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2036 perf_callchain_kernel(entry, regs);
2037 if (current->mm)
2038 regs = task_pt_regs(current);
2039 else
2040 regs = NULL;
2041 }
2042
2043 if (regs) {
2044 perf_callchain_store(entry, PERF_CONTEXT_USER);
2045 perf_callchain_user(entry, regs);
2046 }
2047
2048exit_put:
2049 put_callchain_entry(rctx);
2050
2051 return entry;
2052}
2053
2054/*
2055 * Initialize the perf_event context in a task_struct:
2056 */
2057static void __perf_event_init_context(struct perf_event_context *ctx)
1790{ 2058{
1791 raw_spin_lock_init(&ctx->lock); 2059 raw_spin_lock_init(&ctx->lock);
1792 mutex_init(&ctx->mutex); 2060 mutex_init(&ctx->mutex);
@@ -1794,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1794 INIT_LIST_HEAD(&ctx->flexible_groups); 2062 INIT_LIST_HEAD(&ctx->flexible_groups);
1795 INIT_LIST_HEAD(&ctx->event_list); 2063 INIT_LIST_HEAD(&ctx->event_list);
1796 atomic_set(&ctx->refcount, 1); 2064 atomic_set(&ctx->refcount, 1);
1797 ctx->task = task;
1798} 2065}
1799 2066
1800static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2067static struct perf_event_context *
2068alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1801{ 2069{
1802 struct perf_event_context *ctx; 2070 struct perf_event_context *ctx;
1803 struct perf_cpu_context *cpuctx;
1804 struct task_struct *task;
1805 unsigned long flags;
1806 int err;
1807
1808 if (pid == -1 && cpu != -1) {
1809 /* Must be root to operate on a CPU event: */
1810 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1811 return ERR_PTR(-EACCES);
1812 2071
1813 if (cpu < 0 || cpu >= nr_cpumask_bits) 2072 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1814 return ERR_PTR(-EINVAL); 2073 if (!ctx)
2074 return NULL;
1815 2075
1816 /* 2076 __perf_event_init_context(ctx);
1817 * We could be clever and allow to attach a event to an 2077 if (task) {
1818 * offline CPU and activate it when the CPU comes up, but 2078 ctx->task = task;
1819 * that's for later. 2079 get_task_struct(task);
1820 */ 2080 }
1821 if (!cpu_online(cpu)) 2081 ctx->pmu = pmu;
1822 return ERR_PTR(-ENODEV);
1823 2082
1824 cpuctx = &per_cpu(perf_cpu_context, cpu); 2083 return ctx;
1825 ctx = &cpuctx->ctx; 2084}
1826 get_ctx(ctx);
1827 2085
1828 return ctx; 2086static struct task_struct *
1829 } 2087find_lively_task_by_vpid(pid_t vpid)
2088{
2089 struct task_struct *task;
2090 int err;
1830 2091
1831 rcu_read_lock(); 2092 rcu_read_lock();
1832 if (!pid) 2093 if (!vpid)
1833 task = current; 2094 task = current;
1834 else 2095 else
1835 task = find_task_by_vpid(pid); 2096 task = find_task_by_vpid(vpid);
1836 if (task) 2097 if (task)
1837 get_task_struct(task); 2098 get_task_struct(task);
1838 rcu_read_unlock(); 2099 rcu_read_unlock();
@@ -1852,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1852 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2113 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1853 goto errout; 2114 goto errout;
1854 2115
1855 retry: 2116 return task;
1856 ctx = perf_lock_task_context(task, &flags); 2117errout:
2118 put_task_struct(task);
2119 return ERR_PTR(err);
2120
2121}
2122
2123static struct perf_event_context *
2124find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2125{
2126 struct perf_event_context *ctx;
2127 struct perf_cpu_context *cpuctx;
2128 unsigned long flags;
2129 int ctxn, err;
2130
2131 if (!task && cpu != -1) {
2132 /* Must be root to operate on a CPU event: */
2133 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2134 return ERR_PTR(-EACCES);
2135
2136 if (cpu < 0 || cpu >= nr_cpumask_bits)
2137 return ERR_PTR(-EINVAL);
2138
2139 /*
2140 * We could be clever and allow to attach a event to an
2141 * offline CPU and activate it when the CPU comes up, but
2142 * that's for later.
2143 */
2144 if (!cpu_online(cpu))
2145 return ERR_PTR(-ENODEV);
2146
2147 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2148 ctx = &cpuctx->ctx;
2149 get_ctx(ctx);
2150
2151 return ctx;
2152 }
2153
2154 err = -EINVAL;
2155 ctxn = pmu->task_ctx_nr;
2156 if (ctxn < 0)
2157 goto errout;
2158
2159retry:
2160 ctx = perf_lock_task_context(task, ctxn, &flags);
1857 if (ctx) { 2161 if (ctx) {
1858 unclone_ctx(ctx); 2162 unclone_ctx(ctx);
1859 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2163 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1860 } 2164 }
1861 2165
1862 if (!ctx) { 2166 if (!ctx) {
1863 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2167 ctx = alloc_perf_context(pmu, task);
1864 err = -ENOMEM; 2168 err = -ENOMEM;
1865 if (!ctx) 2169 if (!ctx)
1866 goto errout; 2170 goto errout;
1867 __perf_event_init_context(ctx, task); 2171
1868 get_ctx(ctx); 2172 get_ctx(ctx);
1869 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2173
2174 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1870 /* 2175 /*
1871 * We raced with some other task; use 2176 * We raced with some other task; use
1872 * the context they set. 2177 * the context they set.
1873 */ 2178 */
2179 put_task_struct(task);
1874 kfree(ctx); 2180 kfree(ctx);
1875 goto retry; 2181 goto retry;
1876 } 2182 }
1877 get_task_struct(task);
1878 } 2183 }
1879 2184
1880 put_task_struct(task);
1881 return ctx; 2185 return ctx;
1882 2186
1883 errout: 2187errout:
1884 put_task_struct(task);
1885 return ERR_PTR(err); 2188 return ERR_PTR(err);
1886} 2189}
1887 2190
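Editor's note: find_get_context() still publishes a freshly allocated context with cmpxchg() and retries when another thread wins the race, only now against the per-context-number slot task->perf_event_ctxp[ctxn]. The install-or-retry pattern in isolation, as a C11 userspace sketch with invented names.

    /* Lock-free "install or retry" publication of a pointer; illustrative only. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct ctx { int id; };

    static _Atomic(struct ctx *) slot;      /* plays the role of the per-task slot */

    static struct ctx *get_ctx_slot(void)
    {
        for (;;) {
            struct ctx *cur = atomic_load(&slot);
            if (cur)
                return cur;                 /* someone already installed one */

            struct ctx *new = calloc(1, sizeof(*new));
            if (!new)
                return NULL;

            struct ctx *expected = NULL;
            if (atomic_compare_exchange_strong(&slot, &expected, new))
                return new;                 /* we won the race */

            free(new);                      /* we lost: drop ours and retry */
        }
    }

    int main(void)
    {
        struct ctx *a = get_ctx_slot();
        struct ctx *b = get_ctx_slot();
        printf("same context: %s\n", a == b ? "yes" : "no");
        free(a);
        return 0;
    }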
@@ -1898,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head)
1898 kfree(event); 2201 kfree(event);
1899} 2202}
1900 2203
1901static void perf_pending_sync(struct perf_event *event);
1902static void perf_buffer_put(struct perf_buffer *buffer); 2204static void perf_buffer_put(struct perf_buffer *buffer);
1903 2205
1904static void free_event(struct perf_event *event) 2206static void free_event(struct perf_event *event)
1905{ 2207{
1906 perf_pending_sync(event); 2208 irq_work_sync(&event->pending);
1907 2209
1908 if (!event->parent) { 2210 if (!event->parent) {
1909 atomic_dec(&nr_events); 2211 if (event->attach_state & PERF_ATTACH_TASK)
2212 jump_label_dec(&perf_task_events);
1910 if (event->attr.mmap || event->attr.mmap_data) 2213 if (event->attr.mmap || event->attr.mmap_data)
1911 atomic_dec(&nr_mmap_events); 2214 atomic_dec(&nr_mmap_events);
1912 if (event->attr.comm) 2215 if (event->attr.comm)
1913 atomic_dec(&nr_comm_events); 2216 atomic_dec(&nr_comm_events);
1914 if (event->attr.task) 2217 if (event->attr.task)
1915 atomic_dec(&nr_task_events); 2218 atomic_dec(&nr_task_events);
2219 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2220 put_callchain_buffers();
1916 } 2221 }
1917 2222
1918 if (event->buffer) { 2223 if (event->buffer) {
@@ -1923,7 +2228,9 @@ static void free_event(struct perf_event *event)
1923 if (event->destroy) 2228 if (event->destroy)
1924 event->destroy(event); 2229 event->destroy(event);
1925 2230
1926 put_ctx(event->ctx); 2231 if (event->ctx)
2232 put_ctx(event->ctx);
2233
1927 call_rcu(&event->rcu_head, free_event_rcu); 2234 call_rcu(&event->rcu_head, free_event_rcu);
1928} 2235}
1929 2236
@@ -2342,6 +2649,9 @@ int perf_event_task_disable(void)
2342 2649
2343static int perf_event_index(struct perf_event *event) 2650static int perf_event_index(struct perf_event *event)
2344{ 2651{
2652 if (event->hw.state & PERF_HES_STOPPED)
2653 return 0;
2654
2345 if (event->state != PERF_EVENT_STATE_ACTIVE) 2655 if (event->state != PERF_EVENT_STATE_ACTIVE)
2346 return 0; 2656 return 0;
2347 2657
@@ -2845,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event)
2845 } 3155 }
2846} 3156}
2847 3157
2848/* 3158static void perf_pending_event(struct irq_work *entry)
2849 * Pending wakeups
2850 *
2851 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2852 *
2853 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2854 * single linked list and use cmpxchg() to add entries lockless.
2855 */
2856
2857static void perf_pending_event(struct perf_pending_entry *entry)
2858{ 3159{
2859 struct perf_event *event = container_of(entry, 3160 struct perf_event *event = container_of(entry,
2860 struct perf_event, pending); 3161 struct perf_event, pending);
@@ -2870,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2870 } 3171 }
2871} 3172}
2872 3173
2873#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2874
2875static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2876 PENDING_TAIL,
2877};
2878
2879static void perf_pending_queue(struct perf_pending_entry *entry,
2880 void (*func)(struct perf_pending_entry *))
2881{
2882 struct perf_pending_entry **head;
2883
2884 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2885 return;
2886
2887 entry->func = func;
2888
2889 head = &get_cpu_var(perf_pending_head);
2890
2891 do {
2892 entry->next = *head;
2893 } while (cmpxchg(head, entry->next, entry) != entry->next);
2894
2895 set_perf_event_pending();
2896
2897 put_cpu_var(perf_pending_head);
2898}
2899
2900static int __perf_pending_run(void)
2901{
2902 struct perf_pending_entry *list;
2903 int nr = 0;
2904
2905 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2906 while (list != PENDING_TAIL) {
2907 void (*func)(struct perf_pending_entry *);
2908 struct perf_pending_entry *entry = list;
2909
2910 list = list->next;
2911
2912 func = entry->func;
2913 entry->next = NULL;
2914 /*
2915 * Ensure we observe the unqueue before we issue the wakeup,
2916 * so that we won't be waiting forever.
2917 * -- see perf_not_pending().
2918 */
2919 smp_wmb();
2920
2921 func(entry);
2922 nr++;
2923 }
2924
2925 return nr;
2926}
2927
2928static inline int perf_not_pending(struct perf_event *event)
2929{
2930 /*
2931 * If we flush on whatever cpu we run, there is a chance we don't
2932 * need to wait.
2933 */
2934 get_cpu();
2935 __perf_pending_run();
2936 put_cpu();
2937
2938 /*
2939 * Ensure we see the proper queue state before going to sleep
2940 * so that we do not miss the wakeup. -- see perf_pending_handle()
2941 */
2942 smp_rmb();
2943 return event->pending.next == NULL;
2944}
2945
2946static void perf_pending_sync(struct perf_event *event)
2947{
2948 wait_event(event->waitq, perf_not_pending(event));
2949}
2950
2951void perf_event_do_pending(void)
2952{
2953 __perf_pending_run();
2954}
2955
2956/*
2957 * Callchain support -- arch specific
2958 */
2959
2960__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2961{
2962 return NULL;
2963}
2964
2965
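Editor's note: the block deleted above was perf's private NMI-safe deferral machinery, a per-CPU singly linked list pushed with cmpxchg() and drained from interrupt context, now replaced by the generic irq_work facility merged alongside this patch; the weak arch perf_callchain() stub likewise gives way to the generic callchain code added earlier in this file. For reference, the push/drain idea looks roughly like this in userspace C11 (an analogue, not the kernel code).

    /* "Push with CAS, drain with exchange" list, mirroring the shape of the
     * removed perf_pending_queue()/__perf_pending_run(). */
    #include <stdatomic.h>
    #include <stdio.h>

    struct entry {
        struct entry *next;
        void (*func)(struct entry *);
    };

    #define PENDING_TAIL ((struct entry *)-1UL)     /* list terminator sentinel */

    static _Atomic(struct entry *) head;

    static void pending_queue(struct entry *e, void (*func)(struct entry *))
    {
        e->func = func;
        struct entry *old = atomic_load(&head);
        do {
            e->next = old;                          /* link in front of current head */
        } while (!atomic_compare_exchange_weak(&head, &old, e));
    }

    static void pending_run(void)
    {
        /* grab the whole list at once, leaving an empty one behind */
        struct entry *list = atomic_exchange(&head, PENDING_TAIL);

        while (list != PENDING_TAIL) {
            struct entry *e = list;
            list = list->next;
            e->next = NULL;
            e->func(e);
        }
    }

    static void hello(struct entry *e) { (void)e; puts("deferred work ran"); }

    int main(void)
    {
        struct entry e = { .next = NULL };
        atomic_init(&head, PENDING_TAIL);
        pending_queue(&e, hello);
        pending_run();
        return 0;
    }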
2966/* 3174/*
2967 * We assume there is only KVM supporting the callbacks. 3175 * We assume there is only KVM supporting the callbacks.
2968 * Later on, we might change it to a list if there is 3176 * Later on, we might change it to a list if there is
@@ -3012,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 3220
3013 if (handle->nmi) { 3221 if (handle->nmi) {
3014 handle->event->pending_wakeup = 1; 3222 handle->event->pending_wakeup = 1;
3015 perf_pending_queue(&handle->event->pending, 3223 irq_work_queue(&handle->event->pending);
3016 perf_pending_event);
3017 } else 3224 } else
3018 perf_event_wakeup(handle->event); 3225 perf_event_wakeup(handle->event);
3019} 3226}
@@ -3069,7 +3276,7 @@ again:
3069 if (handle->wakeup != local_read(&buffer->wakeup)) 3276 if (handle->wakeup != local_read(&buffer->wakeup))
3070 perf_output_wakeup(handle); 3277 perf_output_wakeup(handle);
3071 3278
3072 out: 3279out:
3073 preempt_enable(); 3280 preempt_enable();
3074} 3281}
3075 3282
@@ -3457,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3457 struct perf_output_handle handle; 3664 struct perf_output_handle handle;
3458 struct perf_event_header header; 3665 struct perf_event_header header;
3459 3666
3667 /* protect the callchain buffers */
3668 rcu_read_lock();
3669
3460 perf_prepare_sample(&header, data, event, regs); 3670 perf_prepare_sample(&header, data, event, regs);
3461 3671
3462 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3672 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3463 return; 3673 goto exit;
3464 3674
3465 perf_output_sample(&handle, &header, data, event); 3675 perf_output_sample(&handle, &header, data, event);
3466 3676
3467 perf_output_end(&handle); 3677 perf_output_end(&handle);
3678
3679exit:
3680 rcu_read_unlock();
3468} 3681}
3469 3682
3470/* 3683/*
@@ -3578,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3578static void perf_event_task_event(struct perf_task_event *task_event) 3791static void perf_event_task_event(struct perf_task_event *task_event)
3579{ 3792{
3580 struct perf_cpu_context *cpuctx; 3793 struct perf_cpu_context *cpuctx;
3581 struct perf_event_context *ctx = task_event->task_ctx; 3794 struct perf_event_context *ctx;
3795 struct pmu *pmu;
3796 int ctxn;
3582 3797
3583 rcu_read_lock(); 3798 rcu_read_lock();
3584 cpuctx = &get_cpu_var(perf_cpu_context); 3799 list_for_each_entry_rcu(pmu, &pmus, entry) {
3585 perf_event_task_ctx(&cpuctx->ctx, task_event); 3800 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3586 if (!ctx) 3801 perf_event_task_ctx(&cpuctx->ctx, task_event);
3587 ctx = rcu_dereference(current->perf_event_ctxp); 3802
3588 if (ctx) 3803 ctx = task_event->task_ctx;
3589 perf_event_task_ctx(ctx, task_event); 3804 if (!ctx) {
3590 put_cpu_var(perf_cpu_context); 3805 ctxn = pmu->task_ctx_nr;
3806 if (ctxn < 0)
3807 goto next;
3808 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3809 }
3810 if (ctx)
3811 perf_event_task_ctx(ctx, task_event);
3812next:
3813 put_cpu_ptr(pmu->pmu_cpu_context);
3814 }
3591 rcu_read_unlock(); 3815 rcu_read_unlock();
3592} 3816}
3593 3817
@@ -3692,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3692{ 3916{
3693 struct perf_cpu_context *cpuctx; 3917 struct perf_cpu_context *cpuctx;
3694 struct perf_event_context *ctx; 3918 struct perf_event_context *ctx;
3695 unsigned int size;
3696 char comm[TASK_COMM_LEN]; 3919 char comm[TASK_COMM_LEN];
3920 unsigned int size;
3921 struct pmu *pmu;
3922 int ctxn;
3697 3923
3698 memset(comm, 0, sizeof(comm)); 3924 memset(comm, 0, sizeof(comm));
3699 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3925 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3705,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3705 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3931 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3706 3932
3707 rcu_read_lock(); 3933 rcu_read_lock();
3708 cpuctx = &get_cpu_var(perf_cpu_context); 3934 list_for_each_entry_rcu(pmu, &pmus, entry) {
3709 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3935 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3710 ctx = rcu_dereference(current->perf_event_ctxp); 3936 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3711 if (ctx) 3937
3712 perf_event_comm_ctx(ctx, comm_event); 3938 ctxn = pmu->task_ctx_nr;
3713 put_cpu_var(perf_cpu_context); 3939 if (ctxn < 0)
3940 goto next;
3941
3942 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3943 if (ctx)
3944 perf_event_comm_ctx(ctx, comm_event);
3945next:
3946 put_cpu_ptr(pmu->pmu_cpu_context);
3947 }
3714 rcu_read_unlock(); 3948 rcu_read_unlock();
3715} 3949}
3716 3950
3717void perf_event_comm(struct task_struct *task) 3951void perf_event_comm(struct task_struct *task)
3718{ 3952{
3719 struct perf_comm_event comm_event; 3953 struct perf_comm_event comm_event;
3954 struct perf_event_context *ctx;
3955 int ctxn;
3956
3957 for_each_task_context_nr(ctxn) {
3958 ctx = task->perf_event_ctxp[ctxn];
3959 if (!ctx)
3960 continue;
3720 3961
3721 if (task->perf_event_ctxp) 3962 perf_event_enable_on_exec(ctx);
3722 perf_event_enable_on_exec(task); 3963 }
3723 3964
3724 if (!atomic_read(&nr_comm_events)) 3965 if (!atomic_read(&nr_comm_events))
3725 return; 3966 return;
@@ -3821,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3821 char tmp[16]; 4062 char tmp[16];
3822 char *buf = NULL; 4063 char *buf = NULL;
3823 const char *name; 4064 const char *name;
4065 struct pmu *pmu;
4066 int ctxn;
3824 4067
3825 memset(tmp, 0, sizeof(tmp)); 4068 memset(tmp, 0, sizeof(tmp));
3826 4069
@@ -3873,12 +4116,23 @@ got_name:
3873 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4116 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3874 4117
3875 rcu_read_lock(); 4118 rcu_read_lock();
3876 cpuctx = &get_cpu_var(perf_cpu_context); 4119 list_for_each_entry_rcu(pmu, &pmus, entry) {
3877 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4120 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3878 ctx = rcu_dereference(current->perf_event_ctxp); 4121 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3879 if (ctx) 4122 vma->vm_flags & VM_EXEC);
3880 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4123
3881 put_cpu_var(perf_cpu_context); 4124 ctxn = pmu->task_ctx_nr;
4125 if (ctxn < 0)
4126 goto next;
4127
4128 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4129 if (ctx) {
4130 perf_event_mmap_ctx(ctx, mmap_event,
4131 vma->vm_flags & VM_EXEC);
4132 }
4133next:
4134 put_cpu_ptr(pmu->pmu_cpu_context);
4135 }
3882 rcu_read_unlock(); 4136 rcu_read_unlock();
3883 4137
3884 kfree(buf); 4138 kfree(buf);
@@ -3960,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3960 struct hw_perf_event *hwc = &event->hw; 4214 struct hw_perf_event *hwc = &event->hw;
3961 int ret = 0; 4215 int ret = 0;
3962 4216
3963 throttle = (throttle && event->pmu->unthrottle != NULL);
3964
3965 if (!throttle) { 4217 if (!throttle) {
3966 hwc->interrupts++; 4218 hwc->interrupts++;
3967 } else { 4219 } else {
@@ -4004,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4004 event->pending_kill = POLL_HUP; 4256 event->pending_kill = POLL_HUP;
4005 if (nmi) { 4257 if (nmi) {
4006 event->pending_disable = 1; 4258 event->pending_disable = 1;
4007 perf_pending_queue(&event->pending, 4259 irq_work_queue(&event->pending);
4008 perf_pending_event);
4009 } else 4260 } else
4010 perf_event_disable(event); 4261 perf_event_disable(event);
4011 } 4262 }
@@ -4029,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4029 * Generic software event infrastructure 4280 * Generic software event infrastructure
4030 */ 4281 */
4031 4282
4283struct swevent_htable {
4284 struct swevent_hlist *swevent_hlist;
4285 struct mutex hlist_mutex;
4286 int hlist_refcount;
4287
4288 /* Recursion avoidance in each contexts */
4289 int recursion[PERF_NR_CONTEXTS];
4290};
4291
4292static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4293
4032/* 4294/*
4033 * We directly increment event->count and keep a second value in 4295 * We directly increment event->count and keep a second value in
4034 * event->hw.period_left to count intervals. This period event 4296 * event->hw.period_left to count intervals. This period event
@@ -4086,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4086 } 4348 }
4087} 4349}
4088 4350
4089static void perf_swevent_add(struct perf_event *event, u64 nr, 4351static void perf_swevent_event(struct perf_event *event, u64 nr,
4090 int nmi, struct perf_sample_data *data, 4352 int nmi, struct perf_sample_data *data,
4091 struct pt_regs *regs) 4353 struct pt_regs *regs)
4092{ 4354{
@@ -4112,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4112static int perf_exclude_event(struct perf_event *event, 4374static int perf_exclude_event(struct perf_event *event,
4113 struct pt_regs *regs) 4375 struct pt_regs *regs)
4114{ 4376{
4377 if (event->hw.state & PERF_HES_STOPPED)
4378 return 0;
4379
4115 if (regs) { 4380 if (regs) {
4116 if (event->attr.exclude_user && user_mode(regs)) 4381 if (event->attr.exclude_user && user_mode(regs))
4117 return 1; 4382 return 1;
@@ -4158,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4158 4423
4159/* For the read side: events when they trigger */ 4424/* For the read side: events when they trigger */
4160static inline struct hlist_head * 4425static inline struct hlist_head *
4161find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4426find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4162{ 4427{
4163 struct swevent_hlist *hlist; 4428 struct swevent_hlist *hlist;
4164 4429
4165 hlist = rcu_dereference(ctx->swevent_hlist); 4430 hlist = rcu_dereference(swhash->swevent_hlist);
4166 if (!hlist) 4431 if (!hlist)
4167 return NULL; 4432 return NULL;
4168 4433
@@ -4171,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4171 4436
4172/* For the event head insertion and removal in the hlist */ 4437/* For the event head insertion and removal in the hlist */
4173static inline struct hlist_head * 4438static inline struct hlist_head *
4174find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4439find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4175{ 4440{
4176 struct swevent_hlist *hlist; 4441 struct swevent_hlist *hlist;
4177 u32 event_id = event->attr.config; 4442 u32 event_id = event->attr.config;
@@ -4182,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4182 * and release. Which makes the protected version suitable here. 4447 * and release. Which makes the protected version suitable here.
4183 * The context lock guarantees that. 4448 * The context lock guarantees that.
4184 */ 4449 */
4185 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4450 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4186 lockdep_is_held(&event->ctx->lock)); 4451 lockdep_is_held(&event->ctx->lock));
4187 if (!hlist) 4452 if (!hlist)
4188 return NULL; 4453 return NULL;
@@ -4195,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4195 struct perf_sample_data *data, 4460 struct perf_sample_data *data,
4196 struct pt_regs *regs) 4461 struct pt_regs *regs)
4197{ 4462{
4198 struct perf_cpu_context *cpuctx; 4463 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4199 struct perf_event *event; 4464 struct perf_event *event;
4200 struct hlist_node *node; 4465 struct hlist_node *node;
4201 struct hlist_head *head; 4466 struct hlist_head *head;
4202 4467
4203 cpuctx = &__get_cpu_var(perf_cpu_context);
4204
4205 rcu_read_lock(); 4468 rcu_read_lock();
4206 4469 head = find_swevent_head_rcu(swhash, type, event_id);
4207 head = find_swevent_head_rcu(cpuctx, type, event_id);
4208
4209 if (!head) 4470 if (!head)
4210 goto end; 4471 goto end;
4211 4472
4212 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4473 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4213 if (perf_swevent_match(event, type, event_id, data, regs)) 4474 if (perf_swevent_match(event, type, event_id, data, regs))
4214 perf_swevent_add(event, nr, nmi, data, regs); 4475 perf_swevent_event(event, nr, nmi, data, regs);
4215 } 4476 }
4216end: 4477end:
4217 rcu_read_unlock(); 4478 rcu_read_unlock();
@@ -4219,33 +4480,17 @@ end:
4219 4480
4220int perf_swevent_get_recursion_context(void) 4481int perf_swevent_get_recursion_context(void)
4221{ 4482{
4222 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4483 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4223 int rctx;
4224
4225 if (in_nmi())
4226 rctx = 3;
4227 else if (in_irq())
4228 rctx = 2;
4229 else if (in_softirq())
4230 rctx = 1;
4231 else
4232 rctx = 0;
4233
4234 if (cpuctx->recursion[rctx])
4235 return -1;
4236 4484
4237 cpuctx->recursion[rctx]++; 4485 return get_recursion_context(swhash->recursion);
4238 barrier();
4239
4240 return rctx;
4241} 4486}
4242EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4487EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4243 4488
4244void inline perf_swevent_put_recursion_context(int rctx) 4489void inline perf_swevent_put_recursion_context(int rctx)
4245{ 4490{
4246 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4491 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4247 barrier(); 4492
4248 cpuctx->recursion[rctx]--; 4493 put_recursion_context(swhash->recursion, rctx);
4249} 4494}
4250 4495
4251void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4496void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4271,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event)
4271{ 4516{
4272} 4517}
4273 4518
4274static int perf_swevent_enable(struct perf_event *event) 4519static int perf_swevent_add(struct perf_event *event, int flags)
4275{ 4520{
4521 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4276 struct hw_perf_event *hwc = &event->hw; 4522 struct hw_perf_event *hwc = &event->hw;
4277 struct perf_cpu_context *cpuctx;
4278 struct hlist_head *head; 4523 struct hlist_head *head;
4279 4524
4280 cpuctx = &__get_cpu_var(perf_cpu_context);
4281
4282 if (hwc->sample_period) { 4525 if (hwc->sample_period) {
4283 hwc->last_period = hwc->sample_period; 4526 hwc->last_period = hwc->sample_period;
4284 perf_swevent_set_period(event); 4527 perf_swevent_set_period(event);
4285 } 4528 }
4286 4529
4287 head = find_swevent_head(cpuctx, event); 4530 hwc->state = !(flags & PERF_EF_START);
4531
4532 head = find_swevent_head(swhash, event);
4288 if (WARN_ON_ONCE(!head)) 4533 if (WARN_ON_ONCE(!head))
4289 return -EINVAL; 4534 return -EINVAL;
4290 4535
@@ -4293,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event)
4293 return 0; 4538 return 0;
4294} 4539}
4295 4540
4296static void perf_swevent_disable(struct perf_event *event) 4541static void perf_swevent_del(struct perf_event *event, int flags)
4297{ 4542{
4298 hlist_del_rcu(&event->hlist_entry); 4543 hlist_del_rcu(&event->hlist_entry);
4299} 4544}
4300 4545
4301static void perf_swevent_void(struct perf_event *event) 4546static void perf_swevent_start(struct perf_event *event, int flags)
4302{
4303}
4304
4305static int perf_swevent_int(struct perf_event *event)
4306{
4307 return 0;
4308}
4309
4310static const struct pmu perf_ops_generic = {
4311 .enable = perf_swevent_enable,
4312 .disable = perf_swevent_disable,
4313 .start = perf_swevent_int,
4314 .stop = perf_swevent_void,
4315 .read = perf_swevent_read,
4316 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4317};
4318
4319/*
4320 * hrtimer based swevent callback
4321 */
4322
4323static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4324{ 4547{
4325 enum hrtimer_restart ret = HRTIMER_RESTART; 4548 event->hw.state = 0;
4326 struct perf_sample_data data;
4327 struct pt_regs *regs;
4328 struct perf_event *event;
4329 u64 period;
4330
4331 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4332 event->pmu->read(event);
4333
4334 perf_sample_data_init(&data, 0);
4335 data.period = event->hw.last_period;
4336 regs = get_irq_regs();
4337
4338 if (regs && !perf_exclude_event(event, regs)) {
4339 if (!(event->attr.exclude_idle && current->pid == 0))
4340 if (perf_event_overflow(event, 0, &data, regs))
4341 ret = HRTIMER_NORESTART;
4342 }
4343
4344 period = max_t(u64, 10000, event->hw.sample_period);
4345 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4346
4347 return ret;
4348} 4549}
4349 4550
4350static void perf_swevent_start_hrtimer(struct perf_event *event) 4551static void perf_swevent_stop(struct perf_event *event, int flags)
4351{ 4552{
4352 struct hw_perf_event *hwc = &event->hw; 4553 event->hw.state = PERF_HES_STOPPED;
4353
4354 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4355 hwc->hrtimer.function = perf_swevent_hrtimer;
4356 if (hwc->sample_period) {
4357 u64 period;
4358
4359 if (hwc->remaining) {
4360 if (hwc->remaining < 0)
4361 period = 10000;
4362 else
4363 period = hwc->remaining;
4364 hwc->remaining = 0;
4365 } else {
4366 period = max_t(u64, 10000, hwc->sample_period);
4367 }
4368 __hrtimer_start_range_ns(&hwc->hrtimer,
4369 ns_to_ktime(period), 0,
4370 HRTIMER_MODE_REL, 0);
4371 }
4372}
4373
4374static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4375{
4376 struct hw_perf_event *hwc = &event->hw;
4377
4378 if (hwc->sample_period) {
4379 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4380 hwc->remaining = ktime_to_ns(remaining);
4381
4382 hrtimer_cancel(&hwc->hrtimer);
4383 }
4384}
4385
4386/*
4387 * Software event: cpu wall time clock
4388 */
4389
4390static void cpu_clock_perf_event_update(struct perf_event *event)
4391{
4392 int cpu = raw_smp_processor_id();
4393 s64 prev;
4394 u64 now;
4395
4396 now = cpu_clock(cpu);
4397 prev = local64_xchg(&event->hw.prev_count, now);
4398 local64_add(now - prev, &event->count);
4399}
4400
4401static int cpu_clock_perf_event_enable(struct perf_event *event)
4402{
4403 struct hw_perf_event *hwc = &event->hw;
4404 int cpu = raw_smp_processor_id();
4405
4406 local64_set(&hwc->prev_count, cpu_clock(cpu));
4407 perf_swevent_start_hrtimer(event);
4408
4409 return 0;
4410}
4411
4412static void cpu_clock_perf_event_disable(struct perf_event *event)
4413{
4414 perf_swevent_cancel_hrtimer(event);
4415 cpu_clock_perf_event_update(event);
4416}
4417
4418static void cpu_clock_perf_event_read(struct perf_event *event)
4419{
4420 cpu_clock_perf_event_update(event);
4421}
4422
4423static const struct pmu perf_ops_cpu_clock = {
4424 .enable = cpu_clock_perf_event_enable,
4425 .disable = cpu_clock_perf_event_disable,
4426 .read = cpu_clock_perf_event_read,
4427};
4428
4429/*
4430 * Software event: task time clock
4431 */
4432
4433static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4434{
4435 u64 prev;
4436 s64 delta;
4437
4438 prev = local64_xchg(&event->hw.prev_count, now);
4439 delta = now - prev;
4440 local64_add(delta, &event->count);
4441}
4442
4443static int task_clock_perf_event_enable(struct perf_event *event)
4444{
4445 struct hw_perf_event *hwc = &event->hw;
4446 u64 now;
4447
4448 now = event->ctx->time;
4449
4450 local64_set(&hwc->prev_count, now);
4451
4452 perf_swevent_start_hrtimer(event);
4453
4454 return 0;
4455}
4456
4457static void task_clock_perf_event_disable(struct perf_event *event)
4458{
4459 perf_swevent_cancel_hrtimer(event);
4460 task_clock_perf_event_update(event, event->ctx->time);
4461
4462}
4463
4464static void task_clock_perf_event_read(struct perf_event *event)
4465{
4466 u64 time;
4467
4468 if (!in_nmi()) {
4469 update_context_time(event->ctx);
4470 time = event->ctx->time;
4471 } else {
4472 u64 now = perf_clock();
4473 u64 delta = now - event->ctx->timestamp;
4474 time = event->ctx->time + delta;
4475 }
4476
4477 task_clock_perf_event_update(event, time);
4478} 4554}
4479 4555
4480static const struct pmu perf_ops_task_clock = {
4481 .enable = task_clock_perf_event_enable,
4482 .disable = task_clock_perf_event_disable,
4483 .read = task_clock_perf_event_read,
4484};
4485
4486/* Deref the hlist from the update side */ 4556/* Deref the hlist from the update side */
4487static inline struct swevent_hlist * 4557static inline struct swevent_hlist *
4488swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4558swevent_hlist_deref(struct swevent_htable *swhash)
4489{ 4559{
4490 return rcu_dereference_protected(cpuctx->swevent_hlist, 4560 return rcu_dereference_protected(swhash->swevent_hlist,
4491 lockdep_is_held(&cpuctx->hlist_mutex)); 4561 lockdep_is_held(&swhash->hlist_mutex));
4492} 4562}
4493 4563
4494static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4564static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4499,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4499 kfree(hlist); 4569 kfree(hlist);
4500} 4570}
4501 4571
4502static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4572static void swevent_hlist_release(struct swevent_htable *swhash)
4503{ 4573{
4504 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4574 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4505 4575
4506 if (!hlist) 4576 if (!hlist)
4507 return; 4577 return;
4508 4578
4509 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4579 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4510 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4580 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4511} 4581}
4512 4582
4513static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4583static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4514{ 4584{
4515 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4585 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4516 4586
4517 mutex_lock(&cpuctx->hlist_mutex); 4587 mutex_lock(&swhash->hlist_mutex);
4518 4588
4519 if (!--cpuctx->hlist_refcount) 4589 if (!--swhash->hlist_refcount)
4520 swevent_hlist_release(cpuctx); 4590 swevent_hlist_release(swhash);
4521 4591
4522 mutex_unlock(&cpuctx->hlist_mutex); 4592 mutex_unlock(&swhash->hlist_mutex);
4523} 4593}
4524 4594
4525static void swevent_hlist_put(struct perf_event *event) 4595static void swevent_hlist_put(struct perf_event *event)
@@ -4537,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event)
4537 4607
4538static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4608static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4539{ 4609{
4540 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4610 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4541 int err = 0; 4611 int err = 0;
4542 4612
4543 mutex_lock(&cpuctx->hlist_mutex); 4613 mutex_lock(&swhash->hlist_mutex);
4544 4614
4545 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4615 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4546 struct swevent_hlist *hlist; 4616 struct swevent_hlist *hlist;
4547 4617
4548 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4618 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4550,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4550 err = -ENOMEM; 4620 err = -ENOMEM;
4551 goto exit; 4621 goto exit;
4552 } 4622 }
4553 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4623 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4554 } 4624 }
4555 cpuctx->hlist_refcount++; 4625 swhash->hlist_refcount++;
4556 exit: 4626exit:
4557 mutex_unlock(&cpuctx->hlist_mutex); 4627 mutex_unlock(&swhash->hlist_mutex);
4558 4628
4559 return err; 4629 return err;
4560} 4630}
@@ -4578,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event)
4578 put_online_cpus(); 4648 put_online_cpus();
4579 4649
4580 return 0; 4650 return 0;
4581 fail: 4651fail:
4582 for_each_possible_cpu(cpu) { 4652 for_each_possible_cpu(cpu) {
4583 if (cpu == failed_cpu) 4653 if (cpu == failed_cpu)
4584 break; 4654 break;
@@ -4589,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event)
4589 return err; 4659 return err;
4590} 4660}
4591 4661
4592#ifdef CONFIG_EVENT_TRACING 4662atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4663
4664static void sw_perf_event_destroy(struct perf_event *event)
4665{
4666 u64 event_id = event->attr.config;
4667
4668 WARN_ON(event->parent);
4669
4670 jump_label_dec(&perf_swevent_enabled[event_id]);
4671 swevent_hlist_put(event);
4672}
4673
4674static int perf_swevent_init(struct perf_event *event)
4675{
4676 int event_id = event->attr.config;
4677
4678 if (event->attr.type != PERF_TYPE_SOFTWARE)
4679 return -ENOENT;
4680
4681 switch (event_id) {
4682 case PERF_COUNT_SW_CPU_CLOCK:
4683 case PERF_COUNT_SW_TASK_CLOCK:
4684 return -ENOENT;
4593 4685
4594static const struct pmu perf_ops_tracepoint = { 4686 default:
4595 .enable = perf_trace_enable, 4687 break;
4596 .disable = perf_trace_disable, 4688 }
4597 .start = perf_swevent_int, 4689
4598 .stop = perf_swevent_void, 4690 if (event_id > PERF_COUNT_SW_MAX)
4691 return -ENOENT;
4692
4693 if (!event->parent) {
4694 int err;
4695
4696 err = swevent_hlist_get(event);
4697 if (err)
4698 return err;
4699
4700 jump_label_inc(&perf_swevent_enabled[event_id]);
4701 event->destroy = sw_perf_event_destroy;
4702 }
4703
4704 return 0;
4705}
4706
4707static struct pmu perf_swevent = {
4708 .task_ctx_nr = perf_sw_context,
4709
4710 .event_init = perf_swevent_init,
4711 .add = perf_swevent_add,
4712 .del = perf_swevent_del,
4713 .start = perf_swevent_start,
4714 .stop = perf_swevent_stop,
4599 .read = perf_swevent_read, 4715 .read = perf_swevent_read,
4600 .unthrottle = perf_swevent_void,
4601}; 4716};
4602 4717
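Editor's note: struct pmu has moved from the old enable/disable/unthrottle callbacks (see the perf_ops_* tables deleted above) to the event_init/add/del/start/stop/read shape, with PERF_EF_START/PERF_EF_UPDATE/PERF_EF_RELOAD flags and a PERF_HES_STOPPED state bit. A toy vtable with the same lifecycle; the names and flag values below are stand-ins, and toy_add() only mirrors what perf_swevent_add() does with hwc->state.

    /* Toy vtable with the new callback shape; values and names are illustrative. */
    #include <stdio.h>

    #define EF_START    0x1     /* stand-in for PERF_EF_START   */
    #define HES_STOPPED 0x1     /* stand-in for PERF_HES_STOPPED */

    struct event { int state; long count; };

    struct pmu_ops {
        int  (*add)(struct event *, int flags);    /* attach to the CPU   */
        void (*del)(struct event *, int flags);    /* detach from the CPU */
        void (*start)(struct event *, int flags);  /* start counting      */
        void (*stop)(struct event *, int flags);   /* stop counting       */
    };

    static int  toy_add(struct event *e, int flags)
    {
        e->state = (flags & EF_START) ? 0 : HES_STOPPED;   /* as in perf_swevent_add() */
        return 0;
    }
    static void toy_del(struct event *e, int flags)   { (void)e; (void)flags; }
    static void toy_start(struct event *e, int flags) { (void)flags; e->state = 0; }
    static void toy_stop(struct event *e, int flags)  { (void)flags; e->state = HES_STOPPED; }

    static const struct pmu_ops toy_pmu = { toy_add, toy_del, toy_start, toy_stop };

    int main(void)
    {
        struct event e = { 0, 0 };

        toy_pmu.add(&e, EF_START);               /* context sched-in     */
        toy_pmu.stop(&e, 0);                     /* e.g. throttling      */
        printf("stopped=%d\n", e.state & HES_STOPPED);
        toy_pmu.start(&e, 0);                    /* unthrottle           */
        toy_pmu.del(&e, 0);                      /* context sched-out    */
        return 0;
    }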
4718#ifdef CONFIG_EVENT_TRACING
4719
4603static int perf_tp_filter_match(struct perf_event *event, 4720static int perf_tp_filter_match(struct perf_event *event,
4604 struct perf_sample_data *data) 4721 struct perf_sample_data *data)
4605{ 4722{
@@ -4643,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4643 4760
4644 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4761 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4645 if (perf_tp_event_match(event, &data, regs)) 4762 if (perf_tp_event_match(event, &data, regs))
4646 perf_swevent_add(event, count, 1, &data, regs); 4763 perf_swevent_event(event, count, 1, &data, regs);
4647 } 4764 }
4648 4765
4649 perf_swevent_put_recursion_context(rctx); 4766 perf_swevent_put_recursion_context(rctx);
@@ -4655,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4655 perf_trace_destroy(event); 4772 perf_trace_destroy(event);
4656} 4773}
4657 4774
4658static const struct pmu *tp_perf_event_init(struct perf_event *event) 4775static int perf_tp_event_init(struct perf_event *event)
4659{ 4776{
4660 int err; 4777 int err;
4661 4778
4779 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4780 return -ENOENT;
4781
4662 /* 4782 /*
4663 * Raw tracepoint data is a severe data leak, only allow root to 4783 * Raw tracepoint data is a severe data leak, only allow root to
4664 * have these. 4784 * have these.
@@ -4666,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4666 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4786 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4667 perf_paranoid_tracepoint_raw() && 4787 perf_paranoid_tracepoint_raw() &&
4668 !capable(CAP_SYS_ADMIN)) 4788 !capable(CAP_SYS_ADMIN))
4669 return ERR_PTR(-EPERM); 4789 return -EPERM;
4670 4790
4671 err = perf_trace_init(event); 4791 err = perf_trace_init(event);
4672 if (err) 4792 if (err)
4673 return NULL; 4793 return err;
4674 4794
4675 event->destroy = tp_perf_event_destroy; 4795 event->destroy = tp_perf_event_destroy;
4676 4796
4677 return &perf_ops_tracepoint; 4797 return 0;
4798}
4799
4800static struct pmu perf_tracepoint = {
4801 .task_ctx_nr = perf_sw_context,
4802
4803 .event_init = perf_tp_event_init,
4804 .add = perf_trace_add,
4805 .del = perf_trace_del,
4806 .start = perf_swevent_start,
4807 .stop = perf_swevent_stop,
4808 .read = perf_swevent_read,
4809};
4810
4811static inline void perf_tp_register(void)
4812{
4813 perf_pmu_register(&perf_tracepoint);
4678} 4814}
4679 4815
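As with software events, the tracepoint pmu is reached from user space through perf_event_open() with attr.type = PERF_TYPE_TRACEPOINT and attr.config set to a tracepoint id. A hedged sketch (not from the commit; the debugfs mount point, the sched_switch tracepoint, and sufficient privilege are assumptions about the running system, and sys_perf_event_open() is the same hypothetical wrapper as above):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        /* Tracepoint id; the debugfs path is an assumption about this system. */
        FILE *f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
        struct perf_event_attr attr;
        uint64_t count = 0;
        long long id = -1;
        int fd;

        if (!f || fscanf(f, "%lld", &id) != 1) {
                perror("tracepoint id");
                return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_TRACEPOINT;       /* routed to perf_tracepoint above */
        attr.config = id;

        fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        sleep(1);
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("sched_switch events: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
}
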
4680static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4816static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4702,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event)
4702 4838
4703#else 4839#else
4704 4840
4705static const struct pmu *tp_perf_event_init(struct perf_event *event) 4841static inline void perf_tp_register(void)
4706{ 4842{
4707 return NULL;
4708} 4843}
4709 4844
4710static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4845static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4719,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event)
4719#endif /* CONFIG_EVENT_TRACING */ 4854#endif /* CONFIG_EVENT_TRACING */
4720 4855
4721#ifdef CONFIG_HAVE_HW_BREAKPOINT 4856#ifdef CONFIG_HAVE_HW_BREAKPOINT
4722static void bp_perf_event_destroy(struct perf_event *event) 4857void perf_bp_event(struct perf_event *bp, void *data)
4723{ 4858{
4724 release_bp_slot(event); 4859 struct perf_sample_data sample;
4860 struct pt_regs *regs = data;
4861
4862 perf_sample_data_init(&sample, bp->attr.bp_addr);
4863
4864 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4865 perf_swevent_event(bp, 1, 1, &sample, regs);
4725} 4866}
4867#endif
4726 4868
4727static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4869/*
4870 * hrtimer based swevent callback
4871 */
4872
4873static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4728{ 4874{
4729 int err; 4875 enum hrtimer_restart ret = HRTIMER_RESTART;
4876 struct perf_sample_data data;
4877 struct pt_regs *regs;
4878 struct perf_event *event;
4879 u64 period;
4730 4880
4731 err = register_perf_hw_breakpoint(bp); 4881 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4732 if (err) 4882 event->pmu->read(event);
4733 return ERR_PTR(err); 4883
4884 perf_sample_data_init(&data, 0);
4885 data.period = event->hw.last_period;
4886 regs = get_irq_regs();
4887
4888 if (regs && !perf_exclude_event(event, regs)) {
4889 if (!(event->attr.exclude_idle && current->pid == 0))
4890 if (perf_event_overflow(event, 0, &data, regs))
4891 ret = HRTIMER_NORESTART;
4892 }
4893
4894 period = max_t(u64, 10000, event->hw.sample_period);
4895 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4734 4896
4735 bp->destroy = bp_perf_event_destroy; 4897 return ret;
4898}
4736 4899
4737 return &perf_ops_bp; 4900static void perf_swevent_start_hrtimer(struct perf_event *event)
4901{
4902 struct hw_perf_event *hwc = &event->hw;
4903
4904 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4905 hwc->hrtimer.function = perf_swevent_hrtimer;
4906 if (hwc->sample_period) {
4907 s64 period = local64_read(&hwc->period_left);
4908
4909 if (period) {
4910 if (period < 0)
4911 period = 10000;
4912
4913 local64_set(&hwc->period_left, 0);
4914 } else {
4915 period = max_t(u64, 10000, hwc->sample_period);
4916 }
4917 __hrtimer_start_range_ns(&hwc->hrtimer,
4918 ns_to_ktime(period), 0,
4919 HRTIMER_MODE_REL_PINNED, 0);
4920 }
4738} 4921}
4739 4922
4740void perf_bp_event(struct perf_event *bp, void *data) 4923static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4741{ 4924{
4742 struct perf_sample_data sample; 4925 struct hw_perf_event *hwc = &event->hw;
4743 struct pt_regs *regs = data;
4744 4926
4745 perf_sample_data_init(&sample, bp->attr.bp_addr); 4927 if (hwc->sample_period) {
4928 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4929 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4746 4930
4747 if (!perf_exclude_event(bp, regs)) 4931 hrtimer_cancel(&hwc->hrtimer);
4748 perf_swevent_add(bp, 1, 1, &sample, regs); 4932 }
4749} 4933}
4750#else 4934
4751static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4935/*
4936 * Software event: cpu wall time clock
4937 */
4938
4939static void cpu_clock_event_update(struct perf_event *event)
4752{ 4940{
4753 return NULL; 4941 s64 prev;
4942 u64 now;
4943
4944 now = local_clock();
4945 prev = local64_xchg(&event->hw.prev_count, now);
4946 local64_add(now - prev, &event->count);
4754} 4947}
4755 4948
4756void perf_bp_event(struct perf_event *bp, void *regs) 4949static void cpu_clock_event_start(struct perf_event *event, int flags)
4757{ 4950{
4951 local64_set(&event->hw.prev_count, local_clock());
4952 perf_swevent_start_hrtimer(event);
4758} 4953}
4759#endif
4760 4954
4761atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4955static void cpu_clock_event_stop(struct perf_event *event, int flags)
4956{
4957 perf_swevent_cancel_hrtimer(event);
4958 cpu_clock_event_update(event);
4959}
4762 4960
4763static void sw_perf_event_destroy(struct perf_event *event) 4961static int cpu_clock_event_add(struct perf_event *event, int flags)
4764{ 4962{
4765 u64 event_id = event->attr.config; 4963 if (flags & PERF_EF_START)
4964 cpu_clock_event_start(event, flags);
4766 4965
4767 WARN_ON(event->parent); 4966 return 0;
4967}
4768 4968
4769 atomic_dec(&perf_swevent_enabled[event_id]); 4969static void cpu_clock_event_del(struct perf_event *event, int flags)
4770 swevent_hlist_put(event); 4970{
4971 cpu_clock_event_stop(event, flags);
4771} 4972}
4772 4973
4773static const struct pmu *sw_perf_event_init(struct perf_event *event) 4974static void cpu_clock_event_read(struct perf_event *event)
4774{ 4975{
4775 const struct pmu *pmu = NULL; 4976 cpu_clock_event_update(event);
4776 u64 event_id = event->attr.config; 4977}
4978
4979static int cpu_clock_event_init(struct perf_event *event)
4980{
4981 if (event->attr.type != PERF_TYPE_SOFTWARE)
4982 return -ENOENT;
4983
4984 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
4985 return -ENOENT;
4986
4987 return 0;
4988}
4777 4989
4990static struct pmu perf_cpu_clock = {
4991 .task_ctx_nr = perf_sw_context,
4992
4993 .event_init = cpu_clock_event_init,
4994 .add = cpu_clock_event_add,
4995 .del = cpu_clock_event_del,
4996 .start = cpu_clock_event_start,
4997 .stop = cpu_clock_event_stop,
4998 .read = cpu_clock_event_read,
4999};
5000
5001/*
5002 * Software event: task time clock
5003 */
5004
5005static void task_clock_event_update(struct perf_event *event, u64 now)
5006{
5007 u64 prev;
5008 s64 delta;
5009
5010 prev = local64_xchg(&event->hw.prev_count, now);
5011 delta = now - prev;
5012 local64_add(delta, &event->count);
5013}
5014
5015static void task_clock_event_start(struct perf_event *event, int flags)
5016{
5017 local64_set(&event->hw.prev_count, event->ctx->time);
5018 perf_swevent_start_hrtimer(event);
5019}
5020
5021static void task_clock_event_stop(struct perf_event *event, int flags)
5022{
5023 perf_swevent_cancel_hrtimer(event);
5024 task_clock_event_update(event, event->ctx->time);
5025}
5026
5027static int task_clock_event_add(struct perf_event *event, int flags)
5028{
5029 if (flags & PERF_EF_START)
5030 task_clock_event_start(event, flags);
5031
5032 return 0;
5033}
5034
5035static void task_clock_event_del(struct perf_event *event, int flags)
5036{
5037 task_clock_event_stop(event, PERF_EF_UPDATE);
5038}
5039
5040static void task_clock_event_read(struct perf_event *event)
5041{
5042 u64 time;
5043
5044 if (!in_nmi()) {
5045 update_context_time(event->ctx);
5046 time = event->ctx->time;
5047 } else {
5048 u64 now = perf_clock();
5049 u64 delta = now - event->ctx->timestamp;
5050 time = event->ctx->time + delta;
5051 }
5052
5053 task_clock_event_update(event, time);
5054}
5055
5056static int task_clock_event_init(struct perf_event *event)
5057{
5058 if (event->attr.type != PERF_TYPE_SOFTWARE)
5059 return -ENOENT;
5060
5061 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5062 return -ENOENT;
5063
5064 return 0;
5065}
5066
5067static struct pmu perf_task_clock = {
5068 .task_ctx_nr = perf_sw_context,
5069
5070 .event_init = task_clock_event_init,
5071 .add = task_clock_event_add,
5072 .del = task_clock_event_del,
5073 .start = task_clock_event_start,
5074 .stop = task_clock_event_stop,
5075 .read = task_clock_event_read,
5076};
5077
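cpu-clock and task-clock are still PERF_TYPE_SOFTWARE from the user's point of view, but they are now served by these two dedicated hrtimer-backed pmus rather than by the generic swevent code (perf_swevent_init() returns -ENOENT for them above). A small user-space sketch, reusing the same hypothetical sys_perf_event_open() wrapper, that counts both clocks on the current task:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int open_sw_clock(uint64_t config)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = config;
        return sys_perf_event_open(&attr, 0, -1, -1, 0);  /* current task */
}

int main(void)
{
        int cpu_fd = open_sw_clock(PERF_COUNT_SW_CPU_CLOCK);
        int task_fd = open_sw_clock(PERF_COUNT_SW_TASK_CLOCK);
        uint64_t cpu_ns = 0, task_ns = 0;
        volatile unsigned long i;

        if (cpu_fd < 0 || task_fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        for (i = 0; i < 100000000UL; i++)       /* burn some cpu */
                ;

        read(cpu_fd, &cpu_ns, sizeof(cpu_ns));    /* both report nanoseconds */
        read(task_fd, &task_ns, sizeof(task_ns));
        printf("cpu-clock: %llu ns, task-clock: %llu ns\n",
               (unsigned long long)cpu_ns, (unsigned long long)task_ns);
        return 0;
}
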
5078static void perf_pmu_nop_void(struct pmu *pmu)
5079{
5080}
5081
5082static int perf_pmu_nop_int(struct pmu *pmu)
5083{
5084 return 0;
5085}
5086
5087static void perf_pmu_start_txn(struct pmu *pmu)
5088{
5089 perf_pmu_disable(pmu);
5090}
5091
5092static int perf_pmu_commit_txn(struct pmu *pmu)
5093{
5094 perf_pmu_enable(pmu);
5095 return 0;
5096}
5097
5098static void perf_pmu_cancel_txn(struct pmu *pmu)
5099{
5100 perf_pmu_enable(pmu);
5101}
5102
5103/*
5104 * Ensures all contexts with the same task_ctx_nr have the same
5105 * pmu_cpu_context too.
5106 */
5107static void *find_pmu_context(int ctxn)
5108{
5109 struct pmu *pmu;
5110
5111 if (ctxn < 0)
5112 return NULL;
5113
5114 list_for_each_entry(pmu, &pmus, entry) {
5115 if (pmu->task_ctx_nr == ctxn)
5116 return pmu->pmu_cpu_context;
5117 }
5118
5119 return NULL;
5120}
5121
5122static void free_pmu_context(void * __percpu cpu_context)
5123{
5124 struct pmu *pmu;
5125
5126 mutex_lock(&pmus_lock);
4778 /* 5127 /*
4779 * Software events (currently) can't in general distinguish 5128 * Like a real lame refcount.
4780 * between user, kernel and hypervisor events.
4781 * However, context switches and cpu migrations are considered
4782 * to be kernel events, and page faults are never hypervisor
4783 * events.
4784 */ 5129 */
4785 switch (event_id) { 5130 list_for_each_entry(pmu, &pmus, entry) {
4786 case PERF_COUNT_SW_CPU_CLOCK: 5131 if (pmu->pmu_cpu_context == cpu_context)
4787 pmu = &perf_ops_cpu_clock; 5132 goto out;
5133 }
4788 5134
4789 break; 5135 free_percpu(cpu_context);
4790 case PERF_COUNT_SW_TASK_CLOCK: 5136out:
4791 /* 5137 mutex_unlock(&pmus_lock);
4792 * If the user instantiates this as a per-cpu event, 5138}
4793 * use the cpu_clock event instead.
4794 */
4795 if (event->ctx->task)
4796 pmu = &perf_ops_task_clock;
4797 else
4798 pmu = &perf_ops_cpu_clock;
4799 5139
4800 break; 5140int perf_pmu_register(struct pmu *pmu)
4801 case PERF_COUNT_SW_PAGE_FAULTS: 5141{
4802 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5142 int cpu, ret;
4803 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 5143
4804 case PERF_COUNT_SW_CONTEXT_SWITCHES: 5144 mutex_lock(&pmus_lock);
4805 case PERF_COUNT_SW_CPU_MIGRATIONS: 5145 ret = -ENOMEM;
4806 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 5146 pmu->pmu_disable_count = alloc_percpu(int);
4807 case PERF_COUNT_SW_EMULATION_FAULTS: 5147 if (!pmu->pmu_disable_count)
4808 if (!event->parent) { 5148 goto unlock;
4809 int err;
4810
4811 err = swevent_hlist_get(event);
4812 if (err)
4813 return ERR_PTR(err);
4814 5149
4815 atomic_inc(&perf_swevent_enabled[event_id]); 5150 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
4816 event->destroy = sw_perf_event_destroy; 5151 if (pmu->pmu_cpu_context)
5152 goto got_cpu_context;
5153
5154 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5155 if (!pmu->pmu_cpu_context)
5156 goto free_pdc;
5157
5158 for_each_possible_cpu(cpu) {
5159 struct perf_cpu_context *cpuctx;
5160
5161 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5162 __perf_event_init_context(&cpuctx->ctx);
5163 cpuctx->ctx.type = cpu_context;
5164 cpuctx->ctx.pmu = pmu;
5165 cpuctx->jiffies_interval = 1;
5166 INIT_LIST_HEAD(&cpuctx->rotation_list);
5167 }
5168
5169got_cpu_context:
5170 if (!pmu->start_txn) {
5171 if (pmu->pmu_enable) {
5172 /*
5173 * If we have pmu_enable/pmu_disable calls, install
5174 * transaction stubs that use that to try and batch
5175 * hardware accesses.
5176 */
5177 pmu->start_txn = perf_pmu_start_txn;
5178 pmu->commit_txn = perf_pmu_commit_txn;
5179 pmu->cancel_txn = perf_pmu_cancel_txn;
5180 } else {
5181 pmu->start_txn = perf_pmu_nop_void;
5182 pmu->commit_txn = perf_pmu_nop_int;
5183 pmu->cancel_txn = perf_pmu_nop_void;
5184 }
5185 }
5186
5187 if (!pmu->pmu_enable) {
5188 pmu->pmu_enable = perf_pmu_nop_void;
5189 pmu->pmu_disable = perf_pmu_nop_void;
5190 }
5191
5192 list_add_rcu(&pmu->entry, &pmus);
5193 ret = 0;
5194unlock:
5195 mutex_unlock(&pmus_lock);
5196
5197 return ret;
5198
5199free_pdc:
5200 free_percpu(pmu->pmu_disable_count);
5201 goto unlock;
5202}
5203
5204void perf_pmu_unregister(struct pmu *pmu)
5205{
5206 mutex_lock(&pmus_lock);
5207 list_del_rcu(&pmu->entry);
5208 mutex_unlock(&pmus_lock);
5209
5210 /*
5211 * We dereference the pmu list under both SRCU and regular RCU, so
5212 * synchronize against both of those.
5213 */
5214 synchronize_srcu(&pmus_srcu);
5215 synchronize_rcu();
5216
5217 free_percpu(pmu->pmu_disable_count);
5218 free_pmu_context(pmu->pmu_cpu_context);
5219}
5220
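Taken together, perf_pmu_register()/perf_pmu_unregister() replace the old hard-coded type switch in perf_event_alloc(): a pmu now claims an event itself by returning 0 from event_init() and declines with -ENOENT so the next pmu on the list gets a chance. A hedged in-kernel sketch of the minimum shape such a pmu takes with this interface (all "demo" names are invented, it deliberately claims nothing, and it assumes built-in kernel context with <linux/perf_event.h>):

static void demo_start(struct perf_event *event, int flags) { }
static void demo_stop(struct perf_event *event, int flags) { }

static int demo_event_init(struct perf_event *event)
{
        /* A real pmu matches its own attr.type/attr.config here. */
        return -ENOENT;
}

static int demo_add(struct perf_event *event, int flags)
{
        if (flags & PERF_EF_START)
                demo_start(event, flags);
        return 0;
}

static void demo_del(struct perf_event *event, int flags)
{
        demo_stop(event, PERF_EF_UPDATE);
}

static void demo_read(struct perf_event *event)
{
        /* Fold the current hardware value into event->count here. */
}

static struct pmu demo_pmu = {
        .task_ctx_nr    = perf_sw_context,
        .event_init     = demo_event_init,
        .add            = demo_add,
        .del            = demo_del,
        .start          = demo_start,
        .stop           = demo_stop,
        .read           = demo_read,
};

static int __init demo_pmu_init(void)
{
        /* Missing txn and pmu_enable/pmu_disable hooks are stubbed for us. */
        return perf_pmu_register(&demo_pmu);
}

On teardown the matching call would be perf_pmu_unregister(&demo_pmu), which waits out both SRCU and RCU readers before freeing the per-cpu context, as shown above.
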
5221struct pmu *perf_init_event(struct perf_event *event)
5222{
5223 struct pmu *pmu = NULL;
5224 int idx;
5225
5226 idx = srcu_read_lock(&pmus_srcu);
5227 list_for_each_entry_rcu(pmu, &pmus, entry) {
5228 int ret = pmu->event_init(event);
5229 if (!ret)
5230 goto unlock;
5231
5232 if (ret != -ENOENT) {
5233 pmu = ERR_PTR(ret);
5234 goto unlock;
4817 } 5235 }
4818 pmu = &perf_ops_generic;
4819 break;
4820 } 5236 }
5237 pmu = ERR_PTR(-ENOENT);
5238unlock:
5239 srcu_read_unlock(&pmus_srcu, idx);
4821 5240
4822 return pmu; 5241 return pmu;
4823} 5242}
@@ -4826,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4826 * Allocate and initialize a event structure 5245 * Allocate and initialize a event structure
4827 */ 5246 */
4828static struct perf_event * 5247static struct perf_event *
4829perf_event_alloc(struct perf_event_attr *attr, 5248perf_event_alloc(struct perf_event_attr *attr, int cpu,
4830 int cpu, 5249 struct task_struct *task,
4831 struct perf_event_context *ctx, 5250 struct perf_event *group_leader,
4832 struct perf_event *group_leader, 5251 struct perf_event *parent_event,
4833 struct perf_event *parent_event, 5252 perf_overflow_handler_t overflow_handler)
4834 perf_overflow_handler_t overflow_handler, 5253{
4835 gfp_t gfpflags) 5254 struct pmu *pmu;
4836{
4837 const struct pmu *pmu;
4838 struct perf_event *event; 5255 struct perf_event *event;
4839 struct hw_perf_event *hwc; 5256 struct hw_perf_event *hwc;
4840 long err; 5257 long err;
4841 5258
4842 event = kzalloc(sizeof(*event), gfpflags); 5259 event = kzalloc(sizeof(*event), GFP_KERNEL);
4843 if (!event) 5260 if (!event)
4844 return ERR_PTR(-ENOMEM); 5261 return ERR_PTR(-ENOMEM);
4845 5262
@@ -4857,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4857 INIT_LIST_HEAD(&event->event_entry); 5274 INIT_LIST_HEAD(&event->event_entry);
4858 INIT_LIST_HEAD(&event->sibling_list); 5275 INIT_LIST_HEAD(&event->sibling_list);
4859 init_waitqueue_head(&event->waitq); 5276 init_waitqueue_head(&event->waitq);
5277 init_irq_work(&event->pending, perf_pending_event);
4860 5278
4861 mutex_init(&event->mmap_mutex); 5279 mutex_init(&event->mmap_mutex);
4862 5280
@@ -4864,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 event->attr = *attr; 5282 event->attr = *attr;
4865 event->group_leader = group_leader; 5283 event->group_leader = group_leader;
4866 event->pmu = NULL; 5284 event->pmu = NULL;
4867 event->ctx = ctx;
4868 event->oncpu = -1; 5285 event->oncpu = -1;
4869 5286
4870 event->parent = parent_event; 5287 event->parent = parent_event;
@@ -4874,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr,
4874 5291
4875 event->state = PERF_EVENT_STATE_INACTIVE; 5292 event->state = PERF_EVENT_STATE_INACTIVE;
4876 5293
5294 if (task) {
5295 event->attach_state = PERF_ATTACH_TASK;
5296#ifdef CONFIG_HAVE_HW_BREAKPOINT
5297 /*
5298 * hw_breakpoint is a bit difficult here..
5299 */
5300 if (attr->type == PERF_TYPE_BREAKPOINT)
5301 event->hw.bp_target = task;
5302#endif
5303 }
5304
4877 if (!overflow_handler && parent_event) 5305 if (!overflow_handler && parent_event)
4878 overflow_handler = parent_event->overflow_handler; 5306 overflow_handler = parent_event->overflow_handler;
4879 5307
@@ -4898,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4898 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5326 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4899 goto done; 5327 goto done;
4900 5328
4901 switch (attr->type) { 5329 pmu = perf_init_event(event);
4902 case PERF_TYPE_RAW:
4903 case PERF_TYPE_HARDWARE:
4904 case PERF_TYPE_HW_CACHE:
4905 pmu = hw_perf_event_init(event);
4906 break;
4907
4908 case PERF_TYPE_SOFTWARE:
4909 pmu = sw_perf_event_init(event);
4910 break;
4911
4912 case PERF_TYPE_TRACEPOINT:
4913 pmu = tp_perf_event_init(event);
4914 break;
4915 5330
4916 case PERF_TYPE_BREAKPOINT:
4917 pmu = bp_perf_event_init(event);
4918 break;
4919
4920
4921 default:
4922 break;
4923 }
4924done: 5331done:
4925 err = 0; 5332 err = 0;
4926 if (!pmu) 5333 if (!pmu)
@@ -4938,13 +5345,21 @@ done:
4938 event->pmu = pmu; 5345 event->pmu = pmu;
4939 5346
4940 if (!event->parent) { 5347 if (!event->parent) {
4941 atomic_inc(&nr_events); 5348 if (event->attach_state & PERF_ATTACH_TASK)
5349 jump_label_inc(&perf_task_events);
4942 if (event->attr.mmap || event->attr.mmap_data) 5350 if (event->attr.mmap || event->attr.mmap_data)
4943 atomic_inc(&nr_mmap_events); 5351 atomic_inc(&nr_mmap_events);
4944 if (event->attr.comm) 5352 if (event->attr.comm)
4945 atomic_inc(&nr_comm_events); 5353 atomic_inc(&nr_comm_events);
4946 if (event->attr.task) 5354 if (event->attr.task)
4947 atomic_inc(&nr_task_events); 5355 atomic_inc(&nr_task_events);
5356 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5357 err = get_callchain_buffers();
5358 if (err) {
5359 free_event(event);
5360 return ERR_PTR(err);
5361 }
5362 }
4948 } 5363 }
4949 5364
4950 return event; 5365 return event;
@@ -5092,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open,
5092 struct perf_event_attr __user *, attr_uptr, 5507 struct perf_event_attr __user *, attr_uptr,
5093 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5508 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5094{ 5509{
5095 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5510 struct perf_event *group_leader = NULL, *output_event = NULL;
5511 struct perf_event *event, *sibling;
5096 struct perf_event_attr attr; 5512 struct perf_event_attr attr;
5097 struct perf_event_context *ctx; 5513 struct perf_event_context *ctx;
5098 struct file *event_file = NULL; 5514 struct file *event_file = NULL;
5099 struct file *group_file = NULL; 5515 struct file *group_file = NULL;
5516 struct task_struct *task = NULL;
5517 struct pmu *pmu;
5100 int event_fd; 5518 int event_fd;
5519 int move_group = 0;
5101 int fput_needed = 0; 5520 int fput_needed = 0;
5102 int err; 5521 int err;
5103 5522
@@ -5123,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open,
5123 if (event_fd < 0) 5542 if (event_fd < 0)
5124 return event_fd; 5543 return event_fd;
5125 5544
5126 /*
5127 * Get the target context (task or percpu):
5128 */
5129 ctx = find_get_context(pid, cpu);
5130 if (IS_ERR(ctx)) {
5131 err = PTR_ERR(ctx);
5132 goto err_fd;
5133 }
5134
5135 if (group_fd != -1) { 5545 if (group_fd != -1) {
5136 group_leader = perf_fget_light(group_fd, &fput_needed); 5546 group_leader = perf_fget_light(group_fd, &fput_needed);
5137 if (IS_ERR(group_leader)) { 5547 if (IS_ERR(group_leader)) {
5138 err = PTR_ERR(group_leader); 5548 err = PTR_ERR(group_leader);
5139 goto err_put_context; 5549 goto err_fd;
5140 } 5550 }
5141 group_file = group_leader->filp; 5551 group_file = group_leader->filp;
5142 if (flags & PERF_FLAG_FD_OUTPUT) 5552 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5145,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open,
5145 group_leader = NULL; 5555 group_leader = NULL;
5146 } 5556 }
5147 5557
5558 if (pid != -1) {
5559 task = find_lively_task_by_vpid(pid);
5560 if (IS_ERR(task)) {
5561 err = PTR_ERR(task);
5562 goto err_group_fd;
5563 }
5564 }
5565
5566 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
5567 if (IS_ERR(event)) {
5568 err = PTR_ERR(event);
5569 goto err_task;
5570 }
5571
5572 /*
5573 * Special case software events and allow them to be part of
5574 * any hardware group.
5575 */
5576 pmu = event->pmu;
5577
5578 if (group_leader &&
5579 (is_software_event(event) != is_software_event(group_leader))) {
5580 if (is_software_event(event)) {
5581 /*
5582 * If event and group_leader are not both a software
5583 * event, and event is, then group leader is not.
5584 *
5585 * Allow the addition of software events to !software
5586 * groups, this is safe because software events never
5587 * fail to schedule.
5588 */
5589 pmu = group_leader->pmu;
5590 } else if (is_software_event(group_leader) &&
5591 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5592 /*
5593 * In case the group is a pure software group, and we
5594 * try to add a hardware event, move the whole group to
5595 * the hardware context.
5596 */
5597 move_group = 1;
5598 }
5599 }
5600
5601 /*
5602 * Get the target context (task or percpu):
5603 */
5604 ctx = find_get_context(pmu, task, cpu);
5605 if (IS_ERR(ctx)) {
5606 err = PTR_ERR(ctx);
5607 goto err_alloc;
5608 }
5609
5148 /* 5610 /*
5149 * Look up the group leader (we will attach this event to it): 5611 * Look up the group leader (we will attach this event to it):
5150 */ 5612 */
@@ -5156,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open,
5156 * becoming part of another group-sibling): 5618 * becoming part of another group-sibling):
5157 */ 5619 */
5158 if (group_leader->group_leader != group_leader) 5620 if (group_leader->group_leader != group_leader)
5159 goto err_put_context; 5621 goto err_context;
5160 /* 5622 /*
5161 * Do not allow to attach to a group in a different 5623 * Do not allow to attach to a group in a different
5162 * task or CPU context: 5624 * task or CPU context:
5163 */ 5625 */
5164 if (group_leader->ctx != ctx) 5626 if (move_group) {
5165 goto err_put_context; 5627 if (group_leader->ctx->type != ctx->type)
5628 goto err_context;
5629 } else {
5630 if (group_leader->ctx != ctx)
5631 goto err_context;
5632 }
5633
5166 /* 5634 /*
5167 * Only a group leader can be exclusive or pinned 5635 * Only a group leader can be exclusive or pinned
5168 */ 5636 */
5169 if (attr.exclusive || attr.pinned) 5637 if (attr.exclusive || attr.pinned)
5170 goto err_put_context; 5638 goto err_context;
5171 }
5172
5173 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5174 NULL, NULL, GFP_KERNEL);
5175 if (IS_ERR(event)) {
5176 err = PTR_ERR(event);
5177 goto err_put_context;
5178 } 5639 }
5179 5640
5180 if (output_event) { 5641 if (output_event) {
5181 err = perf_event_set_output(event, output_event); 5642 err = perf_event_set_output(event, output_event);
5182 if (err) 5643 if (err)
5183 goto err_free_put_context; 5644 goto err_context;
5184 } 5645 }
5185 5646
5186 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5647 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5187 if (IS_ERR(event_file)) { 5648 if (IS_ERR(event_file)) {
5188 err = PTR_ERR(event_file); 5649 err = PTR_ERR(event_file);
5189 goto err_free_put_context; 5650 goto err_context;
5651 }
5652
5653 if (move_group) {
5654 struct perf_event_context *gctx = group_leader->ctx;
5655
5656 mutex_lock(&gctx->mutex);
5657 perf_event_remove_from_context(group_leader);
5658 list_for_each_entry(sibling, &group_leader->sibling_list,
5659 group_entry) {
5660 perf_event_remove_from_context(sibling);
5661 put_ctx(gctx);
5662 }
5663 mutex_unlock(&gctx->mutex);
5664 put_ctx(gctx);
5190 } 5665 }
5191 5666
5192 event->filp = event_file; 5667 event->filp = event_file;
5193 WARN_ON_ONCE(ctx->parent_ctx); 5668 WARN_ON_ONCE(ctx->parent_ctx);
5194 mutex_lock(&ctx->mutex); 5669 mutex_lock(&ctx->mutex);
5670
5671 if (move_group) {
5672 perf_install_in_context(ctx, group_leader, cpu);
5673 get_ctx(ctx);
5674 list_for_each_entry(sibling, &group_leader->sibling_list,
5675 group_entry) {
5676 perf_install_in_context(ctx, sibling, cpu);
5677 get_ctx(ctx);
5678 }
5679 }
5680
5195 perf_install_in_context(ctx, event, cpu); 5681 perf_install_in_context(ctx, event, cpu);
5196 ++ctx->generation; 5682 ++ctx->generation;
5197 mutex_unlock(&ctx->mutex); 5683 mutex_unlock(&ctx->mutex);
@@ -5212,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open,
5212 fd_install(event_fd, event_file); 5698 fd_install(event_fd, event_file);
5213 return event_fd; 5699 return event_fd;
5214 5700
5215err_free_put_context: 5701err_context:
5702 put_ctx(ctx);
5703err_alloc:
5216 free_event(event); 5704 free_event(event);
5217err_put_context: 5705err_task:
5706 if (task)
5707 put_task_struct(task);
5708err_group_fd:
5218 fput_light(group_file, fput_needed); 5709 fput_light(group_file, fput_needed);
5219 put_ctx(ctx);
5220err_fd: 5710err_fd:
5221 put_unused_fd(event_fd); 5711 put_unused_fd(event_fd);
5222 return err; 5712 return err;
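The special casing above is what lets a software event be opened into a hardware group (the event simply borrows the leader's pmu), and what moves a pure-software group over to the hardware context when a hardware event joins it later. From user space this is just the group_fd argument; a sketch of the first case (not from the commit, same hypothetical sys_perf_event_open() wrapper as before):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr hw, sw;
        uint64_t cycles = 0, faults = 0;
        int leader, member;

        memset(&hw, 0, sizeof(hw));
        hw.size = sizeof(hw);
        hw.type = PERF_TYPE_HARDWARE;            /* hardware group leader */
        hw.config = PERF_COUNT_HW_CPU_CYCLES;

        memset(&sw, 0, sizeof(sw));
        sw.size = sizeof(sw);
        sw.type = PERF_TYPE_SOFTWARE;            /* software member in a hw group */
        sw.config = PERF_COUNT_SW_PAGE_FAULTS;

        leader = sys_perf_event_open(&hw, 0, -1, -1, 0);
        member = sys_perf_event_open(&sw, 0, -1, leader, 0);
        if (leader < 0 || member < 0) {
                perror("perf_event_open");
                return 1;
        }

        /* ... workload ... */

        read(leader, &cycles, sizeof(cycles));
        read(member, &faults, sizeof(faults));
        printf("cycles=%llu faults=%llu\n",
               (unsigned long long)cycles, (unsigned long long)faults);
        return 0;
}
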
@@ -5227,32 +5717,31 @@ err_fd:
5227 * 5717 *
5228 * @attr: attributes of the counter to create 5718 * @attr: attributes of the counter to create
5229 * @cpu: cpu in which the counter is bound 5719 * @cpu: cpu in which the counter is bound
5230 * @pid: task to profile 5720 * @task: task to profile (NULL for percpu)
5231 */ 5721 */
5232struct perf_event * 5722struct perf_event *
5233perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5723perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5234 pid_t pid, 5724 struct task_struct *task,
5235 perf_overflow_handler_t overflow_handler) 5725 perf_overflow_handler_t overflow_handler)
5236{ 5726{
5237 struct perf_event *event;
5238 struct perf_event_context *ctx; 5727 struct perf_event_context *ctx;
5728 struct perf_event *event;
5239 int err; 5729 int err;
5240 5730
5241 /* 5731 /*
5242 * Get the target context (task or percpu): 5732 * Get the target context (task or percpu):
5243 */ 5733 */
5244 5734
5245 ctx = find_get_context(pid, cpu); 5735 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5246 if (IS_ERR(ctx)) {
5247 err = PTR_ERR(ctx);
5248 goto err_exit;
5249 }
5250
5251 event = perf_event_alloc(attr, cpu, ctx, NULL,
5252 NULL, overflow_handler, GFP_KERNEL);
5253 if (IS_ERR(event)) { 5736 if (IS_ERR(event)) {
5254 err = PTR_ERR(event); 5737 err = PTR_ERR(event);
5255 goto err_put_context; 5738 goto err;
5739 }
5740
5741 ctx = find_get_context(event->pmu, task, cpu);
5742 if (IS_ERR(ctx)) {
5743 err = PTR_ERR(ctx);
5744 goto err_free;
5256 } 5745 }
5257 5746
5258 event->filp = NULL; 5747 event->filp = NULL;
@@ -5270,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5270 5759
5271 return event; 5760 return event;
5272 5761
5273 err_put_context: 5762err_free:
5274 put_ctx(ctx); 5763 free_event(event);
5275 err_exit: 5764err:
5276 return ERR_PTR(err); 5765 return ERR_PTR(err);
5277} 5766}
5278EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5767EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5279 5768
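In-kernel users such as the hw_breakpoint and watchdog code elsewhere in this series now pass a struct task_struct pointer (or NULL for a per-cpu counter) instead of a pid. A hedged sketch of the new calling convention (names invented; assumes <linux/perf_event.h> and <linux/err.h>, and that the counter is later torn down with perf_event_release_kernel()):

static struct perf_event *demo_counter;

static int __init demo_counter_init(void)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_CPU_CLOCK,
                .size   = sizeof(attr),
        };

        /* cpu 0, NULL task => per-cpu counter; NULL => default overflow handling */
        demo_counter = perf_event_create_kernel_counter(&attr, 0, NULL, NULL);
        if (IS_ERR(demo_counter))
                return PTR_ERR(demo_counter);

        return 0;
}
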
5280/*
5281 * inherit a event from parent task to child task:
5282 */
5283static struct perf_event *
5284inherit_event(struct perf_event *parent_event,
5285 struct task_struct *parent,
5286 struct perf_event_context *parent_ctx,
5287 struct task_struct *child,
5288 struct perf_event *group_leader,
5289 struct perf_event_context *child_ctx)
5290{
5291 struct perf_event *child_event;
5292
5293 /*
5294 * Instead of creating recursive hierarchies of events,
5295 * we link inherited events back to the original parent,
5296 * which has a filp for sure, which we use as the reference
5297 * count:
5298 */
5299 if (parent_event->parent)
5300 parent_event = parent_event->parent;
5301
5302 child_event = perf_event_alloc(&parent_event->attr,
5303 parent_event->cpu, child_ctx,
5304 group_leader, parent_event,
5305 NULL, GFP_KERNEL);
5306 if (IS_ERR(child_event))
5307 return child_event;
5308 get_ctx(child_ctx);
5309
5310 /*
5311 * Make the child state follow the state of the parent event,
5312 * not its attr.disabled bit. We hold the parent's mutex,
5313 * so we won't race with perf_event_{en, dis}able_family.
5314 */
5315 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5316 child_event->state = PERF_EVENT_STATE_INACTIVE;
5317 else
5318 child_event->state = PERF_EVENT_STATE_OFF;
5319
5320 if (parent_event->attr.freq) {
5321 u64 sample_period = parent_event->hw.sample_period;
5322 struct hw_perf_event *hwc = &child_event->hw;
5323
5324 hwc->sample_period = sample_period;
5325 hwc->last_period = sample_period;
5326
5327 local64_set(&hwc->period_left, sample_period);
5328 }
5329
5330 child_event->overflow_handler = parent_event->overflow_handler;
5331
5332 /*
5333 * Link it up in the child's context:
5334 */
5335 add_event_to_ctx(child_event, child_ctx);
5336
5337 /*
5338 * Get a reference to the parent filp - we will fput it
5339 * when the child event exits. This is safe to do because
5340 * we are in the parent and we know that the filp still
5341 * exists and has a nonzero count:
5342 */
5343 atomic_long_inc(&parent_event->filp->f_count);
5344
5345 /*
5346 * Link this into the parent event's child list
5347 */
5348 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5349 mutex_lock(&parent_event->child_mutex);
5350 list_add_tail(&child_event->child_list, &parent_event->child_list);
5351 mutex_unlock(&parent_event->child_mutex);
5352
5353 return child_event;
5354}
5355
5356static int inherit_group(struct perf_event *parent_event,
5357 struct task_struct *parent,
5358 struct perf_event_context *parent_ctx,
5359 struct task_struct *child,
5360 struct perf_event_context *child_ctx)
5361{
5362 struct perf_event *leader;
5363 struct perf_event *sub;
5364 struct perf_event *child_ctr;
5365
5366 leader = inherit_event(parent_event, parent, parent_ctx,
5367 child, NULL, child_ctx);
5368 if (IS_ERR(leader))
5369 return PTR_ERR(leader);
5370 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5371 child_ctr = inherit_event(sub, parent, parent_ctx,
5372 child, leader, child_ctx);
5373 if (IS_ERR(child_ctr))
5374 return PTR_ERR(child_ctr);
5375 }
5376 return 0;
5377}
5378
5379static void sync_child_event(struct perf_event *child_event, 5769static void sync_child_event(struct perf_event *child_event,
5380 struct task_struct *child) 5770 struct task_struct *child)
5381{ 5771{
@@ -5432,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5432 } 5822 }
5433} 5823}
5434 5824
5435/* 5825static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5436 * When a child task exits, feed back event values to parent events.
5437 */
5438void perf_event_exit_task(struct task_struct *child)
5439{ 5826{
5440 struct perf_event *child_event, *tmp; 5827 struct perf_event *child_event, *tmp;
5441 struct perf_event_context *child_ctx; 5828 struct perf_event_context *child_ctx;
5442 unsigned long flags; 5829 unsigned long flags;
5443 5830
5444 if (likely(!child->perf_event_ctxp)) { 5831 if (likely(!child->perf_event_ctxp[ctxn])) {
5445 perf_event_task(child, NULL, 0); 5832 perf_event_task(child, NULL, 0);
5446 return; 5833 return;
5447 } 5834 }
@@ -5453,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child)
5453 * scheduled, so we are now safe from rescheduling changing 5840 * scheduled, so we are now safe from rescheduling changing
5454 * our context. 5841 * our context.
5455 */ 5842 */
5456 child_ctx = child->perf_event_ctxp; 5843 child_ctx = child->perf_event_ctxp[ctxn];
5457 __perf_event_task_sched_out(child_ctx); 5844 task_ctx_sched_out(child_ctx, EVENT_ALL);
5458 5845
5459 /* 5846 /*
5460 * Take the context lock here so that if find_get_context is 5847 * Take the context lock here so that if find_get_context is
@@ -5462,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child)
5462 * incremented the context's refcount before we do put_ctx below. 5849 * incremented the context's refcount before we do put_ctx below.
5463 */ 5850 */
5464 raw_spin_lock(&child_ctx->lock); 5851 raw_spin_lock(&child_ctx->lock);
5465 child->perf_event_ctxp = NULL; 5852 child->perf_event_ctxp[ctxn] = NULL;
5466 /* 5853 /*
5467 * If this context is a clone; unclone it so it can't get 5854 * If this context is a clone; unclone it so it can't get
5468 * swapped to another process while we're removing all 5855 * swapped to another process while we're removing all
@@ -5515,6 +5902,17 @@ again:
5515 put_ctx(child_ctx); 5902 put_ctx(child_ctx);
5516} 5903}
5517 5904
5905/*
5906 * When a child task exits, feed back event values to parent events.
5907 */
5908void perf_event_exit_task(struct task_struct *child)
5909{
5910 int ctxn;
5911
5912 for_each_task_context_nr(ctxn)
5913 perf_event_exit_task_context(child, ctxn);
5914}
5915
5518static void perf_free_event(struct perf_event *event, 5916static void perf_free_event(struct perf_event *event,
5519 struct perf_event_context *ctx) 5917 struct perf_event_context *ctx)
5520{ 5918{
@@ -5536,48 +5934,166 @@ static void perf_free_event(struct perf_event *event,
5536 5934
5537/* 5935/*
5538 * free an unexposed, unused context as created by inheritance by 5936 * free an unexposed, unused context as created by inheritance by
5539 * init_task below, used by fork() in case of fail. 5937 * perf_event_init_task below, used by fork() in case of fail.
5540 */ 5938 */
5541void perf_event_free_task(struct task_struct *task) 5939void perf_event_free_task(struct task_struct *task)
5542{ 5940{
5543 struct perf_event_context *ctx = task->perf_event_ctxp; 5941 struct perf_event_context *ctx;
5544 struct perf_event *event, *tmp; 5942 struct perf_event *event, *tmp;
5943 int ctxn;
5545 5944
5546 if (!ctx) 5945 for_each_task_context_nr(ctxn) {
5547 return; 5946 ctx = task->perf_event_ctxp[ctxn];
5947 if (!ctx)
5948 continue;
5548 5949
5549 mutex_lock(&ctx->mutex); 5950 mutex_lock(&ctx->mutex);
5550again: 5951again:
5551 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5952 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5552 perf_free_event(event, ctx); 5953 group_entry)
5954 perf_free_event(event, ctx);
5553 5955
5554 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5956 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5555 group_entry) 5957 group_entry)
5556 perf_free_event(event, ctx); 5958 perf_free_event(event, ctx);
5557 5959
5558 if (!list_empty(&ctx->pinned_groups) || 5960 if (!list_empty(&ctx->pinned_groups) ||
5559 !list_empty(&ctx->flexible_groups)) 5961 !list_empty(&ctx->flexible_groups))
5560 goto again; 5962 goto again;
5561 5963
5562 mutex_unlock(&ctx->mutex); 5964 mutex_unlock(&ctx->mutex);
5563 5965
5564 put_ctx(ctx); 5966 put_ctx(ctx);
5967 }
5968}
5969
5970void perf_event_delayed_put(struct task_struct *task)
5971{
5972 int ctxn;
5973
5974 for_each_task_context_nr(ctxn)
5975 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
5976}
5977
5978/*
5979 * inherit a event from parent task to child task:
5980 */
5981static struct perf_event *
5982inherit_event(struct perf_event *parent_event,
5983 struct task_struct *parent,
5984 struct perf_event_context *parent_ctx,
5985 struct task_struct *child,
5986 struct perf_event *group_leader,
5987 struct perf_event_context *child_ctx)
5988{
5989 struct perf_event *child_event;
5990 unsigned long flags;
5991
5992 /*
5993 * Instead of creating recursive hierarchies of events,
5994 * we link inherited events back to the original parent,
5995 * which has a filp for sure, which we use as the reference
5996 * count:
5997 */
5998 if (parent_event->parent)
5999 parent_event = parent_event->parent;
6000
6001 child_event = perf_event_alloc(&parent_event->attr,
6002 parent_event->cpu,
6003 child,
6004 group_leader, parent_event,
6005 NULL);
6006 if (IS_ERR(child_event))
6007 return child_event;
6008 get_ctx(child_ctx);
6009
6010 /*
6011 * Make the child state follow the state of the parent event,
6012 * not its attr.disabled bit. We hold the parent's mutex,
6013 * so we won't race with perf_event_{en, dis}able_family.
6014 */
6015 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6016 child_event->state = PERF_EVENT_STATE_INACTIVE;
6017 else
6018 child_event->state = PERF_EVENT_STATE_OFF;
6019
6020 if (parent_event->attr.freq) {
6021 u64 sample_period = parent_event->hw.sample_period;
6022 struct hw_perf_event *hwc = &child_event->hw;
6023
6024 hwc->sample_period = sample_period;
6025 hwc->last_period = sample_period;
6026
6027 local64_set(&hwc->period_left, sample_period);
6028 }
6029
6030 child_event->ctx = child_ctx;
6031 child_event->overflow_handler = parent_event->overflow_handler;
6032
6033 /*
6034 * Link it up in the child's context:
6035 */
6036 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6037 add_event_to_ctx(child_event, child_ctx);
6038 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6039
6040 /*
6041 * Get a reference to the parent filp - we will fput it
6042 * when the child event exits. This is safe to do because
6043 * we are in the parent and we know that the filp still
6044 * exists and has a nonzero count:
6045 */
6046 atomic_long_inc(&parent_event->filp->f_count);
6047
6048 /*
6049 * Link this into the parent event's child list
6050 */
6051 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6052 mutex_lock(&parent_event->child_mutex);
6053 list_add_tail(&child_event->child_list, &parent_event->child_list);
6054 mutex_unlock(&parent_event->child_mutex);
6055
6056 return child_event;
6057}
6058
6059static int inherit_group(struct perf_event *parent_event,
6060 struct task_struct *parent,
6061 struct perf_event_context *parent_ctx,
6062 struct task_struct *child,
6063 struct perf_event_context *child_ctx)
6064{
6065 struct perf_event *leader;
6066 struct perf_event *sub;
6067 struct perf_event *child_ctr;
6068
6069 leader = inherit_event(parent_event, parent, parent_ctx,
6070 child, NULL, child_ctx);
6071 if (IS_ERR(leader))
6072 return PTR_ERR(leader);
6073 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6074 child_ctr = inherit_event(sub, parent, parent_ctx,
6075 child, leader, child_ctx);
6076 if (IS_ERR(child_ctr))
6077 return PTR_ERR(child_ctr);
6078 }
6079 return 0;
5565} 6080}
5566 6081
5567static int 6082static int
5568inherit_task_group(struct perf_event *event, struct task_struct *parent, 6083inherit_task_group(struct perf_event *event, struct task_struct *parent,
5569 struct perf_event_context *parent_ctx, 6084 struct perf_event_context *parent_ctx,
5570 struct task_struct *child, 6085 struct task_struct *child, int ctxn,
5571 int *inherited_all) 6086 int *inherited_all)
5572{ 6087{
5573 int ret; 6088 int ret;
5574 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6089 struct perf_event_context *child_ctx;
5575 6090
5576 if (!event->attr.inherit) { 6091 if (!event->attr.inherit) {
5577 *inherited_all = 0; 6092 *inherited_all = 0;
5578 return 0; 6093 return 0;
5579 } 6094 }
5580 6095
6096 child_ctx = child->perf_event_ctxp[ctxn];
5581 if (!child_ctx) { 6097 if (!child_ctx) {
5582 /* 6098 /*
5583 * This is executed from the parent task context, so 6099 * This is executed from the parent task context, so
@@ -5586,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5586 * child. 6102 * child.
5587 */ 6103 */
5588 6104
5589 child_ctx = kzalloc(sizeof(struct perf_event_context), 6105 child_ctx = alloc_perf_context(event->pmu, child);
5590 GFP_KERNEL);
5591 if (!child_ctx) 6106 if (!child_ctx)
5592 return -ENOMEM; 6107 return -ENOMEM;
5593 6108
5594 __perf_event_init_context(child_ctx, child); 6109 child->perf_event_ctxp[ctxn] = child_ctx;
5595 child->perf_event_ctxp = child_ctx;
5596 get_task_struct(child);
5597 } 6110 }
5598 6111
5599 ret = inherit_group(event, parent, parent_ctx, 6112 ret = inherit_group(event, parent, parent_ctx,
@@ -5605,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5605 return ret; 6118 return ret;
5606} 6119}
5607 6120
5608
5609/* 6121/*
5610 * Initialize the perf_event context in task_struct 6122 * Initialize the perf_event context in task_struct
5611 */ 6123 */
5612int perf_event_init_task(struct task_struct *child) 6124int perf_event_init_context(struct task_struct *child, int ctxn)
5613{ 6125{
5614 struct perf_event_context *child_ctx, *parent_ctx; 6126 struct perf_event_context *child_ctx, *parent_ctx;
5615 struct perf_event_context *cloned_ctx; 6127 struct perf_event_context *cloned_ctx;
@@ -5618,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child)
5618 int inherited_all = 1; 6130 int inherited_all = 1;
5619 int ret = 0; 6131 int ret = 0;
5620 6132
5621 child->perf_event_ctxp = NULL; 6133 child->perf_event_ctxp[ctxn] = NULL;
5622 6134
5623 mutex_init(&child->perf_event_mutex); 6135 mutex_init(&child->perf_event_mutex);
5624 INIT_LIST_HEAD(&child->perf_event_list); 6136 INIT_LIST_HEAD(&child->perf_event_list);
5625 6137
5626 if (likely(!parent->perf_event_ctxp)) 6138 if (likely(!parent->perf_event_ctxp[ctxn]))
5627 return 0; 6139 return 0;
5628 6140
5629 /* 6141 /*
5630 * If the parent's context is a clone, pin it so it won't get 6142 * If the parent's context is a clone, pin it so it won't get
5631 * swapped under us. 6143 * swapped under us.
5632 */ 6144 */
5633 parent_ctx = perf_pin_task_context(parent); 6145 parent_ctx = perf_pin_task_context(parent, ctxn);
5634 6146
5635 /* 6147 /*
5636 * No need to check if parent_ctx != NULL here; since we saw 6148 * No need to check if parent_ctx != NULL here; since we saw
@@ -5650,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child)
5650 * the list, not manipulating it: 6162 * the list, not manipulating it:
5651 */ 6163 */
5652 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6164 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5653 ret = inherit_task_group(event, parent, parent_ctx, child, 6165 ret = inherit_task_group(event, parent, parent_ctx,
5654 &inherited_all); 6166 child, ctxn, &inherited_all);
5655 if (ret) 6167 if (ret)
5656 break; 6168 break;
5657 } 6169 }
5658 6170
5659 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6171 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5660 ret = inherit_task_group(event, parent, parent_ctx, child, 6172 ret = inherit_task_group(event, parent, parent_ctx,
5661 &inherited_all); 6173 child, ctxn, &inherited_all);
5662 if (ret) 6174 if (ret)
5663 break; 6175 break;
5664 } 6176 }
5665 6177
5666 child_ctx = child->perf_event_ctxp; 6178 child_ctx = child->perf_event_ctxp[ctxn];
5667 6179
5668 if (child_ctx && inherited_all) { 6180 if (child_ctx && inherited_all) {
5669 /* 6181 /*
@@ -5692,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child)
5692 return ret; 6204 return ret;
5693} 6205}
5694 6206
6207/*
6208 * Initialize the perf_event context in task_struct
6209 */
6210int perf_event_init_task(struct task_struct *child)
6211{
6212 int ctxn, ret;
6213
6214 for_each_task_context_nr(ctxn) {
6215 ret = perf_event_init_context(child, ctxn);
6216 if (ret)
6217 return ret;
6218 }
6219
6220 return 0;
6221}
6222
5695static void __init perf_event_init_all_cpus(void) 6223static void __init perf_event_init_all_cpus(void)
5696{ 6224{
6225 struct swevent_htable *swhash;
5697 int cpu; 6226 int cpu;
5698 struct perf_cpu_context *cpuctx;
5699 6227
5700 for_each_possible_cpu(cpu) { 6228 for_each_possible_cpu(cpu) {
5701 cpuctx = &per_cpu(perf_cpu_context, cpu); 6229 swhash = &per_cpu(swevent_htable, cpu);
5702 mutex_init(&cpuctx->hlist_mutex); 6230 mutex_init(&swhash->hlist_mutex);
5703 __perf_event_init_context(&cpuctx->ctx, NULL); 6231 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5704 } 6232 }
5705} 6233}
5706 6234
5707static void __cpuinit perf_event_init_cpu(int cpu) 6235static void __cpuinit perf_event_init_cpu(int cpu)
5708{ 6236{
5709 struct perf_cpu_context *cpuctx; 6237 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5710
5711 cpuctx = &per_cpu(perf_cpu_context, cpu);
5712 6238
5713 spin_lock(&perf_resource_lock); 6239 mutex_lock(&swhash->hlist_mutex);
5714 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 6240 if (swhash->hlist_refcount > 0) {
5715 spin_unlock(&perf_resource_lock);
5716
5717 mutex_lock(&cpuctx->hlist_mutex);
5718 if (cpuctx->hlist_refcount > 0) {
5719 struct swevent_hlist *hlist; 6241 struct swevent_hlist *hlist;
5720 6242
5721 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6243 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5722 WARN_ON_ONCE(!hlist); 6244 WARN_ON(!hlist);
5723 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6245 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5724 } 6246 }
5725 mutex_unlock(&cpuctx->hlist_mutex); 6247 mutex_unlock(&swhash->hlist_mutex);
5726} 6248}
5727 6249
5728#ifdef CONFIG_HOTPLUG_CPU 6250#ifdef CONFIG_HOTPLUG_CPU
5729static void __perf_event_exit_cpu(void *info) 6251static void perf_pmu_rotate_stop(struct pmu *pmu)
5730{ 6252{
5731 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6253 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5732 struct perf_event_context *ctx = &cpuctx->ctx; 6254
6255 WARN_ON(!irqs_disabled());
6256
6257 list_del_init(&cpuctx->rotation_list);
6258}
6259
6260static void __perf_event_exit_context(void *__info)
6261{
6262 struct perf_event_context *ctx = __info;
5733 struct perf_event *event, *tmp; 6263 struct perf_event *event, *tmp;
5734 6264
6265 perf_pmu_rotate_stop(ctx->pmu);
6266
5735 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6267 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5736 __perf_event_remove_from_context(event); 6268 __perf_event_remove_from_context(event);
5737 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6269 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5738 __perf_event_remove_from_context(event); 6270 __perf_event_remove_from_context(event);
5739} 6271}
6272
6273static void perf_event_exit_cpu_context(int cpu)
6274{
6275 struct perf_event_context *ctx;
6276 struct pmu *pmu;
6277 int idx;
6278
6279 idx = srcu_read_lock(&pmus_srcu);
6280 list_for_each_entry_rcu(pmu, &pmus, entry) {
6281 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6282
6283 mutex_lock(&ctx->mutex);
6284 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6285 mutex_unlock(&ctx->mutex);
6286 }
6287 srcu_read_unlock(&pmus_srcu, idx);
6288}
6289
5740static void perf_event_exit_cpu(int cpu) 6290static void perf_event_exit_cpu(int cpu)
5741{ 6291{
5742 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6292 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5743 struct perf_event_context *ctx = &cpuctx->ctx;
5744 6293
5745 mutex_lock(&cpuctx->hlist_mutex); 6294 mutex_lock(&swhash->hlist_mutex);
5746 swevent_hlist_release(cpuctx); 6295 swevent_hlist_release(swhash);
5747 mutex_unlock(&cpuctx->hlist_mutex); 6296 mutex_unlock(&swhash->hlist_mutex);
5748 6297
5749 mutex_lock(&ctx->mutex); 6298 perf_event_exit_cpu_context(cpu);
5750 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5751 mutex_unlock(&ctx->mutex);
5752} 6299}
5753#else 6300#else
5754static inline void perf_event_exit_cpu(int cpu) { } 6301static inline void perf_event_exit_cpu(int cpu) { }
@@ -5778,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5778 return NOTIFY_OK; 6325 return NOTIFY_OK;
5779} 6326}
5780 6327
5781/*
5782 * This has to have a higher priority than migration_notifier in sched.c.
5783 */
5784static struct notifier_block __cpuinitdata perf_cpu_nb = {
5785 .notifier_call = perf_cpu_notify,
5786 .priority = 20,
5787};
5788
5789void __init perf_event_init(void) 6328void __init perf_event_init(void)
5790{ 6329{
5791 perf_event_init_all_cpus(); 6330 perf_event_init_all_cpus();
5792 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6331 init_srcu_struct(&pmus_srcu);
5793 (void *)(long)smp_processor_id()); 6332 perf_pmu_register(&perf_swevent);
5794 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6333 perf_pmu_register(&perf_cpu_clock);
5795 (void *)(long)smp_processor_id()); 6334 perf_pmu_register(&perf_task_clock);
5796 register_cpu_notifier(&perf_cpu_nb); 6335 perf_tp_register();
5797} 6336 perf_cpu_notifier(perf_cpu_notify);
5798
5799static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5800 struct sysdev_class_attribute *attr,
5801 char *buf)
5802{
5803 return sprintf(buf, "%d\n", perf_reserved_percpu);
5804}
5805
5806static ssize_t
5807perf_set_reserve_percpu(struct sysdev_class *class,
5808 struct sysdev_class_attribute *attr,
5809 const char *buf,
5810 size_t count)
5811{
5812 struct perf_cpu_context *cpuctx;
5813 unsigned long val;
5814 int err, cpu, mpt;
5815
5816 err = strict_strtoul(buf, 10, &val);
5817 if (err)
5818 return err;
5819 if (val > perf_max_events)
5820 return -EINVAL;
5821
5822 spin_lock(&perf_resource_lock);
5823 perf_reserved_percpu = val;
5824 for_each_online_cpu(cpu) {
5825 cpuctx = &per_cpu(perf_cpu_context, cpu);
5826 raw_spin_lock_irq(&cpuctx->ctx.lock);
5827 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5828 perf_max_events - perf_reserved_percpu);
5829 cpuctx->max_pertask = mpt;
5830 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5831 }
5832 spin_unlock(&perf_resource_lock);
5833
5834 return count;
5835}
5836
5837static ssize_t perf_show_overcommit(struct sysdev_class *class,
5838 struct sysdev_class_attribute *attr,
5839 char *buf)
5840{
5841 return sprintf(buf, "%d\n", perf_overcommit);
5842}
5843
5844static ssize_t
5845perf_set_overcommit(struct sysdev_class *class,
5846 struct sysdev_class_attribute *attr,
5847 const char *buf, size_t count)
5848{
5849 unsigned long val;
5850 int err;
5851
5852 err = strict_strtoul(buf, 10, &val);
5853 if (err)
5854 return err;
5855 if (val > 1)
5856 return -EINVAL;
5857
5858 spin_lock(&perf_resource_lock);
5859 perf_overcommit = val;
5860 spin_unlock(&perf_resource_lock);
5861
5862 return count;
5863}
5864
5865static SYSDEV_CLASS_ATTR(
5866 reserve_percpu,
5867 0644,
5868 perf_show_reserve_percpu,
5869 perf_set_reserve_percpu
5870 );
5871
5872static SYSDEV_CLASS_ATTR(
5873 overcommit,
5874 0644,
5875 perf_show_overcommit,
5876 perf_set_overcommit
5877 );
5878
5879static struct attribute *perfclass_attrs[] = {
5880 &attr_reserve_percpu.attr,
5881 &attr_overcommit.attr,
5882 NULL
5883};
5884
5885static struct attribute_group perfclass_attr_group = {
5886 .attrs = perfclass_attrs,
5887 .name = "perf_events",
5888};
5889
5890static int __init perf_event_sysfs_init(void)
5891{
5892 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5893 &perfclass_attr_group);
5894} 6337}
5895device_initcall(perf_event_sysfs_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index 0bba34a48d10..5a5cc33e4999 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3584,7 +3584,7 @@ void scheduler_tick(void)
3584 curr->sched_class->task_tick(rq, curr, 0); 3584 curr->sched_class->task_tick(rq, curr, 0);
3585 raw_spin_unlock(&rq->lock); 3585 raw_spin_unlock(&rq->lock);
3586 3586
3587 perf_event_task_tick(curr); 3587 perf_event_task_tick();
3588 3588
3589#ifdef CONFIG_SMP 3589#ifdef CONFIG_SMP
3590 rq->idle_at_tick = idle_cpu(cpu); 3590 rq->idle_at_tick = idle_cpu(cpu);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
118 kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 118 /* addr and flags should be cleard for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
213 jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 215 /* addr and flags should be cleard for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
326 rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 330 /* addr and flags should be cleard for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
1279 run_local_timers(); 1279 run_local_timers();
1280 rcu_check_callbacks(cpu, user_tick); 1280 rcu_check_callbacks(cpu, user_tick);
1281 printk_tick(); 1281 printk_tick();
1282 perf_event_do_pending(); 1282#ifdef CONFIG_IRQ_WORK
1283 if (in_irq())
1284 irq_work_run();
1285#endif
1283 scheduler_tick(); 1286 scheduler_tick();
1284 run_posix_cpu_timers(p); 1287 run_posix_cpu_timers(p);
1285} 1288}
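perf_event_do_pending() disappears from the tick because pending perf work now rides the generic irq_work layer introduced by this series; anything queued from NMI context is run from the next hard interrupt. A hedged sketch of the pattern (names are ours; it assumes <linux/irq_work.h> with the two-argument init_irq_work() used in the perf_event_alloc() hunk above):

#include <linux/kernel.h>
#include <linux/irq_work.h>

static void demo_work_func(struct irq_work *work)
{
        /* Runs in hard interrupt context, soon after being queued. */
        pr_info("irq_work ran\n");
}

static struct irq_work demo_work;

static void demo_setup(void)
{
        init_irq_work(&demo_work, demo_work_func);
}

static void demo_poke(void)
{
        /* Intended to be safe even from NMI context; re-queueing while pending is a no-op. */
        irq_work_queue(&demo_work);
}
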
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..e550d2eda1df 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..ebd80d50c474 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
884 FTRACE_ENABLE_CALLS = (1 << 0), 884 FTRACE_ENABLE_CALLS = (1 << 0),
885 FTRACE_DISABLE_CALLS = (1 << 1), 885 FTRACE_DISABLE_CALLS = (1 << 1),
886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
887 FTRACE_ENABLE_MCOUNT = (1 << 3), 887 FTRACE_START_FUNC_RET = (1 << 3),
888 FTRACE_DISABLE_MCOUNT = (1 << 4), 888 FTRACE_STOP_FUNC_RET = (1 << 4),
889 FTRACE_START_FUNC_RET = (1 << 5),
890 FTRACE_STOP_FUNC_RET = (1 << 6),
891}; 889};
892 890
893static int ftrace_filtered; 891static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
1226 1224
1227static void ftrace_startup_sysctl(void) 1225static void ftrace_startup_sysctl(void)
1228{ 1226{
1229 int command = FTRACE_ENABLE_MCOUNT;
1230
1231 if (unlikely(ftrace_disabled)) 1227 if (unlikely(ftrace_disabled))
1232 return; 1228 return;
1233 1229
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
1235 saved_ftrace_func = NULL; 1231 saved_ftrace_func = NULL;
1236 /* ftrace_start_up is true if we want ftrace running */ 1232 /* ftrace_start_up is true if we want ftrace running */
1237 if (ftrace_start_up) 1233 if (ftrace_start_up)
1238 command |= FTRACE_ENABLE_CALLS; 1234 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1239
1240 ftrace_run_update_code(command);
1241} 1235}
1242 1236
1243static void ftrace_shutdown_sysctl(void) 1237static void ftrace_shutdown_sysctl(void)
1244{ 1238{
1245 int command = FTRACE_DISABLE_MCOUNT;
1246
1247 if (unlikely(ftrace_disabled)) 1239 if (unlikely(ftrace_disabled))
1248 return; 1240 return;
1249 1241
1250 /* ftrace_start_up is true if ftrace is running */ 1242 /* ftrace_start_up is true if ftrace is running */
1251 if (ftrace_start_up) 1243 if (ftrace_start_up)
1252 command |= FTRACE_DISABLE_CALLS; 1244 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1253
1254 ftrace_run_update_code(command);
1255} 1245}
1256 1246
1257static cycle_t ftrace_update_time; 1247static cycle_t ftrace_update_time;
@@ -1368,24 +1358,29 @@ enum {
1368#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1358#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1369 1359
1370struct ftrace_iterator { 1360struct ftrace_iterator {
1371 struct ftrace_page *pg; 1361 loff_t pos;
1372 int hidx; 1362 loff_t func_pos;
1373 int idx; 1363 struct ftrace_page *pg;
1374 unsigned flags; 1364 struct dyn_ftrace *func;
1375 struct trace_parser parser; 1365 struct ftrace_func_probe *probe;
1366 struct trace_parser parser;
1367 int hidx;
1368 int idx;
1369 unsigned flags;
1376}; 1370};
1377 1371
1378static void * 1372static void *
1379t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1373t_hash_next(struct seq_file *m, loff_t *pos)
1380{ 1374{
1381 struct ftrace_iterator *iter = m->private; 1375 struct ftrace_iterator *iter = m->private;
1382 struct hlist_node *hnd = v; 1376 struct hlist_node *hnd = NULL;
1383 struct hlist_head *hhd; 1377 struct hlist_head *hhd;
1384 1378
1385 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1386
1387 (*pos)++; 1379 (*pos)++;
1380 iter->pos = *pos;
1388 1381
1382 if (iter->probe)
1383 hnd = &iter->probe->node;
1389 retry: 1384 retry:
1390 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1385 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1391 return NULL; 1386 return NULL;
@@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1408 } 1403 }
1409 } 1404 }
1410 1405
1411 return hnd; 1406 if (WARN_ON_ONCE(!hnd))
1407 return NULL;
1408
1409 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1410
1411 return iter;
1412} 1412}
1413 1413
1414static void *t_hash_start(struct seq_file *m, loff_t *pos) 1414static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417 void *p = NULL; 1417 void *p = NULL;
1418 loff_t l; 1418 loff_t l;
1419 1419
1420 if (!(iter->flags & FTRACE_ITER_HASH)) 1420 if (iter->func_pos > *pos)
1421 *pos = 0; 1421 return NULL;
1422
1423 iter->flags |= FTRACE_ITER_HASH;
1424 1422
1425 iter->hidx = 0; 1423 iter->hidx = 0;
1426 for (l = 0; l <= *pos; ) { 1424 for (l = 0; l <= (*pos - iter->func_pos); ) {
1427 p = t_hash_next(m, p, &l); 1425 p = t_hash_next(m, &l);
1428 if (!p) 1426 if (!p)
1429 break; 1427 break;
1430 } 1428 }
1431 return p; 1429 if (!p)
1430 return NULL;
1431
1432 /* Only set this if we have an item */
1433 iter->flags |= FTRACE_ITER_HASH;
1434
1435 return iter;
1432} 1436}
1433 1437
1434static int t_hash_show(struct seq_file *m, void *v) 1438static int
1439t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1435{ 1440{
1436 struct ftrace_func_probe *rec; 1441 struct ftrace_func_probe *rec;
1437 struct hlist_node *hnd = v;
1438 1442
1439 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1443 rec = iter->probe;
1444 if (WARN_ON_ONCE(!rec))
1445 return -EIO;
1440 1446
1441 if (rec->ops->print) 1447 if (rec->ops->print)
1442 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1448 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1457 struct dyn_ftrace *rec = NULL; 1463 struct dyn_ftrace *rec = NULL;
1458 1464
1459 if (iter->flags & FTRACE_ITER_HASH) 1465 if (iter->flags & FTRACE_ITER_HASH)
1460 return t_hash_next(m, v, pos); 1466 return t_hash_next(m, pos);
1461 1467
1462 (*pos)++; 1468 (*pos)++;
1469 iter->pos = *pos;
1463 1470
1464 if (iter->flags & FTRACE_ITER_PRINTALL) 1471 if (iter->flags & FTRACE_ITER_PRINTALL)
1465 return NULL; 1472 return t_hash_start(m, pos);
1466 1473
1467 retry: 1474 retry:
1468 if (iter->idx >= iter->pg->index) { 1475 if (iter->idx >= iter->pg->index) {
@@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1491 } 1498 }
1492 } 1499 }
1493 1500
1494 return rec; 1501 if (!rec)
1502 return t_hash_start(m, pos);
1503
1504 iter->func_pos = *pos;
1505 iter->func = rec;
1506
1507 return iter;
1508}
1509
1510static void reset_iter_read(struct ftrace_iterator *iter)
1511{
1512 iter->pos = 0;
1513 iter->func_pos = 0;
1514 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
1495} 1515}
1496 1516
1497static void *t_start(struct seq_file *m, loff_t *pos) 1517static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1502,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1502 1522
1503 mutex_lock(&ftrace_lock); 1523 mutex_lock(&ftrace_lock);
1504 /* 1524 /*
1525 * If an lseek was done, then reset and start from beginning.
1526 */
1527 if (*pos < iter->pos)
1528 reset_iter_read(iter);
1529
1530 /*
1505 * For set_ftrace_filter reading, if we have the filter 1531 * For set_ftrace_filter reading, if we have the filter
1506 * off, we can short cut and just print out that all 1532 * off, we can short cut and just print out that all
1507 * functions are enabled. 1533 * functions are enabled.
@@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1518 if (iter->flags & FTRACE_ITER_HASH) 1544 if (iter->flags & FTRACE_ITER_HASH)
1519 return t_hash_start(m, pos); 1545 return t_hash_start(m, pos);
1520 1546
1547 /*
1548 * Unfortunately, we need to restart at ftrace_pages_start
1549 * every time we let go of the ftrace_mutex. This is because
1550 * those pointers can change without the lock.
1551 */
1521 iter->pg = ftrace_pages_start; 1552 iter->pg = ftrace_pages_start;
1522 iter->idx = 0; 1553 iter->idx = 0;
1523 for (l = 0; l <= *pos; ) { 1554 for (l = 0; l <= *pos; ) {
@@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1526 break; 1557 break;
1527 } 1558 }
1528 1559
1529 if (!p && iter->flags & FTRACE_ITER_FILTER) 1560 if (!p) {
1530 return t_hash_start(m, pos); 1561 if (iter->flags & FTRACE_ITER_FILTER)
1562 return t_hash_start(m, pos);
1531 1563
1532 return p; 1564 return NULL;
1565 }
1566
1567 return iter;
1533} 1568}
1534 1569
1535static void t_stop(struct seq_file *m, void *p) 1570static void t_stop(struct seq_file *m, void *p)
@@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
1540static int t_show(struct seq_file *m, void *v) 1575static int t_show(struct seq_file *m, void *v)
1541{ 1576{
1542 struct ftrace_iterator *iter = m->private; 1577 struct ftrace_iterator *iter = m->private;
1543 struct dyn_ftrace *rec = v; 1578 struct dyn_ftrace *rec;
1544 1579
1545 if (iter->flags & FTRACE_ITER_HASH) 1580 if (iter->flags & FTRACE_ITER_HASH)
1546 return t_hash_show(m, v); 1581 return t_hash_show(m, iter);
1547 1582
1548 if (iter->flags & FTRACE_ITER_PRINTALL) { 1583 if (iter->flags & FTRACE_ITER_PRINTALL) {
1549 seq_printf(m, "#### all functions enabled ####\n"); 1584 seq_printf(m, "#### all functions enabled ####\n");
1550 return 0; 1585 return 0;
1551 } 1586 }
1552 1587
1588 rec = iter->func;
1589
1553 if (!rec) 1590 if (!rec)
1554 return 0; 1591 return 0;
1555 1592
@@ -1601,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
1601 1638
1602 ret = ftrace_avail_open(inode, file); 1639 ret = ftrace_avail_open(inode, file);
1603 if (!ret) { 1640 if (!ret) {
1604 m = (struct seq_file *)file->private_data; 1641 m = file->private_data;
1605 iter = (struct ftrace_iterator *)m->private; 1642 iter = m->private;
1606 iter->flags = FTRACE_ITER_FAILURES; 1643 iter->flags = FTRACE_ITER_FAILURES;
1607 } 1644 }
1608 1645
@@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = {
2418 .open = ftrace_filter_open, 2455 .open = ftrace_filter_open,
2419 .read = seq_read, 2456 .read = seq_read,
2420 .write = ftrace_filter_write, 2457 .write = ftrace_filter_write,
2421 .llseek = no_llseek, 2458 .llseek = ftrace_regex_lseek,
2422 .release = ftrace_filter_release, 2459 .release = ftrace_filter_release,
2423}; 2460};
2424 2461
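The ftrace_iterator rework makes the set_ftrace_filter iterator remember its own position (pos, func_pos, func, probe) so the file can support ftrace_regex_lseek instead of no_llseek, and so the probe-hash entries follow the function list as one continuous sequence. A standalone userspace sketch of that two-stage, position-tracking walk; the function and probe names are made up:

#include <stdio.h>
#include <string.h>

static const char *funcs[]  = { "do_fork", "schedule", "vfs_read" };
static const char *probes[] = { "traceon:do_fork", "traceoff:schedule" };
#define NFUNCS  3
#define NPROBES 2

/* Remembers the last position served (pos) and where the function list
 * ended (func_pos), like the reworked ftrace_iterator above. */
struct iter {
	long pos;
	long func_pos;
};

static void iter_start(struct iter *it, long pos)
{
	/* A seek backwards simply resets the iterator and restarts the walk. */
	if (pos < it->pos)
		memset(it, 0, sizeof(*it));
}

static const char *iter_next(struct iter *it, long pos)
{
	it->pos = pos;
	if (pos < NFUNCS) {
		it->func_pos = NFUNCS;		/* stage one: plain functions */
		return funcs[pos];
	}
	if (pos - it->func_pos < NPROBES)	/* stage two: probe hash */
		return probes[pos - it->func_pos];
	return NULL;
}

int main(void)
{
	struct iter it = { 0, 0 };
	const char *s;
	long pos;

	iter_start(&it, 0);
	for (pos = 0; (s = iter_next(&it, pos)) != NULL; pos++)
		printf("%ld: %s\n", pos, s);
	return 0;
}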
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bca96377fd4e..c5a632a669e1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2606} 2606}
2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2608 2608
2609/*
2610 * The total entries in the ring buffer is the running counter
2611 * of entries entered into the ring buffer, minus the sum of
2612 * the entries read from the ring buffer and the number of
2613 * entries that were overwritten.
2614 */
2615static inline unsigned long
2616rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2617{
2618 return local_read(&cpu_buffer->entries) -
2619 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2620}
2621
2609/** 2622/**
2610 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2623 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2611 * @buffer: The ring buffer 2624 * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2614unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2627unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2615{ 2628{
2616 struct ring_buffer_per_cpu *cpu_buffer; 2629 struct ring_buffer_per_cpu *cpu_buffer;
2617 unsigned long ret;
2618 2630
2619 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2631 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2620 return 0; 2632 return 0;
2621 2633
2622 cpu_buffer = buffer->buffers[cpu]; 2634 cpu_buffer = buffer->buffers[cpu];
2623 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2624 - cpu_buffer->read;
2625 2635
2626 return ret; 2636 return rb_num_of_entries(cpu_buffer);
2627} 2637}
2628EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2629 2639
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684 /* if you care about this being correct, lock the buffer */ 2694 /* if you care about this being correct, lock the buffer */
2685 for_each_buffer_cpu(buffer, cpu) { 2695 for_each_buffer_cpu(buffer, cpu) {
2686 cpu_buffer = buffer->buffers[cpu]; 2696 cpu_buffer = buffer->buffers[cpu];
2687 entries += (local_read(&cpu_buffer->entries) - 2697 entries += rb_num_of_entries(cpu_buffer);
2688 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2689 } 2698 }
2690 2699
2691 return entries; 2700 return entries;
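rb_num_of_entries() centralizes the bookkeeping both callers previously open-coded: the entries still in the buffer equal the running count of entries written minus those overwritten and those already read. A tiny standalone check of that arithmetic with illustrative numbers:

#include <assert.h>

/* entries = written - (overwritten + read), mirroring rb_num_of_entries() */
static unsigned long num_entries(unsigned long written,
				 unsigned long overrun,
				 unsigned long read)
{
	return written - (overrun + read);
}

int main(void)
{
	/* 100 events written, 10 lost to overwrite, 25 already consumed:
	 * 65 remain readable in the buffer. */
	assert(num_entries(100, 10, 25) == 65);
	return 0;
}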
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..001bcd2ccf4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2196 2196
2197static int tracing_release(struct inode *inode, struct file *file) 2197static int tracing_release(struct inode *inode, struct file *file)
2198{ 2198{
2199 struct seq_file *m = (struct seq_file *)file->private_data; 2199 struct seq_file *m = file->private_data;
2200 struct trace_iterator *iter; 2200 struct trace_iterator *iter;
2201 int cpu; 2201 int cpu;
2202 2202
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
343 unsigned long ip, 343 unsigned long ip,
344 unsigned long parent_ip, 344 unsigned long parent_ip,
345 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
346void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
347void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
348int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 31cc4cb0dbf2..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12static char *perf_trace_buf[4]; 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 13
14/* 14/*
15 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int total_ref_count;
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 struct hlist_head *list; 27 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 28 int ret = -ENOMEM;
29 int cpu; 29 int cpu;
30 30
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
42 tp_event->perf_events = list; 42 tp_event->perf_events = list;
43 43
44 if (!total_ref_count) { 44 if (!total_ref_count) {
45 char *buf; 45 char __percpu *buf;
46 int i; 46 int i;
47 47
48 for (i = 0; i < 4; i++) { 48 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
49 buf = (char *)alloc_percpu(perf_trace_t); 49 buf = (char __percpu *)alloc_percpu(perf_trace_t);
50 if (!buf) 50 if (!buf)
51 goto fail; 51 goto fail;
52 52
@@ -65,7 +65,7 @@ fail:
65 if (!total_ref_count) { 65 if (!total_ref_count) {
66 int i; 66 int i;
67 67
68 for (i = 0; i < 4; i++) { 68 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
69 free_percpu(perf_trace_buf[i]); 69 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL; 70 perf_trace_buf[i] = NULL;
71 } 71 }
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
101 return ret; 101 return ret;
102} 102}
103 103
104int perf_trace_enable(struct perf_event *p_event) 104int perf_trace_add(struct perf_event *p_event, int flags)
105{ 105{
106 struct ftrace_event_call *tp_event = p_event->tp_event; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
107 struct hlist_head __percpu *pcpu_list;
107 struct hlist_head *list; 108 struct hlist_head *list;
108 109
109 list = tp_event->perf_events; 110 pcpu_list = tp_event->perf_events;
110 if (WARN_ON_ONCE(!list)) 111 if (WARN_ON_ONCE(!pcpu_list))
111 return -EINVAL; 112 return -EINVAL;
112 113
113 list = this_cpu_ptr(list); 114 if (!(flags & PERF_EF_START))
115 p_event->hw.state = PERF_HES_STOPPED;
116
117 list = this_cpu_ptr(pcpu_list);
114 hlist_add_head_rcu(&p_event->hlist_entry, list); 118 hlist_add_head_rcu(&p_event->hlist_entry, list);
115 119
116 return 0; 120 return 0;
117} 121}
118 122
119void perf_trace_disable(struct perf_event *p_event) 123void perf_trace_del(struct perf_event *p_event, int flags)
120{ 124{
121 hlist_del_rcu(&p_event->hlist_entry); 125 hlist_del_rcu(&p_event->hlist_entry);
122} 126}
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event)
142 tp_event->perf_events = NULL; 146 tp_event->perf_events = NULL;
143 147
144 if (!--total_ref_count) { 148 if (!--total_ref_count) {
145 for (i = 0; i < 4; i++) { 149 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
146 free_percpu(perf_trace_buf[i]); 150 free_percpu(perf_trace_buf[i]);
147 perf_trace_buf[i] = NULL; 151 perf_trace_buf[i] = NULL;
148 } 152 }
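perf_trace_enable()/perf_trace_disable() become perf_trace_add()/perf_trace_del() to match the reworked struct pmu callbacks elsewhere in this merge: add() may be asked to schedule an event in without starting it, in which case the event is parked in PERF_HES_STOPPED. A rough sketch of a PMU following that convention; the dummy_* names are invented and the per-cpu list handling is elided:

#include <linux/perf_event.h>

/* Sketch of the add()/del() half of a software PMU: add() schedules the
 * event in but leaves it stopped unless PERF_EF_START was passed. */
static int dummy_pmu_add(struct perf_event *event, int flags)
{
	event->hw.state = 0;
	if (!(flags & PERF_EF_START))
		event->hw.state = PERF_HES_STOPPED;

	/* ... link the event into this cpu's active list here ... */
	return 0;
}

static void dummy_pmu_del(struct perf_event *event, int flags)
{
	/* ... unlink from the active list and stop counting ... */
}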
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..398c0e8b332c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
600 600
601enum { 601enum {
602 FORMAT_HEADER = 1, 602 FORMAT_HEADER = 1,
603 FORMAT_PRINTFMT = 2, 603 FORMAT_FIELD_SEPERATOR = 2,
604 FORMAT_PRINTFMT = 3,
604}; 605};
605 606
606static void *f_next(struct seq_file *m, void *v, loff_t *pos) 607static void *f_next(struct seq_file *m, void *v, loff_t *pos)
607{ 608{
608 struct ftrace_event_call *call = m->private; 609 struct ftrace_event_call *call = m->private;
609 struct ftrace_event_field *field; 610 struct ftrace_event_field *field;
610 struct list_head *head; 611 struct list_head *common_head = &ftrace_common_fields;
612 struct list_head *head = trace_get_fields(call);
611 613
612 (*pos)++; 614 (*pos)++;
613 615
614 switch ((unsigned long)v) { 616 switch ((unsigned long)v) {
615 case FORMAT_HEADER: 617 case FORMAT_HEADER:
616 head = &ftrace_common_fields; 618 if (unlikely(list_empty(common_head)))
619 return NULL;
620
621 field = list_entry(common_head->prev,
622 struct ftrace_event_field, link);
623 return field;
617 624
625 case FORMAT_FIELD_SEPERATOR:
618 if (unlikely(list_empty(head))) 626 if (unlikely(list_empty(head)))
619 return NULL; 627 return NULL;
620 628
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
626 return NULL; 634 return NULL;
627 } 635 }
628 636
629 head = trace_get_fields(call);
630
631 /*
632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
636
637 field = v; 637 field = v;
638 /* 638 if (field->link.prev == common_head)
639 * If this is a common field, and at the end of the list, then 639 return (void *)FORMAT_FIELD_SEPERATOR;
640 * continue with main list. 640 else if (field->link.prev == head)
641 */
642 if (field->link.prev == &ftrace_common_fields) {
643 if (unlikely(list_empty(head)))
644 return NULL;
645 field = list_entry(head->prev, struct ftrace_event_field, link);
646 /* Set the LSB to notify f_show to print an extra newline */
647 field = (struct ftrace_event_field *)
648 ((unsigned long)field | 1);
649 return field;
650 }
651
652 /* If we are done tell f_show to print the format */
653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT; 641 return (void *)FORMAT_PRINTFMT;
655 642
656 field = list_entry(field->link.prev, struct ftrace_event_field, link); 643 field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
688 seq_printf(m, "format:\n"); 675 seq_printf(m, "format:\n");
689 return 0; 676 return 0;
690 677
678 case FORMAT_FIELD_SEPERATOR:
679 seq_putc(m, '\n');
680 return 0;
681
691 case FORMAT_PRINTFMT: 682 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n", 683 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt); 684 call->print_fmt);
694 return 0; 685 return 0;
695 } 686 }
696 687
697 /*
698 * To separate common fields from event fields, the
699 * LSB is set on the first event field. Clear it and
700 * print a newline if it is set.
701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v; 688 field = v;
708 689
709 /* 690 /*
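The f_next()/f_show() rewrite drops the LSB-tagging trick and instead steps through explicit sentinels: FORMAT_HEADER, then the common fields, then FORMAT_FIELD_SEPERATOR (printed as a blank line), then the event fields, then FORMAT_PRINTFMT. A standalone userspace sketch of the resulting output shape, with made-up sched_switch-style field names:

#include <stdio.h>

static const char *common_fields[] = { "unsigned short common_type", "int common_pid" };
static const char *event_fields[]  = { "pid_t prev_pid", "pid_t next_pid" };

int main(void)
{
	unsigned int i;

	/* FORMAT_HEADER */
	printf("format:\n");
	for (i = 0; i < 2; i++)
		printf("\tfield:%s;\n", common_fields[i]);

	/* FORMAT_FIELD_SEPERATOR: one blank line between the two blocks */
	printf("\n");
	for (i = 0; i < 2; i++)
		printf("\tfield:%s;\n", event_fields[i]);

	/* FORMAT_PRINTFMT */
	printf("\nprint fmt: \"prev_pid=%%d next_pid=%%d\"\n");
	return 0;
}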
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518e..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
247} 263}
248 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
249void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
251 unsigned long flags, 295 unsigned long flags,
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
649 693
650 /* Print nsecs (we don't want to exceed 7 numbers) */ 694 /* Print nsecs (we don't want to exceed 7 numbers) */
651 if (len < 7) { 695 if (len < 7) {
652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", 696 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
653 nsecs_rem); 697
698 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
654 ret = trace_seq_printf(s, ".%s", nsecs_str); 699 ret = trace_seq_printf(s, ".%s", nsecs_str);
655 if (!ret) 700 if (!ret)
656 return TRACE_TYPE_PARTIAL_LINE; 701 return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
855 return 0; 900 return 0;
856} 901}
857 902
903/*
904 * Entry check for irq code
905 *
906 * returns 1 if
907 * - we are inside irq code
908 * - we just entered irq code
909 *
910 * returns 0 if
911 * - funcgraph-interrupts option is set
912 * - we are not inside irq code
913 */
914static int
915check_irq_entry(struct trace_iterator *iter, u32 flags,
916 unsigned long addr, int depth)
917{
918 int cpu = iter->cpu;
919 int *depth_irq;
920 struct fgraph_data *data = iter->private;
921
922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
929 return 0;
930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
933 /*
934 * We are inside the irq code
935 */
936 if (*depth_irq >= 0)
937 return 1;
938
939 if ((addr < (unsigned long)__irqentry_text_start) ||
940 (addr >= (unsigned long)__irqentry_text_end))
941 return 0;
942
943 /*
944 * We are entering irq code.
945 */
946 *depth_irq = depth;
947 return 1;
948}
949
950/*
951 * Return check for irq code
952 *
953 * returns 1 if
954 * - we are inside irq code
955 * - we just left irq code
956 *
957 * returns 0 if
958 * - funcgraph-interrupts option is set
959 * - we are not inside irq code
960 */
961static int
962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
963{
964 int cpu = iter->cpu;
965 int *depth_irq;
966 struct fgraph_data *data = iter->private;
967
968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
975 return 0;
976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
979 /*
980 * We are not inside the irq code.
981 */
982 if (*depth_irq == -1)
983 return 0;
984
985 /*
986 * We are inside the irq code, and this is returning entry.
987 * Let's not trace it and clear the entry depth, since
988 * we are out of irq code.
989 *
990 * This condition ensures that we 'leave the irq code' once
991 * we are out of the entry depth. Thus protecting us from
992 * the RETURN entry loss.
993 */
994 if (*depth_irq >= depth) {
995 *depth_irq = -1;
996 return 1;
997 }
998
999 /*
1000 * We are inside the irq code, and this is not the entry.
1001 */
1002 return 1;
1003}
1004
858static enum print_line_t 1005static enum print_line_t
859print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 1006print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
860 struct trace_iterator *iter, u32 flags) 1007 struct trace_iterator *iter, u32 flags)
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
865 static enum print_line_t ret; 1012 static enum print_line_t ret;
866 int cpu = iter->cpu; 1013 int cpu = iter->cpu;
867 1014
1015 if (check_irq_entry(iter, flags, call->func, call->depth))
1016 return TRACE_TYPE_HANDLED;
1017
868 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1018 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
869 return TRACE_TYPE_PARTIAL_LINE; 1019 return TRACE_TYPE_PARTIAL_LINE;
870 1020
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
902 int ret; 1052 int ret;
903 int i; 1053 int i;
904 1054
1055 if (check_irq_return(iter, flags, trace->depth))
1056 return TRACE_TYPE_HANDLED;
1057
905 if (data) { 1058 if (data) {
906 struct fgraph_cpu_data *cpu_data; 1059 struct fgraph_cpu_data *cpu_data;
907 int cpu = iter->cpu; 1060 int cpu = iter->cpu;
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1054 1207
1055 1208
1056enum print_line_t 1209enum print_line_t
1057print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1058{ 1211{
1059 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1060 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1117static enum print_line_t 1270static enum print_line_t
1118print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1119{ 1272{
1120 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1121} 1285}
1122 1286
1123static enum print_line_t 1287static enum print_line_t
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1149 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1150} 1314}
1151 1315
1152void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1153{ 1317{
1154 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1155 1319
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1190 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1191} 1355}
1192 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1193void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1194{ 1375{
1195 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
1210 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1391 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1211 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1392 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1212 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1393 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1394 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1395
1213 *pid = -1; 1396 *pid = -1;
1214 *depth = 0; 1397 *depth = 0;
1215 *ignore = 0; 1398 *ignore = 0;
1399 *depth_irq = -1;
1216 } 1400 }
1217 1401
1218 iter->private = data; 1402 iter->private = data;
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
1235 } 1419 }
1236} 1420}
1237 1421
1422static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1423{
1424 if (bit == TRACE_GRAPH_PRINT_IRQS)
1425 ftrace_graph_skip_irqs = !set;
1426
1427 return 0;
1428}
1429
1238static struct trace_event_functions graph_functions = { 1430static struct trace_event_functions graph_functions = {
1239 .trace = print_graph_function_event, 1431 .trace = print_graph_function_event,
1240}; 1432};
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
1261 .print_line = print_graph_function, 1453 .print_line = print_graph_function,
1262 .print_header = print_graph_headers, 1454 .print_header = print_graph_headers,
1263 .flags = &tracer_flags, 1455 .flags = &tracer_flags,
1456 .set_flag = func_graph_set_flag,
1264#ifdef CONFIG_FTRACE_SELFTEST 1457#ifdef CONFIG_FTRACE_SELFTEST
1265 .selftest = trace_selftest_startup_function_graph, 1458 .selftest = trace_selftest_startup_function_graph,
1266#endif 1459#endif
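check_irq_entry()/check_irq_return() suppress everything between an interrupt entry and its matching return by remembering, per cpu, the depth at which irq code was entered (-1 meaning not inside it). A standalone userspace sketch of that depth_irq bookkeeping with a made-up call sequence:

#include <stdio.h>

/* -1 means "not inside irq code"; otherwise the depth where we entered it. */
static int depth_irq = -1;

static int skip_entry(int is_irq_func, int depth)
{
	if (depth_irq >= 0)
		return 1;		/* already inside irq code: skip */
	if (!is_irq_func)
		return 0;		/* normal function: print it */
	depth_irq = depth;		/* entering irq code: remember depth */
	return 1;
}

static int skip_return(int depth)
{
	if (depth_irq == -1)
		return 0;		/* not in irq code: print it */
	if (depth_irq >= depth)
		depth_irq = -1;		/* matching return: we left irq code */
	return 1;			/* either way, don't print this one */
}

int main(void)
{
	/* entry(depth=1, normal), entry(depth=2, irq), entry(depth=3, normal),
	 * then the matching returns at depths 3, 2, 1 */
	printf("%d %d %d\n", skip_entry(0, 1), skip_entry(1, 2), skip_entry(0, 3));
	printf("%d %d %d\n", skip_return(3), skip_return(2), skip_return(1));
	/* prints "0 1 1" and "1 1 0": only the outermost call is shown */
	return 0;
}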
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 183 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 184 struct trace_array_cpu *data;
157 unsigned long flags; 185 unsigned long flags;
158 long disabled;
159 int ret; 186 int ret;
160 int cpu;
161 int pc; 187 int pc;
162 188
163 cpu = raw_smp_processor_id(); 189 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0; 190 return 0;
166 191
167 local_save_flags(flags); 192 pc = preempt_count();
168 /* slight chance to get a false positive on tracing_cpu */ 193 ret = __trace_graph_entry(tr, trace, flags, pc);
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 194 atomic_dec(&data->disabled);
195
182 return ret; 196 return ret;
183} 197}
184 198
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 201 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 202 struct trace_array_cpu *data;
189 unsigned long flags; 203 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 204 int pc;
193 205
194 cpu = raw_smp_processor_id(); 206 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return; 207 return;
197 208
198 local_save_flags(flags); 209 pc = preempt_count();
199 /* slight chance to get a false positive on tracing_cpu */ 210 __trace_graph_return(tr, trace, flags, pc);
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 211 atomic_dec(&data->disabled);
212} 212}
213 213
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 232 /*
240 * In graph mode call the graph tracer output function, 233 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 234 * otherwise go with the TRACE_FN event handler
242 */ 235 */
243 if (is_graph()) 236 if (is_graph())
244 return print_graph_function_flags(iter, flags); 237 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 238
246 return TRACE_TYPE_UNHANDLED; 239 return TRACE_TYPE_UNHANDLED;
247} 240}
248 241
249static void irqsoff_print_header(struct seq_file *s) 242static void irqsoff_print_header(struct seq_file *s)
250{ 243{
251 if (is_graph()) { 244 if (is_graph())
252 struct trace_iterator *iter = s->private; 245 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 246 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 247 trace_default_header(s);
268} 248}
269 249
270static void 250static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 251__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 252 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 253 unsigned long flags, int pc)
294{ 254{
295 if (!is_graph()) 255 if (is_graph())
256 trace_graph_function(tr, ip, parent_ip, flags, pc);
257 else
296 trace_function(tr, ip, parent_ip, flags, pc); 258 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 259}
302 260
303#else 261#else
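func_prolog_dec() folds the common prologue of the three hooks into one place: bail out unless this cpu is tracing and irqs really are off, then bump the per-cpu disabled counter and proceed only if this is the outermost, non-recursive entry. A standalone userspace sketch of that disabled-counter recursion guard:

#include <stdio.h>

static int disabled;	/* a per-cpu atomic_t in the real code */

static int prolog(void)
{
	if (++disabled == 1)
		return 1;	/* first entry: OK to trace, stays incremented */
	--disabled;		/* nested entry: undo and tell caller to bail */
	return 0;
}

static void epilog(void)
{
	--disabled;		/* matches the increment kept by prolog() */
}

static void trace_something(void)
{
	if (!prolog())
		return;
	printf("traced (disabled=%d)\n", disabled);
	/* a nested call from here sees disabled == 2 and bails out */
	if (!prolog())
		printf("nested call correctly ignored\n");
	else
		epilog();
	epilog();
}

int main(void)
{
	trace_something();
	return 0;
}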
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
63 * Returns 1 if it is OK to continue, and preemption
64 * is disabled and data->disabled is incremented.
65 * 0 if the trace is to be ignored, and preemption
66 * is not disabled and data->disabled is
67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int cpu; 80 int cpu;
50 int pc;
51 81
52 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
53 return; 83 return 0;
54 84
55 pc = preempt_count(); 85 *pc = preempt_count();
56 preempt_disable_notrace(); 86 preempt_disable_notrace();
57 87
58 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
59 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
60 goto out_enable; 90 goto out_enable;
61 91
62 data = tr->data[cpu]; 92 *data = tr->data[cpu];
63 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
64 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
65 goto out; 95 goto out;
66 96
67 local_irq_save(flags); 97 return 1;
68 98
69 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
70 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
71 local_irq_restore(flags); 123 local_irq_restore(flags);
72 124
73 out:
74 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
75 out_enable:
76 preempt_enable_notrace(); 126 preempt_enable_notrace();
77} 127}
78 128
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
82}; 132};
83#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
84 134
135static int start_func_tracer(int graph)
136{
137 int ret;
138
139 if (!graph)
140 ret = register_ftrace_function(&trace_ops);
141 else
142 ret = register_ftrace_graph(&wakeup_graph_return,
143 &wakeup_graph_entry);
144
145 if (!ret && tracing_is_enabled())
146 tracer_enabled = 1;
147 else
148 tracer_enabled = 0;
149
150 return ret;
151}
152
153static void stop_func_tracer(int graph)
154{
155 tracer_enabled = 0;
156
157 if (!graph)
158 unregister_ftrace_function(&trace_ops);
159 else
160 unregister_ftrace_graph();
161}
162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
165{
166
167 if (!(bit & TRACE_DISPLAY_GRAPH))
168 return -EINVAL;
169
170 if (!(is_graph() ^ set))
171 return 0;
172
173 stop_func_tracer(!set);
174
175 wakeup_reset(wakeup_trace);
176 tracing_max_latency = 0;
177
178 return start_func_tracer(set);
179}
180
181static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
182{
183 struct trace_array *tr = wakeup_trace;
184 struct trace_array_cpu *data;
185 unsigned long flags;
186 int pc, ret = 0;
187
188 if (!func_prolog_preempt_disable(tr, &data, &pc))
189 return 0;
190
191 local_save_flags(flags);
192 ret = __trace_graph_entry(tr, trace, flags, pc);
193 atomic_dec(&data->disabled);
194 preempt_enable_notrace();
195
196 return ret;
197}
198
199static void wakeup_graph_return(struct ftrace_graph_ret *trace)
200{
201 struct trace_array *tr = wakeup_trace;
202 struct trace_array_cpu *data;
203 unsigned long flags;
204 int pc;
205
206 if (!func_prolog_preempt_disable(tr, &data, &pc))
207 return;
208
209 local_save_flags(flags);
210 __trace_graph_return(tr, trace, flags, pc);
211 atomic_dec(&data->disabled);
212
213 preempt_enable_notrace();
214 return;
215}
216
217static void wakeup_trace_open(struct trace_iterator *iter)
218{
219 if (is_graph())
220 graph_trace_open(iter);
221}
222
223static void wakeup_trace_close(struct trace_iterator *iter)
224{
225 if (iter->private)
226 graph_trace_close(iter);
227}
228
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
230
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{
233 /*
234 * In graph mode call the graph tracer output function,
235 * otherwise go with the TRACE_FN event handler
236 */
237 if (is_graph())
238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
239
240 return TRACE_TYPE_UNHANDLED;
241}
242
243static void wakeup_print_header(struct seq_file *s)
244{
245 if (is_graph())
246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
247 else
248 trace_default_header(s);
249}
250
251static void
252__trace_function(struct trace_array *tr,
253 unsigned long ip, unsigned long parent_ip,
254 unsigned long flags, int pc)
255{
256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
259 trace_function(tr, ip, parent_ip, flags, pc);
260}
261#else
262#define __trace_function trace_function
263
264static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
265{
266 return -EINVAL;
267}
268
269static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
270{
271 return -1;
272}
273
274static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
275{
276 return TRACE_TYPE_UNHANDLED;
277}
278
279static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
280static void wakeup_print_header(struct seq_file *s) { }
281static void wakeup_trace_open(struct trace_iterator *iter) { }
282static void wakeup_trace_close(struct trace_iterator *iter) { }
283#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
284
85/* 285/*
86 * Should this new latency be reported/recorded? 286 * Should this new latency be reported/recorded?
87 */ 287 */
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
152 /* The task we are waiting for is waking up */ 352 /* The task we are waiting for is waking up */
153 data = wakeup_trace->data[wakeup_cpu]; 353 data = wakeup_trace->data[wakeup_cpu];
154 354
155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 355 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 356 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
157 357
158 T0 = data->preempt_timestamp; 358 T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
252 * is not called by an assembly function (where as schedule is) 452 * is not called by an assembly function (where as schedule is)
253 * it should be safe to use it here. 453 * it should be safe to use it here.
254 */ 454 */
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 455 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 456
257out_locked: 457out_locked:
258 arch_spin_unlock(&wakeup_lock); 458 arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
303 */ 503 */
304 smp_wmb(); 504 smp_wmb();
305 505
306 register_ftrace_function(&trace_ops); 506 if (start_func_tracer(is_graph()))
307 507 printk(KERN_ERR "failed to start wakeup tracer\n");
308 if (tracing_is_enabled())
309 tracer_enabled = 1;
310 else
311 tracer_enabled = 0;
312 508
313 return; 509 return;
314fail_deprobe_wake_new: 510fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
320static void stop_wakeup_tracer(struct trace_array *tr) 516static void stop_wakeup_tracer(struct trace_array *tr)
321{ 517{
322 tracer_enabled = 0; 518 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 519 stop_func_tracer(is_graph());
324 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 520 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 521 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
379 .start = wakeup_tracer_start, 575 .start = wakeup_tracer_start,
380 .stop = wakeup_tracer_stop, 576 .stop = wakeup_tracer_stop,
381 .print_max = 1, 577 .print_max = 1,
578 .print_header = wakeup_print_header,
579 .print_line = wakeup_print_line,
580 .flags = &tracer_flags,
581 .set_flag = wakeup_set_flag,
382#ifdef CONFIG_FTRACE_SELFTEST 582#ifdef CONFIG_FTRACE_SELFTEST
383 .selftest = trace_selftest_startup_wakeup, 583 .selftest = trace_selftest_startup_wakeup,
384#endif 584#endif
585 .open = wakeup_trace_open,
586 .close = wakeup_trace_close,
385 .use_max_tr = 1, 587 .use_max_tr = 1,
386}; 588};
387 589
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 596 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 597 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 598 .print_max = 1,
599 .print_header = wakeup_print_header,
600 .print_line = wakeup_print_line,
601 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 603#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 604 .selftest = trace_selftest_startup_wakeup,
399#endif 605#endif
606 .open = wakeup_trace_open,
607 .close = wakeup_trace_close,
400 .use_max_tr = 1, 608 .use_max_tr = 1,
401}; 609};
402 610
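wakeup_set_flag() only tears down and restarts the tracer when the display-graph bit actually changes; the is_graph() ^ set test filters out writes that set the flag to its current value. A standalone userspace sketch of that change detection:

#include <stdio.h>

static unsigned int flags;
#define TRACE_DISPLAY_GRAPH	1
#define is_graph()		(flags & TRACE_DISPLAY_GRAPH)

static int set_graph(int set)
{
	if (!(is_graph() ^ set))
		return 0;	/* no change: nothing to stop or restart */
	flags = set ? TRACE_DISPLAY_GRAPH : 0;
	printf("restarting tracer, graph=%d\n", set);
	return 1;
}

int main(void)
{
	set_graph(1);	/* restarts: off -> on */
	set_graph(1);	/* no-op: already on */
	set_graph(0);	/* restarts: on -> off */
	return 0;
}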
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..e95ee7f31d43 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint __start___tracepoints[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (!elem->state && active) {
268 jump_label_enable(&elem->state);
269 elem->state = active;
270 } else if (elem->state && !active) {
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
267} 274}
268 275
269/* 276/*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
277 if (elem->unregfunc && elem->state) 284 if (elem->unregfunc && elem->state)
278 elem->unregfunc(); 285 elem->unregfunc();
279 286
280 elem->state = 0; 287 if (elem->state) {
288 jump_label_disable(&elem->state);
289 elem->state = 0;
290 }
281 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
282} 292}
283 293
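set_tracepoint() and disable_tracepoint() now patch the tracepoint's static branch with jump_label_enable()/jump_label_disable(), but only on real off-to-on or on-to-off transitions. A rough sketch of that pairing, assuming (as the hunk above does) that the jump label helpers take a pointer to the int state word; the my_* names are invented:

#include <linux/jump_label.h>

/* Mirrors set_tracepoint() above: patch the static branch only on a real
 * 0 <-> active transition, never when the state is merely rewritten. */
static int my_state;

static void my_set_active(int active)
{
	if (!my_state && active)
		jump_label_enable(&my_state);	/* off -> on */
	else if (my_state && !active)
		jump_label_disable(&my_state);	/* on -> off */
	my_state = active;
}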
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 2feb2870d3a1..bafba687a6d8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog; 46static int __initdata no_watchdog;
48 47
49 48
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
187 return 0; 186 return 0;
188} 187}
189 188
190static int
191watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
192{
193 did_panic = 1;
194
195 return NOTIFY_DONE;
196}
197
198static struct notifier_block panic_block = {
199 .notifier_call = watchdog_panic,
200};
201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 189#ifdef CONFIG_HARDLOCKUP_DETECTOR
203static struct perf_event_attr wd_hw_attr = { 190static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 191 .type = PERF_TYPE_HARDWARE,
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
371 /* Try to register using hardware perf events */ 358 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 359 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(); 360 wd_attr->sample_period = hw_nmi_get_sample_period();
374 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); 361 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
375 if (!IS_ERR(event)) { 362 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 363 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 364 goto out_save;
378 } 365 }
379 366
380 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
381 return -1; 368 return PTR_ERR(event);
382 369
383 /* success path */ 370 /* success path */
384out_save: 371out_save:
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
422static int watchdog_enable(int cpu) 409static int watchdog_enable(int cpu)
423{ 410{
424 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err;
425 413
426 /* enable the perf event */ 414 /* enable the perf event */
427 if (watchdog_nmi_enable(cpu) != 0) 415 err = watchdog_nmi_enable(cpu);
428 return -1; 416 if (err)
417 return err;
429 418
430 /* create the watchdog thread */ 419 /* create the watchdog thread */
431 if (!p) { 420 if (!p) {
432 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
433 if (IS_ERR(p)) { 422 if (IS_ERR(p)) {
434 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
435 return -1; 424 return PTR_ERR(p);
436 } 425 }
437 kthread_bind(p, cpu); 426 kthread_bind(p, cpu);
438 per_cpu(watchdog_touch_ts, cpu) = 0; 427 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
484{ 473{
485 int cpu; 474 int cpu;
486 475
476 if (no_watchdog)
477 return;
478
487 for_each_online_cpu(cpu) 479 for_each_online_cpu(cpu)
488 watchdog_disable(cpu); 480 watchdog_disable(cpu);
489 481
@@ -526,17 +518,16 @@ static int __cpuinit
526cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 518cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
527{ 519{
528 int hotcpu = (unsigned long)hcpu; 520 int hotcpu = (unsigned long)hcpu;
521 int err = 0;
529 522
530 switch (action) { 523 switch (action) {
531 case CPU_UP_PREPARE: 524 case CPU_UP_PREPARE:
532 case CPU_UP_PREPARE_FROZEN: 525 case CPU_UP_PREPARE_FROZEN:
533 if (watchdog_prepare_cpu(hotcpu)) 526 err = watchdog_prepare_cpu(hotcpu);
534 return NOTIFY_BAD;
535 break; 527 break;
536 case CPU_ONLINE: 528 case CPU_ONLINE:
537 case CPU_ONLINE_FROZEN: 529 case CPU_ONLINE_FROZEN:
538 if (watchdog_enable(hotcpu)) 530 err = watchdog_enable(hotcpu);
539 return NOTIFY_BAD;
540 break; 531 break;
541#ifdef CONFIG_HOTPLUG_CPU 532#ifdef CONFIG_HOTPLUG_CPU
542 case CPU_UP_CANCELED: 533 case CPU_UP_CANCELED:
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
549 break; 540 break;
550#endif /* CONFIG_HOTPLUG_CPU */ 541#endif /* CONFIG_HOTPLUG_CPU */
551 } 542 }
552 return NOTIFY_OK; 543 return notifier_from_errno(err);
553} 544}
554 545
555static struct notifier_block __cpuinitdata cpu_nfb = { 546static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void)
565 return 0; 556 return 0;
566 557
567 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
568 WARN_ON(err == NOTIFY_BAD); 559 WARN_ON(notifier_to_errno(err));
569 560
570 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
571 register_cpu_notifier(&cpu_nfb); 562 register_cpu_notifier(&cpu_nfb);
572 563
573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
574
575 return 0; 564 return 0;
576} 565}
577early_initcall(spawn_watchdog_task); 566early_initcall(spawn_watchdog_task);
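The watchdog paths stop collapsing every failure to -1: watchdog_nmi_enable() and watchdog_enable() propagate PTR_ERR() values, and the hotplug callback wraps them with notifier_from_errno() so the caller can recover the original errno. A small sketch of the PTR_ERR propagation idiom; the create_thing name and the -EBUSY value are illustrative only:

#include <linux/err.h>
#include <linux/errno.h>

/* Return the real errno from an ERR_PTR-returning create call instead of
 * collapsing every failure to a bare -1. */
static int create_thing(void **out)
{
	void *p = ERR_PTR(-EBUSY);	/* stands in for a failed create call */

	if (IS_ERR(p))
		return PTR_ERR(p);	/* caller sees -EBUSY, not -1 */

	*out = p;
	return 0;
}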