Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                       |    2
-rw-r--r--  kernel/exit.c                         |    4
-rw-r--r--  kernel/hw_breakpoint.c                |   67
-rw-r--r--  kernel/jump_label.c                   |  429
-rw-r--r--  kernel/kprobes.c                      |   26
-rw-r--r--  kernel/module.c                       |    6
-rw-r--r--  kernel/perf_event.c                   | 2355
-rw-r--r--  kernel/sched.c                        |    2
-rw-r--r--  kernel/trace/ftrace.c                 |  123
-rw-r--r--  kernel/trace/ring_buffer.c            |   21
-rw-r--r--  kernel/trace/trace_event_perf.c       |   28
-rw-r--r--  kernel/trace/trace_events.c           |   55
-rw-r--r--  kernel/trace/trace_functions_graph.c  |  126
-rw-r--r--  kernel/trace/trace_workqueue.c        |   10
-rw-r--r--  kernel/tracepoint.c                   |   14
-rw-r--r--  kernel/watchdog.c                     |   41
16 files changed, 2206 insertions, 1103 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..d52b473c99a1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o 14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 15obj-y += groups.o
16 16
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db28..e2bdf37f9fde 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
149{ 149{
150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
151 151
152#ifdef CONFIG_PERF_EVENTS 152 perf_event_delayed_put(tsk);
153 WARN_ON_ONCE(tsk->perf_event_ctxp);
154#endif
155 trace_sched_process_free(tsk); 153 trace_sched_process_free(tsk);
156 put_task_struct(tsk); 154 put_task_struct(tsk);
157} 155}
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c7c2aed9e2dc..3b714e839c10 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 struct task_struct *tsk) 434 struct task_struct *tsk)
435{ 435{
436 return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
437 triggered);
438} 437}
439EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
440 439
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
516 get_online_cpus(); 515 get_online_cpus();
517 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
518 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
519 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
520 519
521 *pevent = bp; 520 *pevent = bp;
522 521
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
566 .priority = 0x7fffffff 565 .priority = 0x7fffffff
567}; 566};
568 567
568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
569static int __init init_hw_breakpoint(void) 623static int __init init_hw_breakpoint(void)
570{ 624{
571 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
587 641
588 constraints_initialized = 1; 642 constraints_initialized = 1;
589 643
644 perf_pmu_register(&perf_breakpoint);
645
590 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
591 647
592 err_alloc: 648 err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
602core_initcall(init_hw_breakpoint); 658core_initcall(init_hw_breakpoint);
603 659
604 660
605struct pmu perf_ops_bp = {
606 .enable = arch_install_hw_breakpoint,
607 .disable = arch_uninstall_hw_breakpoint,
608 .read = hw_breakpoint_pmu_read,
609};
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..7be868bf25c6
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h>
14#include <linux/sort.h>
15#include <linux/err.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex);
25
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42static int jump_label_cmp(const void *a, const void *b)
43{
44 const struct jump_entry *jea = a;
45 const struct jump_entry *jeb = b;
46
47 if (jea->key < jeb->key)
48 return -1;
49
50 if (jea->key > jeb->key)
51 return 1;
52
53 return 0;
54}
55
56static void
57sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
58{
59 unsigned long size;
60
61 size = (((unsigned long)stop - (unsigned long)start)
62 / sizeof(struct jump_entry));
63 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
64}
65
66static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
67{
68 struct hlist_head *head;
69 struct hlist_node *node;
70 struct jump_label_entry *e;
71 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
72
73 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
74 hlist_for_each_entry(e, node, head, hlist) {
75 if (key == e->key)
76 return e;
77 }
78 return NULL;
79}
80
81static struct jump_label_entry *
82add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
83{
84 struct hlist_head *head;
85 struct jump_label_entry *e;
86 u32 hash;
87
88 e = get_jump_label_entry(key);
89 if (e)
90 return ERR_PTR(-EEXIST);
91
92 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
93 if (!e)
94 return ERR_PTR(-ENOMEM);
95
96 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
97 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
98 e->key = key;
99 e->table = table;
100 e->nr_entries = nr_entries;
101 INIT_HLIST_HEAD(&(e->modules));
102 hlist_add_head(&e->hlist, head);
103 return e;
104}
105
106static int
107build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
108{
109 struct jump_entry *iter, *iter_begin;
110 struct jump_label_entry *entry;
111 int count;
112
113 sort_jump_label_entries(start, stop);
114 iter = start;
115 while (iter < stop) {
116 entry = get_jump_label_entry(iter->key);
117 if (!entry) {
118 iter_begin = iter;
119 count = 0;
120 while ((iter < stop) &&
121 (iter->key == iter_begin->key)) {
122 iter++;
123 count++;
124 }
125 entry = add_jump_label_entry(iter_begin->key,
126 count, iter_begin);
127 if (IS_ERR(entry))
128 return PTR_ERR(entry);
129 } else {
130 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
131 return -1;
132 }
133 }
134 return 0;
135}
136
137/***
138 * jump_label_update - update jump label text
139 * @key - key value associated with a jump label
140 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
141 *
142 * Will enable/disable the jump for jump label @key, depending on the
143 * value of @type.
144 *
145 */
146
147void jump_label_update(unsigned long key, enum jump_label_type type)
148{
149 struct jump_entry *iter;
150 struct jump_label_entry *entry;
151 struct hlist_node *module_node;
152 struct jump_label_module_entry *e_module;
153 int count;
154
155 mutex_lock(&jump_label_mutex);
156 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) {
158 count = entry->nr_entries;
159 iter = entry->table;
160 while (count--) {
161 if (kernel_text_address(iter->code))
162 arch_jump_label_transform(iter, type);
163 iter++;
164 }
165   /* enable/disable jump labels in modules */
166 hlist_for_each_entry(e_module, module_node, &(entry->modules),
167 hlist) {
168 count = e_module->nr_entries;
169 iter = e_module->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 }
176 }
177 mutex_unlock(&jump_label_mutex);
178}
179
180static int addr_conflict(struct jump_entry *entry, void *start, void *end)
181{
182 if (entry->code <= (unsigned long)end &&
183 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
184 return 1;
185
186 return 0;
187}
188
189#ifdef CONFIG_MODULES
190
191static int module_conflict(void *start, void *end)
192{
193 struct hlist_head *head;
194 struct hlist_node *node, *node_next, *module_node, *module_node_next;
195 struct jump_label_entry *e;
196 struct jump_label_module_entry *e_module;
197 struct jump_entry *iter;
198 int i, count;
199 int conflict = 0;
200
201 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
202 head = &jump_label_table[i];
203 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
204 hlist_for_each_entry_safe(e_module, module_node,
205 module_node_next,
206 &(e->modules), hlist) {
207 count = e_module->nr_entries;
208 iter = e_module->table;
209 while (count--) {
210 if (addr_conflict(iter, start, end)) {
211 conflict = 1;
212 goto out;
213 }
214 iter++;
215 }
216 }
217 }
218 }
219out:
220 return conflict;
221}
222
223#endif
224
225/***
226 * jump_label_text_reserved - check if addr range is reserved
227 * @start: start text addr
228 * @end: end text addr
229 *
230 * checks if the text addr located between @start and @end
231 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses.
234 *
235 * returns 1 if there is an overlap, 0 otherwise
236 */
237int jump_label_text_reserved(void *start, void *end)
238{
239 struct jump_entry *iter;
240 struct jump_entry *iter_start = __start___jump_table;
241  struct jump_entry *iter_stop = __stop___jump_table;
242 int conflict = 0;
243
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start;
246 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) {
248 conflict = 1;
249 goto out;
250 }
251 iter++;
252 }
253
254 /* now check modules */
255#ifdef CONFIG_MODULES
256 conflict = module_conflict(start, end);
257#endif
258out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict;
261}
262
263static __init int init_jump_label(void)
264{
265 int ret;
266 struct jump_entry *iter_start = __start___jump_table;
267 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter;
269
270 mutex_lock(&jump_label_mutex);
271 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table);
273 iter = iter_start;
274 while (iter < iter_stop) {
275 arch_jump_label_text_poke_early(iter->code);
276 iter++;
277 }
278 mutex_unlock(&jump_label_mutex);
279 return ret;
280}
281early_initcall(init_jump_label);
282
283#ifdef CONFIG_MODULES
284
285static struct jump_label_module_entry *
286add_jump_label_module_entry(struct jump_label_entry *entry,
287 struct jump_entry *iter_begin,
288 int count, struct module *mod)
289{
290 struct jump_label_module_entry *e;
291
292 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
293 if (!e)
294 return ERR_PTR(-ENOMEM);
295 e->mod = mod;
296 e->nr_entries = count;
297 e->table = iter_begin;
298 hlist_add_head(&e->hlist, &entry->modules);
299 return e;
300}
301
302static int add_jump_label_module(struct module *mod)
303{
304 struct jump_entry *iter, *iter_begin;
305 struct jump_label_entry *entry;
306 struct jump_label_module_entry *module_entry;
307 int count;
308
309 /* if the module doesn't have jump label entries, just return */
310 if (!mod->num_jump_entries)
311 return 0;
312
313 sort_jump_label_entries(mod->jump_entries,
314 mod->jump_entries + mod->num_jump_entries);
315 iter = mod->jump_entries;
316 while (iter < mod->jump_entries + mod->num_jump_entries) {
317 entry = get_jump_label_entry(iter->key);
318 iter_begin = iter;
319 count = 0;
320 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
321 (iter->key == iter_begin->key)) {
322 iter++;
323 count++;
324 }
325 if (!entry) {
326 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
327 if (IS_ERR(entry))
328 return PTR_ERR(entry);
329 }
330 module_entry = add_jump_label_module_entry(entry, iter_begin,
331 count, mod);
332 if (IS_ERR(module_entry))
333 return PTR_ERR(module_entry);
334 }
335 return 0;
336}
337
338static void remove_jump_label_module(struct module *mod)
339{
340 struct hlist_head *head;
341 struct hlist_node *node, *node_next, *module_node, *module_node_next;
342 struct jump_label_entry *e;
343 struct jump_label_module_entry *e_module;
344 int i;
345
346 /* if the module doesn't have jump label entries, just return */
347 if (!mod->num_jump_entries)
348 return;
349
350 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
351 head = &jump_label_table[i];
352 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
353 hlist_for_each_entry_safe(e_module, module_node,
354 module_node_next,
355 &(e->modules), hlist) {
356 if (e_module->mod == mod) {
357 hlist_del(&e_module->hlist);
358 kfree(e_module);
359 }
360 }
361 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
362 hlist_del(&e->hlist);
363 kfree(e);
364 }
365 }
366 }
367}
368
369static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data)
372{
373 struct module *mod = data;
374 int ret = 0;
375
376 switch (val) {
377 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex);
379 ret = add_jump_label_module(mod);
380 if (ret)
381 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex);
383 break;
384 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex);
386 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex);
388 break;
389 }
390 return ret;
391}
392
393/***
394 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
395 * @mod: module to patch
396 *
397 * Allow for run-time selection of the optimal nops. Before the module
398 * loads, patch these with arch_get_jump_label_nop(), which is specified by
399 * the arch specific jump label code.
400 */
401void jump_label_apply_nops(struct module *mod)
402{
403 struct jump_entry *iter;
404
405 /* if the module doesn't have jump label entries, just return */
406 if (!mod->num_jump_entries)
407 return;
408
409 iter = mod->jump_entries;
410 while (iter < mod->jump_entries + mod->num_jump_entries) {
411 arch_jump_label_text_poke_early(iter->code);
412 iter++;
413 }
414}
415
416struct notifier_block jump_label_module_nb = {
417 .notifier_call = jump_label_module_notify,
418 .priority = 0,
419};
420
421static __init int init_jump_label_module(void)
422{
423 return register_module_notifier(&jump_label_module_nb);
424}
425early_initcall(init_jump_label_module);
426
427#endif /* CONFIG_MODULES */
428
429#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..ec4210c6501e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p)
399 * Return an optimized kprobe whose optimizing code replaces 400 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 401 * instructions including addr (exclude breakpoint).
401 */ 402 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 403static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 404{
404 int i; 405 int i;
405 struct kprobe *p = NULL; 406 struct kprobe *p = NULL;
@@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 832
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 833void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 834 struct hlist_head **head, unsigned long *flags)
835__acquires(hlist_lock)
834{ 836{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 837 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 838 spinlock_t *hlist_lock;
@@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 844
843static void __kprobes kretprobe_table_lock(unsigned long hash, 845static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 846 unsigned long *flags)
847__acquires(hlist_lock)
845{ 848{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 849 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 850 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 852
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 853void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 854 unsigned long *flags)
855__releases(hlist_lock)
852{ 856{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 857 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 858 spinlock_t *hlist_lock;
@@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 861 spin_unlock_irqrestore(hlist_lock, *flags);
858} 862}
859 863
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 864static void __kprobes kretprobe_table_unlock(unsigned long hash,
865 unsigned long *flags)
866__releases(hlist_lock)
861{ 867{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 868 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 869 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1141 preempt_disable(); 1147 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1148 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1149 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1150 ftrace_text_reserved(p->addr, p->addr) ||
1151 jump_label_text_reserved(p->addr, p->addr)) {
1145 preempt_enable(); 1152 preempt_enable();
1146 return -EINVAL; 1153 return -EINVAL;
1147 } 1154 }
@@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1346 if (num <= 0)
1340 return -EINVAL; 1347 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1348 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1349 unsigned long addr, offset;
1343 jp = jps[i]; 1350 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1351 addr = arch_deref_entry_point(jp->entry);
1345 1352
1346 if (!kernel_text_address(addr)) 1353 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1354 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1355 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1356 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1357 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1358 ret = register_kprobe(&jp->kp);
1353 } 1359 } else
1360 ret = -EINVAL;
1361
1354 if (ret < 0) { 1362 if (ret < 0) {
1355 if (i > 0) 1363 if (i > 0)
1356 unregister_jprobes(jps, i); 1364 unregister_jprobes(jps, i);
diff --git a/kernel/module.c b/kernel/module.c
index d0b5f8db11b4..eba134157ef6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 61#include <trace/events/module.h>
@@ -2308,6 +2309,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2308 sizeof(*mod->tracepoints), 2309 sizeof(*mod->tracepoints),
2309 &mod->num_tracepoints); 2310 &mod->num_tracepoints);
2310#endif 2311#endif
2312#ifdef HAVE_JUMP_LABEL
2313 mod->jump_entries = section_objs(info, "__jump_table",
2314 sizeof(*mod->jump_entries),
2315 &mod->num_jump_entries);
2316#endif
2311#ifdef CONFIG_EVENT_TRACING 2317#ifdef CONFIG_EVENT_TRACING
2312 mod->trace_events = section_objs(info, "_ftrace_events", 2318 mod->trace_events = section_objs(info, "_ftrace_events",
2313 sizeof(*mod->trace_events), 2319 sizeof(*mod->trace_events),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index db5b56064687..64507eaa2d9e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
35 34
36#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
37 36
38/*
39 * Each CPU has a list of per CPU events:
40 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
42
43int perf_max_events __read_mostly = 1;
44static int perf_reserved_percpu __read_mostly;
45static int perf_overcommit __read_mostly = 1;
46
47static atomic_t nr_events __read_mostly; 37static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 38static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 39static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 40static atomic_t nr_task_events __read_mostly;
51 41
42static LIST_HEAD(pmus);
43static DEFINE_MUTEX(pmus_lock);
44static struct srcu_struct pmus_srcu;
45
52/* 46/*
53 * perf event paranoia level: 47 * perf event paranoia level:
54 * -1 - not paranoid at all 48 * -1 - not paranoid at all
@@ -67,36 +61,38 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
67 61
68static atomic64_t perf_event_id; 62static atomic64_t perf_event_id;
69 63
70/* 64void __weak perf_event_print_debug(void) { }
71 * Lock for (sysadmin-configurable) event reservations:
72 */
73static DEFINE_SPINLOCK(perf_resource_lock);
74 65
75/* 66void perf_pmu_disable(struct pmu *pmu)
76 * Architecture provided APIs - weak aliases:
77 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
79{ 67{
80 return NULL; 68 int *count = this_cpu_ptr(pmu->pmu_disable_count);
69 if (!(*count)++)
70 pmu->pmu_disable(pmu);
81} 71}
82 72
83void __weak hw_perf_disable(void) { barrier(); } 73void perf_pmu_enable(struct pmu *pmu)
84void __weak hw_perf_enable(void) { barrier(); }
85
86void __weak perf_event_print_debug(void) { }
87
88static DEFINE_PER_CPU(int, perf_disable_count);
89
90void perf_disable(void)
91{ 74{
92 if (!__get_cpu_var(perf_disable_count)++) 75 int *count = this_cpu_ptr(pmu->pmu_disable_count);
93 hw_perf_disable(); 76 if (!--(*count))
77 pmu->pmu_enable(pmu);
94} 78}
95 79
96void perf_enable(void) 80static DEFINE_PER_CPU(struct list_head, rotation_list);
81
82/*
83 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
84 * because they're strictly cpu affine and rotate_start is called with IRQs
85 * disabled, while rotate_context is called from IRQ context.
86 */
87static void perf_pmu_rotate_start(struct pmu *pmu)
97{ 88{
98 if (!--__get_cpu_var(perf_disable_count)) 89 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99 hw_perf_enable(); 90 struct list_head *head = &__get_cpu_var(rotation_list);
91
92 WARN_ON(!irqs_disabled());
93
94 if (list_empty(&cpuctx->rotation_list))
95 list_add(&cpuctx->rotation_list, head);
100} 96}
101 97
102static void get_ctx(struct perf_event_context *ctx) 98static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +147,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 147 * the context could get moved to another task.
152 */ 148 */
153static struct perf_event_context * 149static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 150perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 151{
156 struct perf_event_context *ctx; 152 struct perf_event_context *ctx;
157 153
158 rcu_read_lock(); 154 rcu_read_lock();
159 retry: 155retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 156 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 157 if (ctx) {
162 /* 158 /*
163 * If this context is a clone of another, it might 159 * If this context is a clone of another, it might
@@ -170,7 +166,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 166 * can't get swapped on us any more.
171 */ 167 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 168 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 169 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 170 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 171 goto retry;
176 } 172 }
@@ -189,12 +185,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 185 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 186 * reference count so that the context can't get freed.
191 */ 187 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 188static struct perf_event_context *
189perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 190{
194 struct perf_event_context *ctx; 191 struct perf_event_context *ctx;
195 unsigned long flags; 192 unsigned long flags;
196 193
197 ctx = perf_lock_task_context(task, &flags); 194 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 195 if (ctx) {
199 ++ctx->pin_count; 196 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 197 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +299,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
302 } 299 }
303 300
304 list_add_rcu(&event->event_entry, &ctx->event_list); 301 list_add_rcu(&event->event_entry, &ctx->event_list);
302 if (!ctx->nr_events)
303 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 304 ctx->nr_events++;
306 if (event->attr.inherit_stat) 305 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 306 ctx->nr_stat++;
@@ -436,7 +435,7 @@ event_sched_out(struct perf_event *event,
436 event->state = PERF_EVENT_STATE_OFF; 435 event->state = PERF_EVENT_STATE_OFF;
437 } 436 }
438 event->tstamp_stopped = ctx->time; 437 event->tstamp_stopped = ctx->time;
439 event->pmu->disable(event); 438 event->pmu->del(event, 0);
440 event->oncpu = -1; 439 event->oncpu = -1;
441 440
442 if (!is_software_event(event)) 441 if (!is_software_event(event))
@@ -466,6 +465,12 @@ group_sched_out(struct perf_event *group_event,
466 cpuctx->exclusive = 0; 465 cpuctx->exclusive = 0;
467} 466}
468 467
468static inline struct perf_cpu_context *
469__get_cpu_context(struct perf_event_context *ctx)
470{
471 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
472}
473
469/* 474/*
470 * Cross CPU call to remove a performance event 475 * Cross CPU call to remove a performance event
471 * 476 *
@@ -474,9 +479,9 @@ group_sched_out(struct perf_event *group_event,
474 */ 479 */
475static void __perf_event_remove_from_context(void *info) 480static void __perf_event_remove_from_context(void *info)
476{ 481{
477 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
478 struct perf_event *event = info; 482 struct perf_event *event = info;
479 struct perf_event_context *ctx = event->ctx; 483 struct perf_event_context *ctx = event->ctx;
484 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
480 485
481 /* 486 /*
482 * If this is a task context, we need to check whether it is 487 * If this is a task context, we need to check whether it is
@@ -487,27 +492,11 @@ static void __perf_event_remove_from_context(void *info)
487 return; 492 return;
488 493
489 raw_spin_lock(&ctx->lock); 494 raw_spin_lock(&ctx->lock);
490 /*
491 * Protect the list operation against NMI by disabling the
492 * events on a global level.
493 */
494 perf_disable();
495 495
496 event_sched_out(event, cpuctx, ctx); 496 event_sched_out(event, cpuctx, ctx);
497 497
498 list_del_event(event, ctx); 498 list_del_event(event, ctx);
499 499
500 if (!ctx->task) {
501 /*
502 * Allow more per task events with respect to the
503 * reservation:
504 */
505 cpuctx->max_pertask =
506 min(perf_max_events - ctx->nr_events,
507 perf_max_events - perf_reserved_percpu);
508 }
509
510 perf_enable();
511 raw_spin_unlock(&ctx->lock); 500 raw_spin_unlock(&ctx->lock);
512} 501}
513 502
@@ -572,8 +561,8 @@ retry:
572static void __perf_event_disable(void *info) 561static void __perf_event_disable(void *info)
573{ 562{
574 struct perf_event *event = info; 563 struct perf_event *event = info;
575 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
576 struct perf_event_context *ctx = event->ctx; 564 struct perf_event_context *ctx = event->ctx;
565 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
577 566
578 /* 567 /*
579 * If this is a per-task event, need to check whether this 568 * If this is a per-task event, need to check whether this
@@ -628,7 +617,7 @@ void perf_event_disable(struct perf_event *event)
628 return; 617 return;
629 } 618 }
630 619
631 retry: 620retry:
632 task_oncpu_function_call(task, __perf_event_disable, event); 621 task_oncpu_function_call(task, __perf_event_disable, event);
633 622
634 raw_spin_lock_irq(&ctx->lock); 623 raw_spin_lock_irq(&ctx->lock);
@@ -667,7 +656,7 @@ event_sched_in(struct perf_event *event,
667 */ 656 */
668 smp_wmb(); 657 smp_wmb();
669 658
670 if (event->pmu->enable(event)) { 659 if (event->pmu->add(event, PERF_EF_START)) {
671 event->state = PERF_EVENT_STATE_INACTIVE; 660 event->state = PERF_EVENT_STATE_INACTIVE;
672 event->oncpu = -1; 661 event->oncpu = -1;
673 return -EAGAIN; 662 return -EAGAIN;
@@ -691,22 +680,15 @@ group_sched_in(struct perf_event *group_event,
691 struct perf_event_context *ctx) 680 struct perf_event_context *ctx)
692{ 681{
693 struct perf_event *event, *partial_group = NULL; 682 struct perf_event *event, *partial_group = NULL;
694 const struct pmu *pmu = group_event->pmu; 683 struct pmu *pmu = group_event->pmu;
695 bool txn = false;
696 684
697 if (group_event->state == PERF_EVENT_STATE_OFF) 685 if (group_event->state == PERF_EVENT_STATE_OFF)
698 return 0; 686 return 0;
699 687
700 /* Check if group transaction availabe */ 688 pmu->start_txn(pmu);
701 if (pmu->start_txn)
702 txn = true;
703
704 if (txn)
705 pmu->start_txn(pmu);
706 689
707 if (event_sched_in(group_event, cpuctx, ctx)) { 690 if (event_sched_in(group_event, cpuctx, ctx)) {
708 if (txn) 691 pmu->cancel_txn(pmu);
709 pmu->cancel_txn(pmu);
710 return -EAGAIN; 692 return -EAGAIN;
711 } 693 }
712 694
@@ -720,7 +702,7 @@ group_sched_in(struct perf_event *group_event,
720 } 702 }
721 } 703 }
722 704
723 if (!txn || !pmu->commit_txn(pmu)) 705 if (!pmu->commit_txn(pmu))
724 return 0; 706 return 0;
725 707
726group_error: 708group_error:
@@ -735,8 +717,7 @@ group_error:
735 } 717 }
736 event_sched_out(group_event, cpuctx, ctx); 718 event_sched_out(group_event, cpuctx, ctx);
737 719
738 if (txn) 720 pmu->cancel_txn(pmu);
739 pmu->cancel_txn(pmu);
740 721
741 return -EAGAIN; 722 return -EAGAIN;
742} 723}
@@ -789,10 +770,10 @@ static void add_event_to_ctx(struct perf_event *event,
789 */ 770 */
790static void __perf_install_in_context(void *info) 771static void __perf_install_in_context(void *info)
791{ 772{
792 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
793 struct perf_event *event = info; 773 struct perf_event *event = info;
794 struct perf_event_context *ctx = event->ctx; 774 struct perf_event_context *ctx = event->ctx;
795 struct perf_event *leader = event->group_leader; 775 struct perf_event *leader = event->group_leader;
776 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
796 int err; 777 int err;
797 778
798 /* 779 /*
@@ -812,12 +793,6 @@ static void __perf_install_in_context(void *info)
812 ctx->is_active = 1; 793 ctx->is_active = 1;
813 update_context_time(ctx); 794 update_context_time(ctx);
814 795
815 /*
816 * Protect the list operation against NMI by disabling the
817 * events on a global level. NOP for non NMI based events.
818 */
819 perf_disable();
820
821 add_event_to_ctx(event, ctx); 796 add_event_to_ctx(event, ctx);
822 797
823 if (event->cpu != -1 && event->cpu != smp_processor_id()) 798 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -855,12 +830,7 @@ static void __perf_install_in_context(void *info)
855 } 830 }
856 } 831 }
857 832
858 if (!err && !ctx->task && cpuctx->max_pertask) 833unlock:
859 cpuctx->max_pertask--;
860
861 unlock:
862 perf_enable();
863
864 raw_spin_unlock(&ctx->lock); 834 raw_spin_unlock(&ctx->lock);
865} 835}
866 836
@@ -883,6 +853,8 @@ perf_install_in_context(struct perf_event_context *ctx,
883{ 853{
884 struct task_struct *task = ctx->task; 854 struct task_struct *task = ctx->task;
885 855
856 event->ctx = ctx;
857
886 if (!task) { 858 if (!task) {
887 /* 859 /*
888 * Per cpu events are installed via an smp call and 860 * Per cpu events are installed via an smp call and
@@ -931,10 +903,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
931 903
932 event->state = PERF_EVENT_STATE_INACTIVE; 904 event->state = PERF_EVENT_STATE_INACTIVE;
933 event->tstamp_enabled = ctx->time - event->total_time_enabled; 905 event->tstamp_enabled = ctx->time - event->total_time_enabled;
934 list_for_each_entry(sub, &event->sibling_list, group_entry) 906 list_for_each_entry(sub, &event->sibling_list, group_entry) {
935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 907 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
936 sub->tstamp_enabled = 908 sub->tstamp_enabled =
937 ctx->time - sub->total_time_enabled; 909 ctx->time - sub->total_time_enabled;
910 }
911 }
938} 912}
939 913
940/* 914/*
@@ -943,9 +917,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
943static void __perf_event_enable(void *info) 917static void __perf_event_enable(void *info)
944{ 918{
945 struct perf_event *event = info; 919 struct perf_event *event = info;
946 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
947 struct perf_event_context *ctx = event->ctx; 920 struct perf_event_context *ctx = event->ctx;
948 struct perf_event *leader = event->group_leader; 921 struct perf_event *leader = event->group_leader;
922 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
949 int err; 923 int err;
950 924
951 /* 925 /*
@@ -979,12 +953,10 @@ static void __perf_event_enable(void *info)
979 if (!group_can_go_on(event, cpuctx, 1)) { 953 if (!group_can_go_on(event, cpuctx, 1)) {
980 err = -EEXIST; 954 err = -EEXIST;
981 } else { 955 } else {
982 perf_disable();
983 if (event == leader) 956 if (event == leader)
984 err = group_sched_in(event, cpuctx, ctx); 957 err = group_sched_in(event, cpuctx, ctx);
985 else 958 else
986 err = event_sched_in(event, cpuctx, ctx); 959 err = event_sched_in(event, cpuctx, ctx);
987 perf_enable();
988 } 960 }
989 961
990 if (err) { 962 if (err) {
@@ -1000,7 +972,7 @@ static void __perf_event_enable(void *info)
1000 } 972 }
1001 } 973 }
1002 974
1003 unlock: 975unlock:
1004 raw_spin_unlock(&ctx->lock); 976 raw_spin_unlock(&ctx->lock);
1005} 977}
1006 978
@@ -1041,7 +1013,7 @@ void perf_event_enable(struct perf_event *event)
1041 if (event->state == PERF_EVENT_STATE_ERROR) 1013 if (event->state == PERF_EVENT_STATE_ERROR)
1042 event->state = PERF_EVENT_STATE_OFF; 1014 event->state = PERF_EVENT_STATE_OFF;
1043 1015
1044 retry: 1016retry:
1045 raw_spin_unlock_irq(&ctx->lock); 1017 raw_spin_unlock_irq(&ctx->lock);
1046 task_oncpu_function_call(task, __perf_event_enable, event); 1018 task_oncpu_function_call(task, __perf_event_enable, event);
1047 1019
@@ -1061,7 +1033,7 @@ void perf_event_enable(struct perf_event *event)
1061 if (event->state == PERF_EVENT_STATE_OFF) 1033 if (event->state == PERF_EVENT_STATE_OFF)
1062 __perf_event_mark_enabled(event, ctx); 1034 __perf_event_mark_enabled(event, ctx);
1063 1035
1064 out: 1036out:
1065 raw_spin_unlock_irq(&ctx->lock); 1037 raw_spin_unlock_irq(&ctx->lock);
1066} 1038}
1067 1039
@@ -1092,26 +1064,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1092 struct perf_event *event; 1064 struct perf_event *event;
1093 1065
1094 raw_spin_lock(&ctx->lock); 1066 raw_spin_lock(&ctx->lock);
1067 perf_pmu_disable(ctx->pmu);
1095 ctx->is_active = 0; 1068 ctx->is_active = 0;
1096 if (likely(!ctx->nr_events)) 1069 if (likely(!ctx->nr_events))
1097 goto out; 1070 goto out;
1098 update_context_time(ctx); 1071 update_context_time(ctx);
1099 1072
1100 perf_disable();
1101 if (!ctx->nr_active) 1073 if (!ctx->nr_active)
1102 goto out_enable; 1074 goto out;
1103 1075
1104 if (event_type & EVENT_PINNED) 1076 if (event_type & EVENT_PINNED) {
1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1077 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1106 group_sched_out(event, cpuctx, ctx); 1078 group_sched_out(event, cpuctx, ctx);
1079 }
1107 1080
1108 if (event_type & EVENT_FLEXIBLE) 1081 if (event_type & EVENT_FLEXIBLE) {
1109 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1082 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1110 group_sched_out(event, cpuctx, ctx); 1083 group_sched_out(event, cpuctx, ctx);
1111 1084 }
1112 out_enable: 1085out:
1113 perf_enable(); 1086 perf_pmu_enable(ctx->pmu);
1114 out:
1115 raw_spin_unlock(&ctx->lock); 1087 raw_spin_unlock(&ctx->lock);
1116} 1088}
1117 1089
@@ -1209,34 +1181,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1209 } 1181 }
1210} 1182}
1211 1183
1212/* 1184void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1213 * Called from scheduler to remove the events of the current task, 1185 struct task_struct *next)
1214 * with interrupts disabled.
1215 *
1216 * We stop each event and update the event value in event->count.
1217 *
1218 * This does not protect us against NMI, but disable()
1219 * sets the disabled bit in the control field of event _before_
1220 * accessing the event control register. If a NMI hits, then it will
1221 * not restart the event.
1222 */
1223void perf_event_task_sched_out(struct task_struct *task,
1224 struct task_struct *next)
1225{ 1186{
1226 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1187 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1227 struct perf_event_context *ctx = task->perf_event_ctxp;
1228 struct perf_event_context *next_ctx; 1188 struct perf_event_context *next_ctx;
1229 struct perf_event_context *parent; 1189 struct perf_event_context *parent;
1190 struct perf_cpu_context *cpuctx;
1230 int do_switch = 1; 1191 int do_switch = 1;
1231 1192
1232 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1193 if (likely(!ctx))
1194 return;
1233 1195
1234 if (likely(!ctx || !cpuctx->task_ctx)) 1196 cpuctx = __get_cpu_context(ctx);
1197 if (!cpuctx->task_ctx)
1235 return; 1198 return;
1236 1199
1237 rcu_read_lock(); 1200 rcu_read_lock();
1238 parent = rcu_dereference(ctx->parent_ctx); 1201 parent = rcu_dereference(ctx->parent_ctx);
1239 next_ctx = next->perf_event_ctxp; 1202 next_ctx = next->perf_event_ctxp[ctxn];
1240 if (parent && next_ctx && 1203 if (parent && next_ctx &&
1241 rcu_dereference(next_ctx->parent_ctx) == parent) { 1204 rcu_dereference(next_ctx->parent_ctx) == parent) {
1242 /* 1205 /*
@@ -1255,8 +1218,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1255 * XXX do we need a memory barrier of sorts 1218 * XXX do we need a memory barrier of sorts
1256 * wrt to rcu_dereference() of perf_event_ctxp 1219 * wrt to rcu_dereference() of perf_event_ctxp
1257 */ 1220 */
1258 task->perf_event_ctxp = next_ctx; 1221 task->perf_event_ctxp[ctxn] = next_ctx;
1259 next->perf_event_ctxp = ctx; 1222 next->perf_event_ctxp[ctxn] = ctx;
1260 ctx->task = next; 1223 ctx->task = next;
1261 next_ctx->task = task; 1224 next_ctx->task = task;
1262 do_switch = 0; 1225 do_switch = 0;
@@ -1274,10 +1237,35 @@ void perf_event_task_sched_out(struct task_struct *task,
1274 } 1237 }
1275} 1238}
1276 1239
1240#define for_each_task_context_nr(ctxn) \
1241 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1242
1243/*
1244 * Called from scheduler to remove the events of the current task,
1245 * with interrupts disabled.
1246 *
1247 * We stop each event and update the event value in event->count.
1248 *
1249 * This does not protect us against NMI, but disable()
1250 * sets the disabled bit in the control field of event _before_
1251 * accessing the event control register. If a NMI hits, then it will
1252 * not restart the event.
1253 */
1254void perf_event_task_sched_out(struct task_struct *task,
1255 struct task_struct *next)
1256{
1257 int ctxn;
1258
1259 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1260
1261 for_each_task_context_nr(ctxn)
1262 perf_event_context_sched_out(task, ctxn, next);
1263}
1264
1277static void task_ctx_sched_out(struct perf_event_context *ctx, 1265static void task_ctx_sched_out(struct perf_event_context *ctx,
1278 enum event_type_t event_type) 1266 enum event_type_t event_type)
1279{ 1267{
1280 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1268 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1281 1269
1282 if (!cpuctx->task_ctx) 1270 if (!cpuctx->task_ctx)
1283 return; 1271 return;
@@ -1350,9 +1338,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1350 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1338 if (event->cpu != -1 && event->cpu != smp_processor_id())
1351 continue; 1339 continue;
1352 1340
1353 if (group_can_go_on(event, cpuctx, can_add_hw)) 1341 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1354 if (group_sched_in(event, cpuctx, ctx)) 1342 if (group_sched_in(event, cpuctx, ctx))
1355 can_add_hw = 0; 1343 can_add_hw = 0;
1344 }
1356 } 1345 }
1357} 1346}
1358 1347
@@ -1368,8 +1357,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1368 1357
1369 ctx->timestamp = perf_clock(); 1358 ctx->timestamp = perf_clock();
1370 1359
1371 perf_disable();
1372
1373 /* 1360 /*
1374 * First go through the list and put on any pinned groups 1361 * First go through the list and put on any pinned groups
1375 * in order to give them the best chance of going on. 1362 * in order to give them the best chance of going on.
@@ -1381,8 +1368,7 @@ ctx_sched_in(struct perf_event_context *ctx,
1381 if (event_type & EVENT_FLEXIBLE) 1368 if (event_type & EVENT_FLEXIBLE)
1382 ctx_flexible_sched_in(ctx, cpuctx); 1369 ctx_flexible_sched_in(ctx, cpuctx);
1383 1370
1384 perf_enable(); 1371out:
1385 out:
1386 raw_spin_unlock(&ctx->lock); 1372 raw_spin_unlock(&ctx->lock);
1387} 1373}
1388 1374
@@ -1394,43 +1380,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1394 ctx_sched_in(ctx, cpuctx, event_type); 1380 ctx_sched_in(ctx, cpuctx, event_type);
1395} 1381}
1396 1382
1397static void task_ctx_sched_in(struct task_struct *task, 1383static void task_ctx_sched_in(struct perf_event_context *ctx,
1398 enum event_type_t event_type) 1384 enum event_type_t event_type)
1399{ 1385{
1400 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1386 struct perf_cpu_context *cpuctx;
1401 struct perf_event_context *ctx = task->perf_event_ctxp;
1402 1387
1403 if (likely(!ctx)) 1388 cpuctx = __get_cpu_context(ctx);
1404 return;
1405 if (cpuctx->task_ctx == ctx) 1389 if (cpuctx->task_ctx == ctx)
1406 return; 1390 return;
1391
1407 ctx_sched_in(ctx, cpuctx, event_type); 1392 ctx_sched_in(ctx, cpuctx, event_type);
1408 cpuctx->task_ctx = ctx; 1393 cpuctx->task_ctx = ctx;
1409} 1394}
1410/*
1411 * Called from scheduler to add the events of the current task
1412 * with interrupts disabled.
1413 *
1414 * We restore the event value and then enable it.
1415 *
1416 * This does not protect us against NMI, but enable()
1417 * sets the enabled bit in the control field of event _before_
1418 * accessing the event control register. If a NMI hits, then it will
1419 * keep the event running.
1420 */
1421void perf_event_task_sched_in(struct task_struct *task)
1422{
1423 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1424 struct perf_event_context *ctx = task->perf_event_ctxp;
1425 1395
1426 if (likely(!ctx)) 1396void perf_event_context_sched_in(struct perf_event_context *ctx)
1427 return; 1397{
1398 struct perf_cpu_context *cpuctx;
1428 1399
1400 cpuctx = __get_cpu_context(ctx);
1429 if (cpuctx->task_ctx == ctx) 1401 if (cpuctx->task_ctx == ctx)
1430 return; 1402 return;
1431 1403
1432 perf_disable(); 1404 perf_pmu_disable(ctx->pmu);
1433
1434 /* 1405 /*
1435 * We want to keep the following priority order: 1406 * We want to keep the following priority order:
1436 * cpu pinned (that don't need to move), task pinned, 1407 * cpu pinned (that don't need to move), task pinned,
@@ -1444,7 +1415,37 @@ void perf_event_task_sched_in(struct task_struct *task)
1444 1415
1445 cpuctx->task_ctx = ctx; 1416 cpuctx->task_ctx = ctx;
1446 1417
1447 perf_enable(); 1418 /*
1419 * Since these rotations are per-cpu, we need to ensure the
1420 * cpu-context we got scheduled on is actually rotating.
1421 */
1422 perf_pmu_rotate_start(ctx->pmu);
1423 perf_pmu_enable(ctx->pmu);
1424}
1425
1426/*
1427 * Called from scheduler to add the events of the current task
1428 * with interrupts disabled.
1429 *
1430 * We restore the event value and then enable it.
1431 *
1432 * This does not protect us against NMI, but enable()
1433 * sets the enabled bit in the control field of event _before_
1434 * accessing the event control register. If a NMI hits, then it will
1435 * keep the event running.
1436 */
1437void perf_event_task_sched_in(struct task_struct *task)
1438{
1439 struct perf_event_context *ctx;
1440 int ctxn;
1441
1442 for_each_task_context_nr(ctxn) {
1443 ctx = task->perf_event_ctxp[ctxn];
1444 if (likely(!ctx))
1445 continue;
1446
1447 perf_event_context_sched_in(ctx);
1448 }
1448} 1449}
1449 1450
1450#define MAX_INTERRUPTS (~0ULL) 1451#define MAX_INTERRUPTS (~0ULL)
@@ -1524,22 +1525,6 @@ do { \
1524 return div64_u64(dividend, divisor); 1525 return div64_u64(dividend, divisor);
1525} 1526}
1526 1527
1527static void perf_event_stop(struct perf_event *event)
1528{
1529 if (!event->pmu->stop)
1530 return event->pmu->disable(event);
1531
1532 return event->pmu->stop(event);
1533}
1534
1535static int perf_event_start(struct perf_event *event)
1536{
1537 if (!event->pmu->start)
1538 return event->pmu->enable(event);
1539
1540 return event->pmu->start(event);
1541}
1542
1543static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1528static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1544{ 1529{
1545 struct hw_perf_event *hwc = &event->hw; 1530 struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +1544,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1559 hwc->sample_period = sample_period; 1544 hwc->sample_period = sample_period;
1560 1545
1561 if (local64_read(&hwc->period_left) > 8*sample_period) { 1546 if (local64_read(&hwc->period_left) > 8*sample_period) {
1562 perf_disable(); 1547 event->pmu->stop(event, PERF_EF_UPDATE);
1563 perf_event_stop(event);
1564 local64_set(&hwc->period_left, 0); 1548 local64_set(&hwc->period_left, 0);
1565 perf_event_start(event); 1549 event->pmu->start(event, PERF_EF_RELOAD);
1566 perf_enable();
1567 } 1550 }
1568} 1551}
1569 1552
1570static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1553static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1571{ 1554{
1572 struct perf_event *event; 1555 struct perf_event *event;
1573 struct hw_perf_event *hwc; 1556 struct hw_perf_event *hwc;
@@ -1592,23 +1575,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1592 */ 1575 */
1593 if (interrupts == MAX_INTERRUPTS) { 1576 if (interrupts == MAX_INTERRUPTS) {
1594 perf_log_throttle(event, 1); 1577 perf_log_throttle(event, 1);
1595 perf_disable(); 1578 event->pmu->start(event, 0);
1596 event->pmu->unthrottle(event);
1597 perf_enable();
1598 } 1579 }
1599 1580
1600 if (!event->attr.freq || !event->attr.sample_freq) 1581 if (!event->attr.freq || !event->attr.sample_freq)
1601 continue; 1582 continue;
1602 1583
1603 perf_disable();
1604 event->pmu->read(event); 1584 event->pmu->read(event);
1605 now = local64_read(&event->count); 1585 now = local64_read(&event->count);
1606 delta = now - hwc->freq_count_stamp; 1586 delta = now - hwc->freq_count_stamp;
1607 hwc->freq_count_stamp = now; 1587 hwc->freq_count_stamp = now;
1608 1588
1609 if (delta > 0) 1589 if (delta > 0)
1610 perf_adjust_period(event, TICK_NSEC, delta); 1590 perf_adjust_period(event, period, delta);
1611 perf_enable();
1612 } 1591 }
1613 raw_spin_unlock(&ctx->lock); 1592 raw_spin_unlock(&ctx->lock);
1614} 1593}
@@ -1626,32 +1605,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
1626 raw_spin_unlock(&ctx->lock); 1605 raw_spin_unlock(&ctx->lock);
1627} 1606}
1628 1607
1629void perf_event_task_tick(struct task_struct *curr) 1608/*
1609 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1610 * because they're strictly cpu affine and rotate_start is called with IRQs
1611 * disabled, while rotate_context is called from IRQ context.
1612 */
1613static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1630{ 1614{
1631 struct perf_cpu_context *cpuctx; 1615 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1632 struct perf_event_context *ctx; 1616 struct perf_event_context *ctx = NULL;
1633 int rotate = 0; 1617 int rotate = 0, remove = 1;
1634 1618
1635 if (!atomic_read(&nr_events)) 1619 if (cpuctx->ctx.nr_events) {
1636 return; 1620 remove = 0;
1637 1621 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1638 cpuctx = &__get_cpu_var(perf_cpu_context); 1622 rotate = 1;
1639 if (cpuctx->ctx.nr_events && 1623 }
1640 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1641 rotate = 1;
1642 1624
1643 ctx = curr->perf_event_ctxp; 1625 ctx = cpuctx->task_ctx;
1644 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1626 if (ctx && ctx->nr_events) {
1645 rotate = 1; 1627 remove = 0;
1628 if (ctx->nr_events != ctx->nr_active)
1629 rotate = 1;
1630 }
1646 1631
1647 perf_ctx_adjust_freq(&cpuctx->ctx); 1632 perf_pmu_disable(cpuctx->ctx.pmu);
1633 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1648 if (ctx) 1634 if (ctx)
1649 perf_ctx_adjust_freq(ctx); 1635 perf_ctx_adjust_freq(ctx, interval);
1650 1636
1651 if (!rotate) 1637 if (!rotate)
1652 return; 1638 goto done;
1653 1639
1654 perf_disable();
1655 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1640 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1656 if (ctx) 1641 if (ctx)
1657 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1642 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1662,8 +1647,27 @@ void perf_event_task_tick(struct task_struct *curr)
1662 1647
1663 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1648 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1664 if (ctx) 1649 if (ctx)
1665 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1650 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1666 perf_enable(); 1651
1652done:
1653 if (remove)
1654 list_del_init(&cpuctx->rotation_list);
1655
1656 perf_pmu_enable(cpuctx->ctx.pmu);
1657}
1658
1659void perf_event_task_tick(void)
1660{
1661 struct list_head *head = &__get_cpu_var(rotation_list);
1662 struct perf_cpu_context *cpuctx, *tmp;
1663
1664 WARN_ON(!irqs_disabled());
1665
1666 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1667 if (cpuctx->jiffies_interval == 1 ||
1668 !(jiffies % cpuctx->jiffies_interval))
1669 perf_rotate_context(cpuctx);
1670 }
1667} 1671}
1668 1672
1669static int event_enable_on_exec(struct perf_event *event, 1673static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +1689,18 @@ static int event_enable_on_exec(struct perf_event *event,
1685 * Enable all of a task's events that have been marked enable-on-exec. 1689 * Enable all of a task's events that have been marked enable-on-exec.
1686 * This expects task == current. 1690 * This expects task == current.
1687 */ 1691 */
1688static void perf_event_enable_on_exec(struct task_struct *task) 1692static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1689{ 1693{
1690 struct perf_event_context *ctx;
1691 struct perf_event *event; 1694 struct perf_event *event;
1692 unsigned long flags; 1695 unsigned long flags;
1693 int enabled = 0; 1696 int enabled = 0;
1694 int ret; 1697 int ret;
1695 1698
1696 local_irq_save(flags); 1699 local_irq_save(flags);
1697 ctx = task->perf_event_ctxp;
1698 if (!ctx || !ctx->nr_events) 1700 if (!ctx || !ctx->nr_events)
1699 goto out; 1701 goto out;
1700 1702
1701 __perf_event_task_sched_out(ctx); 1703 task_ctx_sched_out(ctx, EVENT_ALL);
1702 1704
1703 raw_spin_lock(&ctx->lock); 1705 raw_spin_lock(&ctx->lock);
1704 1706
@@ -1722,8 +1724,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1722 1724
1723 raw_spin_unlock(&ctx->lock); 1725 raw_spin_unlock(&ctx->lock);
1724 1726
1725 perf_event_task_sched_in(task); 1727 perf_event_context_sched_in(ctx);
1726 out: 1728out:
1727 local_irq_restore(flags); 1729 local_irq_restore(flags);
1728} 1730}
1729 1731
@@ -1732,9 +1734,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1732 */ 1734 */
1733static void __perf_event_read(void *info) 1735static void __perf_event_read(void *info)
1734{ 1736{
1735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1736 struct perf_event *event = info; 1737 struct perf_event *event = info;
1737 struct perf_event_context *ctx = event->ctx; 1738 struct perf_event_context *ctx = event->ctx;
1739 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1738 1740
1739 /* 1741 /*
1740 * If this is a task context, we need to check whether it is 1742 * If this is a task context, we need to check whether it is
@@ -1782,11 +1784,219 @@ static u64 perf_event_read(struct perf_event *event)
1782} 1784}
1783 1785
1784/* 1786/*
1785 * Initialize the perf_event context in a task_struct: 1787 * Callchain support
1786 */ 1788 */
1789
1790struct callchain_cpus_entries {
1791 struct rcu_head rcu_head;
1792 struct perf_callchain_entry *cpu_entries[0];
1793};
1794
1795static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1796static atomic_t nr_callchain_events;
1797static DEFINE_MUTEX(callchain_mutex);
1798struct callchain_cpus_entries *callchain_cpus_entries;
1799
1800
1801__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1802 struct pt_regs *regs)
1803{
1804}
1805
1806__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1807 struct pt_regs *regs)
1808{
1809}
1810
1811static void release_callchain_buffers_rcu(struct rcu_head *head)
1812{
1813 struct callchain_cpus_entries *entries;
1814 int cpu;
1815
1816 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1817
1818 for_each_possible_cpu(cpu)
1819 kfree(entries->cpu_entries[cpu]);
1820
1821 kfree(entries);
1822}
1823
1824static void release_callchain_buffers(void)
1825{
1826 struct callchain_cpus_entries *entries;
1827
1828 entries = callchain_cpus_entries;
1829 rcu_assign_pointer(callchain_cpus_entries, NULL);
1830 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1831}
1832
1833static int alloc_callchain_buffers(void)
1834{
1835 int cpu;
1836 int size;
1837 struct callchain_cpus_entries *entries;
1838
1839 /*
1840 * We can't use the percpu allocation API for data that can be
1841 * accessed from NMI. Use a temporary manual per cpu allocation
1842 * until that gets sorted out.
1843 */
1844 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1845 num_possible_cpus();
1846
1847 entries = kzalloc(size, GFP_KERNEL);
1848 if (!entries)
1849 return -ENOMEM;
1850
1851 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1852
1853 for_each_possible_cpu(cpu) {
1854 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1855 cpu_to_node(cpu));
1856 if (!entries->cpu_entries[cpu])
1857 goto fail;
1858 }
1859
1860 rcu_assign_pointer(callchain_cpus_entries, entries);
1861
1862 return 0;
1863
1864fail:
1865 for_each_possible_cpu(cpu)
1866 kfree(entries->cpu_entries[cpu]);
1867 kfree(entries);
1868
1869 return -ENOMEM;
1870}
1871
1872static int get_callchain_buffers(void)
1873{
1874 int err = 0;
1875 int count;
1876
1877 mutex_lock(&callchain_mutex);
1878
1879 count = atomic_inc_return(&nr_callchain_events);
1880 if (WARN_ON_ONCE(count < 1)) {
1881 err = -EINVAL;
1882 goto exit;
1883 }
1884
1885 if (count > 1) {
1886 /* If the allocation failed, give up */
1887 if (!callchain_cpus_entries)
1888 err = -ENOMEM;
1889 goto exit;
1890 }
1891
1892 err = alloc_callchain_buffers();
1893 if (err)
1894 release_callchain_buffers();
1895exit:
1896 mutex_unlock(&callchain_mutex);
1897
1898 return err;
1899}
1900
1901static void put_callchain_buffers(void)
1902{
1903 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1904 release_callchain_buffers();
1905 mutex_unlock(&callchain_mutex);
1906 }
1907}
1908
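get_callchain_buffers()/put_callchain_buffers() above implement a first-user-allocates, last-user-frees scheme serialized by callchain_mutex. A standalone sketch of the same pattern, using pthreads instead of kernel primitives and illustrative names:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;
static void *buffers;

/* First successful getter allocates the shared buffers. */
static int get_buffers(void)
{
	int err = 0;

	pthread_mutex_lock(&lock);
	if (refcount == 0) {
		buffers = calloc(1, 4096);
		if (!buffers)
			err = -1;
	} else if (!buffers) {
		err = -1;	/* the first allocation failed earlier; give up */
	}
	if (!err)
		refcount++;
	pthread_mutex_unlock(&lock);
	return err;
}

/* Last putter tears the buffers down again. */
static void put_buffers(void)
{
	pthread_mutex_lock(&lock);
	if (--refcount == 0) {
		free(buffers);
		buffers = NULL;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	if (get_buffers())
		return 1;
	put_buffers();
	return 0;
}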
1909static int get_recursion_context(int *recursion)
1910{
1911 int rctx;
1912
1913 if (in_nmi())
1914 rctx = 3;
1915 else if (in_irq())
1916 rctx = 2;
1917 else if (in_softirq())
1918 rctx = 1;
1919 else
1920 rctx = 0;
1921
1922 if (recursion[rctx])
1923 return -1;
1924
1925 recursion[rctx]++;
1926 barrier();
1927
1928 return rctx;
1929}
1930
1931static inline void put_recursion_context(int *recursion, int rctx)
1932{
1933 barrier();
1934 recursion[rctx]--;
1935}
1936
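The recursion helpers above keep one flag per execution context (task, softirq, hardirq, NMI): nesting across levels is allowed, but re-entry within the same level is refused so a callchain capture cannot recurse into itself. A runnable userspace model of that guard, with illustrative names:

#include <stdio.h>

enum { CTX_TASK, CTX_SOFTIRQ, CTX_IRQ, CTX_NMI, NR_CTX };

static int recursion[NR_CTX];

static int get_recursion_ctx(int rctx)
{
	if (recursion[rctx])
		return -1;	/* already handling an event at this level: drop */
	recursion[rctx]++;
	return rctx;
}

static void put_recursion_ctx(int rctx)
{
	recursion[rctx]--;
}

int main(void)
{
	int outer  = get_recursion_ctx(CTX_TASK);	/* taken */
	int nested = get_recursion_ctx(CTX_TASK);	/* refused: -1 */
	int nmi    = get_recursion_ctx(CTX_NMI);	/* different level: taken */

	printf("outer=%d nested=%d nmi=%d\n", outer, nested, nmi);
	put_recursion_ctx(nmi);
	put_recursion_ctx(outer);
	return 0;
}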
1937static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1938{
1939 int cpu;
1940 struct callchain_cpus_entries *entries;
1941
1942 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
1943 if (*rctx == -1)
1944 return NULL;
1945
1946 entries = rcu_dereference(callchain_cpus_entries);
1947 if (!entries)
1948 return NULL;
1949
1950 cpu = smp_processor_id();
1951
1952 return &entries->cpu_entries[cpu][*rctx];
1953}
1954
1787static void 1955static void
1788__perf_event_init_context(struct perf_event_context *ctx, 1956put_callchain_entry(int rctx)
1789 struct task_struct *task) 1957{
1958 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
1959}
1960
1961static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1962{
1963 int rctx;
1964 struct perf_callchain_entry *entry;
1965
1966
1967 entry = get_callchain_entry(&rctx);
1968 if (rctx == -1)
1969 return NULL;
1970
1971 if (!entry)
1972 goto exit_put;
1973
1974 entry->nr = 0;
1975
1976 if (!user_mode(regs)) {
1977 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
1978 perf_callchain_kernel(entry, regs);
1979 if (current->mm)
1980 regs = task_pt_regs(current);
1981 else
1982 regs = NULL;
1983 }
1984
1985 if (regs) {
1986 perf_callchain_store(entry, PERF_CONTEXT_USER);
1987 perf_callchain_user(entry, regs);
1988 }
1989
1990exit_put:
1991 put_callchain_entry(rctx);
1992
1993 return entry;
1994}
1995
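perf_callchain() above stores a PERF_CONTEXT_KERNEL marker before the kernel frames and a PERF_CONTEXT_USER marker before the user frames, so consumers can split the recorded chain afterwards. A sketch of such a consumer, assuming only the marker values exported by linux/perf_event.h:

#include <stdio.h>
#include <stdint.h>
#include <linux/perf_event.h>

/* Marker entries separate the kernel and user sections; every other
 * entry is an instruction pointer belonging to the current section. */
static void dump_callchain(const uint64_t *ips, unsigned int nr)
{
	const char *ctx = "?";
	unsigned int i;

	for (i = 0; i < nr; i++) {
		if (ips[i] == (uint64_t)PERF_CONTEXT_KERNEL) { ctx = "kernel"; continue; }
		if (ips[i] == (uint64_t)PERF_CONTEXT_USER)   { ctx = "user";   continue; }
		printf("%-6s ip %#llx\n", ctx, (unsigned long long)ips[i]);
	}
}

int main(void)
{
	uint64_t chain[] = {
		PERF_CONTEXT_KERNEL, 0xffffffff81000010ULL,
		PERF_CONTEXT_USER,   0x400123ULL,
	};

	dump_callchain(chain, sizeof(chain) / sizeof(chain[0]));
	return 0;
}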
1996/*
1997 * Initialize the perf_event context in a task_struct:
1998 */
1999static void __perf_event_init_context(struct perf_event_context *ctx)
1790{ 2000{
1791 raw_spin_lock_init(&ctx->lock); 2001 raw_spin_lock_init(&ctx->lock);
1792 mutex_init(&ctx->mutex); 2002 mutex_init(&ctx->mutex);
@@ -1794,45 +2004,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1794 INIT_LIST_HEAD(&ctx->flexible_groups); 2004 INIT_LIST_HEAD(&ctx->flexible_groups);
1795 INIT_LIST_HEAD(&ctx->event_list); 2005 INIT_LIST_HEAD(&ctx->event_list);
1796 atomic_set(&ctx->refcount, 1); 2006 atomic_set(&ctx->refcount, 1);
1797 ctx->task = task;
1798} 2007}
1799 2008
1800static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2009static struct perf_event_context *
2010alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1801{ 2011{
1802 struct perf_event_context *ctx; 2012 struct perf_event_context *ctx;
1803 struct perf_cpu_context *cpuctx;
1804 struct task_struct *task;
1805 unsigned long flags;
1806 int err;
1807
1808 if (pid == -1 && cpu != -1) {
1809 /* Must be root to operate on a CPU event: */
1810 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1811 return ERR_PTR(-EACCES);
1812 2013
1813 if (cpu < 0 || cpu >= nr_cpumask_bits) 2014 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1814 return ERR_PTR(-EINVAL); 2015 if (!ctx)
2016 return NULL;
1815 2017
1816 /* 2018 __perf_event_init_context(ctx);
1817 * We could be clever and allow to attach a event to an 2019 if (task) {
1818 * offline CPU and activate it when the CPU comes up, but 2020 ctx->task = task;
1819 * that's for later. 2021 get_task_struct(task);
1820 */ 2022 }
1821 if (!cpu_online(cpu)) 2023 ctx->pmu = pmu;
1822 return ERR_PTR(-ENODEV);
1823 2024
1824 cpuctx = &per_cpu(perf_cpu_context, cpu); 2025 return ctx;
1825 ctx = &cpuctx->ctx; 2026}
1826 get_ctx(ctx);
1827 2027
1828 return ctx; 2028static struct task_struct *
1829 } 2029find_lively_task_by_vpid(pid_t vpid)
2030{
2031 struct task_struct *task;
2032 int err;
1830 2033
1831 rcu_read_lock(); 2034 rcu_read_lock();
1832 if (!pid) 2035 if (!vpid)
1833 task = current; 2036 task = current;
1834 else 2037 else
1835 task = find_task_by_vpid(pid); 2038 task = find_task_by_vpid(vpid);
1836 if (task) 2039 if (task)
1837 get_task_struct(task); 2040 get_task_struct(task);
1838 rcu_read_unlock(); 2041 rcu_read_unlock();
@@ -1852,35 +2055,79 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1852 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2055 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1853 goto errout; 2056 goto errout;
1854 2057
1855 retry: 2058 return task;
1856 ctx = perf_lock_task_context(task, &flags); 2059errout:
2060 put_task_struct(task);
2061 return ERR_PTR(err);
2062
2063}
2064
2065static struct perf_event_context *
2066find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2067{
2068 struct perf_event_context *ctx;
2069 struct perf_cpu_context *cpuctx;
2070 unsigned long flags;
2071 int ctxn, err;
2072
2073 if (!task && cpu != -1) {
2074 /* Must be root to operate on a CPU event: */
2075 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2076 return ERR_PTR(-EACCES);
2077
2078 if (cpu < 0 || cpu >= nr_cpumask_bits)
2079 return ERR_PTR(-EINVAL);
2080
2081 /*
2082 * We could be clever and allow to attach a event to an
2083 * offline CPU and activate it when the CPU comes up, but
2084 * that's for later.
2085 */
2086 if (!cpu_online(cpu))
2087 return ERR_PTR(-ENODEV);
2088
2089 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2090 ctx = &cpuctx->ctx;
2091 get_ctx(ctx);
2092
2093 return ctx;
2094 }
2095
2096 err = -EINVAL;
2097 ctxn = pmu->task_ctx_nr;
2098 if (ctxn < 0)
2099 goto errout;
2100
2101retry:
2102 ctx = perf_lock_task_context(task, ctxn, &flags);
1857 if (ctx) { 2103 if (ctx) {
1858 unclone_ctx(ctx); 2104 unclone_ctx(ctx);
1859 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2105 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1860 } 2106 }
1861 2107
1862 if (!ctx) { 2108 if (!ctx) {
1863 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2109 ctx = alloc_perf_context(pmu, task);
1864 err = -ENOMEM; 2110 err = -ENOMEM;
1865 if (!ctx) 2111 if (!ctx)
1866 goto errout; 2112 goto errout;
1867 __perf_event_init_context(ctx, task); 2113
1868 get_ctx(ctx); 2114 get_ctx(ctx);
1869 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2115
2116 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1870 /* 2117 /*
1871 * We raced with some other task; use 2118 * We raced with some other task; use
1872 * the context they set. 2119 * the context they set.
1873 */ 2120 */
2121 put_task_struct(task);
1874 kfree(ctx); 2122 kfree(ctx);
1875 goto retry; 2123 goto retry;
1876 } 2124 }
1877 get_task_struct(task);
1878 } 2125 }
1879 2126
1880 put_task_struct(task); 2127 put_task_struct(task);
1881 return ctx; 2128 return ctx;
1882 2129
1883 errout: 2130errout:
1884 put_task_struct(task); 2131 put_task_struct(task);
1885 return ERR_PTR(err); 2132 return ERR_PTR(err);
1886} 2133}
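find_get_context() installs a freshly allocated context with cmpxchg() and retries if another thread won the race, so the common lookup path needs no lock. A standalone C11 model of that allocate, try-install, retry pattern; struct ctx and get_or_alloc_ctx are illustrative names:

#include <stdlib.h>
#include <stdatomic.h>

struct ctx { int dummy; };

static _Atomic(struct ctx *) task_ctx;

static struct ctx *get_or_alloc_ctx(void)
{
	struct ctx *ctx, *expected;

retry:
	ctx = atomic_load(&task_ctx);
	if (ctx)
		return ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	expected = NULL;
	if (!atomic_compare_exchange_strong(&task_ctx, &expected, ctx)) {
		free(ctx);	/* lost the race: use the winner's context */
		goto retry;
	}
	return ctx;
}

int main(void)
{
	return get_or_alloc_ctx() ? 0 : 1;
}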
@@ -1913,6 +2160,8 @@ static void free_event(struct perf_event *event)
1913 atomic_dec(&nr_comm_events); 2160 atomic_dec(&nr_comm_events);
1914 if (event->attr.task) 2161 if (event->attr.task)
1915 atomic_dec(&nr_task_events); 2162 atomic_dec(&nr_task_events);
2163 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2164 put_callchain_buffers();
1916 } 2165 }
1917 2166
1918 if (event->buffer) { 2167 if (event->buffer) {
@@ -1923,7 +2172,9 @@ static void free_event(struct perf_event *event)
1923 if (event->destroy) 2172 if (event->destroy)
1924 event->destroy(event); 2173 event->destroy(event);
1925 2174
1926 put_ctx(event->ctx); 2175 if (event->ctx)
2176 put_ctx(event->ctx);
2177
1927 call_rcu(&event->rcu_head, free_event_rcu); 2178 call_rcu(&event->rcu_head, free_event_rcu);
1928} 2179}
1929 2180
@@ -2344,6 +2595,9 @@ int perf_event_task_disable(void)
2344 2595
2345static int perf_event_index(struct perf_event *event) 2596static int perf_event_index(struct perf_event *event)
2346{ 2597{
2598 if (event->hw.state & PERF_HES_STOPPED)
2599 return 0;
2600
2347 if (event->state != PERF_EVENT_STATE_ACTIVE) 2601 if (event->state != PERF_EVENT_STATE_ACTIVE)
2348 return 0; 2602 return 0;
2349 2603
@@ -2956,16 +3210,6 @@ void perf_event_do_pending(void)
2956} 3210}
2957 3211
2958/* 3212/*
2959 * Callchain support -- arch specific
2960 */
2961
2962__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2963{
2964 return NULL;
2965}
2966
2967
2968/*
2969 * We assume there is only KVM supporting the callbacks. 3213 * We assume there is only KVM supporting the callbacks.
2970 * Later on, we might change it to a list if there is 3214 * Later on, we might change it to a list if there is
2971 * another virtualization implementation supporting the callbacks. 3215 * another virtualization implementation supporting the callbacks.
@@ -3071,7 +3315,7 @@ again:
3071 if (handle->wakeup != local_read(&buffer->wakeup)) 3315 if (handle->wakeup != local_read(&buffer->wakeup))
3072 perf_output_wakeup(handle); 3316 perf_output_wakeup(handle);
3073 3317
3074 out: 3318out:
3075 preempt_enable(); 3319 preempt_enable();
3076} 3320}
3077 3321
@@ -3459,14 +3703,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3459 struct perf_output_handle handle; 3703 struct perf_output_handle handle;
3460 struct perf_event_header header; 3704 struct perf_event_header header;
3461 3705
3706 /* protect the callchain buffers */
3707 rcu_read_lock();
3708
3462 perf_prepare_sample(&header, data, event, regs); 3709 perf_prepare_sample(&header, data, event, regs);
3463 3710
3464 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3711 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3465 return; 3712 goto exit;
3466 3713
3467 perf_output_sample(&handle, &header, data, event); 3714 perf_output_sample(&handle, &header, data, event);
3468 3715
3469 perf_output_end(&handle); 3716 perf_output_end(&handle);
3717
3718exit:
3719 rcu_read_unlock();
3470} 3720}
3471 3721
3472/* 3722/*
@@ -3580,16 +3830,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3580static void perf_event_task_event(struct perf_task_event *task_event) 3830static void perf_event_task_event(struct perf_task_event *task_event)
3581{ 3831{
3582 struct perf_cpu_context *cpuctx; 3832 struct perf_cpu_context *cpuctx;
3583 struct perf_event_context *ctx = task_event->task_ctx; 3833 struct perf_event_context *ctx;
3834 struct pmu *pmu;
3835 int ctxn;
3584 3836
3585 rcu_read_lock(); 3837 rcu_read_lock();
3586 cpuctx = &get_cpu_var(perf_cpu_context); 3838 list_for_each_entry_rcu(pmu, &pmus, entry) {
3587 perf_event_task_ctx(&cpuctx->ctx, task_event); 3839 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3588 if (!ctx) 3840 perf_event_task_ctx(&cpuctx->ctx, task_event);
3589 ctx = rcu_dereference(current->perf_event_ctxp); 3841
3590 if (ctx) 3842 ctx = task_event->task_ctx;
3591 perf_event_task_ctx(ctx, task_event); 3843 if (!ctx) {
3592 put_cpu_var(perf_cpu_context); 3844 ctxn = pmu->task_ctx_nr;
3845 if (ctxn < 0)
3846 goto next;
3847 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3848 }
3849 if (ctx)
3850 perf_event_task_ctx(ctx, task_event);
3851next:
3852 put_cpu_ptr(pmu->pmu_cpu_context);
3853 }
3593 rcu_read_unlock(); 3854 rcu_read_unlock();
3594} 3855}
3595 3856
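The loop above shows the new side-band delivery pattern: every registered pmu is offered the event through its per-CPU context, and additionally through the current task's context for that pmu's context class (pmu->task_ctx_nr), if it has one. A toy model of that fan-out, with made-up pmu names:

#include <stdio.h>

#define NR_CLASSES 2

struct pmu { const char *name; int task_ctx_nr; };	/* -1: no task context */

static struct pmu pmus[] = {
	{ "hardware", 0 },
	{ "software", 1 },
	{ "cpu-only", -1 },
};

static const char *task_ctx[NR_CLASSES] = { "hw-ctx", "sw-ctx" };

static void deliver(const char *what)
{
	unsigned int i;

	for (i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
		printf("%s -> %s cpu context\n", what, pmus[i].name);
		if (pmus[i].task_ctx_nr >= 0 && task_ctx[pmus[i].task_ctx_nr])
			printf("%s -> task %s\n", what, task_ctx[pmus[i].task_ctx_nr]);
	}
}

int main(void)
{
	deliver("comm");
	return 0;
}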
@@ -3694,8 +3955,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3694{ 3955{
3695 struct perf_cpu_context *cpuctx; 3956 struct perf_cpu_context *cpuctx;
3696 struct perf_event_context *ctx; 3957 struct perf_event_context *ctx;
3697 unsigned int size;
3698 char comm[TASK_COMM_LEN]; 3958 char comm[TASK_COMM_LEN];
3959 unsigned int size;
3960 struct pmu *pmu;
3961 int ctxn;
3699 3962
3700 memset(comm, 0, sizeof(comm)); 3963 memset(comm, 0, sizeof(comm));
3701 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3964 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3707,21 +3970,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3707 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3970 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3708 3971
3709 rcu_read_lock(); 3972 rcu_read_lock();
3710 cpuctx = &get_cpu_var(perf_cpu_context); 3973 list_for_each_entry_rcu(pmu, &pmus, entry) {
3711 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3974 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3712 ctx = rcu_dereference(current->perf_event_ctxp); 3975 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3713 if (ctx) 3976
3714 perf_event_comm_ctx(ctx, comm_event); 3977 ctxn = pmu->task_ctx_nr;
3715 put_cpu_var(perf_cpu_context); 3978 if (ctxn < 0)
3979 goto next;
3980
3981 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3982 if (ctx)
3983 perf_event_comm_ctx(ctx, comm_event);
3984next:
3985 put_cpu_ptr(pmu->pmu_cpu_context);
3986 }
3716 rcu_read_unlock(); 3987 rcu_read_unlock();
3717} 3988}
3718 3989
3719void perf_event_comm(struct task_struct *task) 3990void perf_event_comm(struct task_struct *task)
3720{ 3991{
3721 struct perf_comm_event comm_event; 3992 struct perf_comm_event comm_event;
3993 struct perf_event_context *ctx;
3994 int ctxn;
3722 3995
3723 if (task->perf_event_ctxp) 3996 for_each_task_context_nr(ctxn) {
3724 perf_event_enable_on_exec(task); 3997 ctx = task->perf_event_ctxp[ctxn];
3998 if (!ctx)
3999 continue;
4000
4001 perf_event_enable_on_exec(ctx);
4002 }
3725 4003
3726 if (!atomic_read(&nr_comm_events)) 4004 if (!atomic_read(&nr_comm_events))
3727 return; 4005 return;
@@ -3823,6 +4101,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3823 char tmp[16]; 4101 char tmp[16];
3824 char *buf = NULL; 4102 char *buf = NULL;
3825 const char *name; 4103 const char *name;
4104 struct pmu *pmu;
4105 int ctxn;
3826 4106
3827 memset(tmp, 0, sizeof(tmp)); 4107 memset(tmp, 0, sizeof(tmp));
3828 4108
@@ -3875,12 +4155,23 @@ got_name:
3875 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4155 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3876 4156
3877 rcu_read_lock(); 4157 rcu_read_lock();
3878 cpuctx = &get_cpu_var(perf_cpu_context); 4158 list_for_each_entry_rcu(pmu, &pmus, entry) {
3879 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4159 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3880 ctx = rcu_dereference(current->perf_event_ctxp); 4160 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3881 if (ctx) 4161 vma->vm_flags & VM_EXEC);
3882 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4162
3883 put_cpu_var(perf_cpu_context); 4163 ctxn = pmu->task_ctx_nr;
4164 if (ctxn < 0)
4165 goto next;
4166
4167 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4168 if (ctx) {
4169 perf_event_mmap_ctx(ctx, mmap_event,
4170 vma->vm_flags & VM_EXEC);
4171 }
4172next:
4173 put_cpu_ptr(pmu->pmu_cpu_context);
4174 }
3884 rcu_read_unlock(); 4175 rcu_read_unlock();
3885 4176
3886 kfree(buf); 4177 kfree(buf);
@@ -3962,8 +4253,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3962 struct hw_perf_event *hwc = &event->hw; 4253 struct hw_perf_event *hwc = &event->hw;
3963 int ret = 0; 4254 int ret = 0;
3964 4255
3965 throttle = (throttle && event->pmu->unthrottle != NULL);
3966
3967 if (!throttle) { 4256 if (!throttle) {
3968 hwc->interrupts++; 4257 hwc->interrupts++;
3969 } else { 4258 } else {
@@ -4031,6 +4320,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4031 * Generic software event infrastructure 4320 * Generic software event infrastructure
4032 */ 4321 */
4033 4322
4323struct swevent_htable {
4324 struct swevent_hlist *swevent_hlist;
4325 struct mutex hlist_mutex;
4326 int hlist_refcount;
4327
4328 /* Recursion avoidance in each contexts */
4329 int recursion[PERF_NR_CONTEXTS];
4330};
4331
4332static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4333
4034/* 4334/*
4035 * We directly increment event->count and keep a second value in 4335 * We directly increment event->count and keep a second value in
4036 * event->hw.period_left to count intervals. This period event 4336 * event->hw.period_left to count intervals. This period event
@@ -4088,7 +4388,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4088 } 4388 }
4089} 4389}
4090 4390
4091static void perf_swevent_add(struct perf_event *event, u64 nr, 4391static void perf_swevent_event(struct perf_event *event, u64 nr,
4092 int nmi, struct perf_sample_data *data, 4392 int nmi, struct perf_sample_data *data,
4093 struct pt_regs *regs) 4393 struct pt_regs *regs)
4094{ 4394{
@@ -4114,6 +4414,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4114static int perf_exclude_event(struct perf_event *event, 4414static int perf_exclude_event(struct perf_event *event,
4115 struct pt_regs *regs) 4415 struct pt_regs *regs)
4116{ 4416{
4417 if (event->hw.state & PERF_HES_STOPPED)
4418 return 0;
4419
4117 if (regs) { 4420 if (regs) {
4118 if (event->attr.exclude_user && user_mode(regs)) 4421 if (event->attr.exclude_user && user_mode(regs))
4119 return 1; 4422 return 1;
@@ -4160,11 +4463,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4160 4463
4161/* For the read side: events when they trigger */ 4464/* For the read side: events when they trigger */
4162static inline struct hlist_head * 4465static inline struct hlist_head *
4163find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4466find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4164{ 4467{
4165 struct swevent_hlist *hlist; 4468 struct swevent_hlist *hlist;
4166 4469
4167 hlist = rcu_dereference(ctx->swevent_hlist); 4470 hlist = rcu_dereference(swhash->swevent_hlist);
4168 if (!hlist) 4471 if (!hlist)
4169 return NULL; 4472 return NULL;
4170 4473
@@ -4173,7 +4476,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4173 4476
4174/* For the event head insertion and removal in the hlist */ 4477/* For the event head insertion and removal in the hlist */
4175static inline struct hlist_head * 4478static inline struct hlist_head *
4176find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4479find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4177{ 4480{
4178 struct swevent_hlist *hlist; 4481 struct swevent_hlist *hlist;
4179 u32 event_id = event->attr.config; 4482 u32 event_id = event->attr.config;
@@ -4184,7 +4487,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4184 * and release. Which makes the protected version suitable here. 4487 * and release. Which makes the protected version suitable here.
4185 * The context lock guarantees that. 4488 * The context lock guarantees that.
4186 */ 4489 */
4187 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4490 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4188 lockdep_is_held(&event->ctx->lock)); 4491 lockdep_is_held(&event->ctx->lock));
4189 if (!hlist) 4492 if (!hlist)
4190 return NULL; 4493 return NULL;
@@ -4197,23 +4500,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4197 struct perf_sample_data *data, 4500 struct perf_sample_data *data,
4198 struct pt_regs *regs) 4501 struct pt_regs *regs)
4199{ 4502{
4200 struct perf_cpu_context *cpuctx; 4503 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4201 struct perf_event *event; 4504 struct perf_event *event;
4202 struct hlist_node *node; 4505 struct hlist_node *node;
4203 struct hlist_head *head; 4506 struct hlist_head *head;
4204 4507
4205 cpuctx = &__get_cpu_var(perf_cpu_context);
4206
4207 rcu_read_lock(); 4508 rcu_read_lock();
4208 4509 head = find_swevent_head_rcu(swhash, type, event_id);
4209 head = find_swevent_head_rcu(cpuctx, type, event_id);
4210
4211 if (!head) 4510 if (!head)
4212 goto end; 4511 goto end;
4213 4512
4214 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4513 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4215 if (perf_swevent_match(event, type, event_id, data, regs)) 4514 if (perf_swevent_match(event, type, event_id, data, regs))
4216 perf_swevent_add(event, nr, nmi, data, regs); 4515 perf_swevent_event(event, nr, nmi, data, regs);
4217 } 4516 }
4218end: 4517end:
4219 rcu_read_unlock(); 4518 rcu_read_unlock();
@@ -4221,33 +4520,17 @@ end:
4221 4520
4222int perf_swevent_get_recursion_context(void) 4521int perf_swevent_get_recursion_context(void)
4223{ 4522{
4224 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4523 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4225 int rctx;
4226
4227 if (in_nmi())
4228 rctx = 3;
4229 else if (in_irq())
4230 rctx = 2;
4231 else if (in_softirq())
4232 rctx = 1;
4233 else
4234 rctx = 0;
4235 4524
4236 if (cpuctx->recursion[rctx]) 4525 return get_recursion_context(swhash->recursion);
4237 return -1;
4238
4239 cpuctx->recursion[rctx]++;
4240 barrier();
4241
4242 return rctx;
4243} 4526}
4244EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4527EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4245 4528
4246void inline perf_swevent_put_recursion_context(int rctx) 4529void inline perf_swevent_put_recursion_context(int rctx)
4247{ 4530{
4248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4531 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4249 barrier(); 4532
4250 cpuctx->recursion[rctx]--; 4533 put_recursion_context(swhash->recursion, rctx);
4251} 4534}
4252 4535
4253void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4536void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4273,20 +4556,20 @@ static void perf_swevent_read(struct perf_event *event)
4273{ 4556{
4274} 4557}
4275 4558
4276static int perf_swevent_enable(struct perf_event *event) 4559static int perf_swevent_add(struct perf_event *event, int flags)
4277{ 4560{
4561 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4278 struct hw_perf_event *hwc = &event->hw; 4562 struct hw_perf_event *hwc = &event->hw;
4279 struct perf_cpu_context *cpuctx;
4280 struct hlist_head *head; 4563 struct hlist_head *head;
4281 4564
4282 cpuctx = &__get_cpu_var(perf_cpu_context);
4283
4284 if (hwc->sample_period) { 4565 if (hwc->sample_period) {
4285 hwc->last_period = hwc->sample_period; 4566 hwc->last_period = hwc->sample_period;
4286 perf_swevent_set_period(event); 4567 perf_swevent_set_period(event);
4287 } 4568 }
4288 4569
4289 head = find_swevent_head(cpuctx, event); 4570 hwc->state = !(flags & PERF_EF_START);
4571
4572 head = find_swevent_head(swhash, event);
4290 if (WARN_ON_ONCE(!head)) 4573 if (WARN_ON_ONCE(!head))
4291 return -EINVAL; 4574 return -EINVAL;
4292 4575
@@ -4295,202 +4578,27 @@ static int perf_swevent_enable(struct perf_event *event)
4295 return 0; 4578 return 0;
4296} 4579}
4297 4580
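perf_swevent_add() illustrates the reworked PMU interface: ->add() attaches the event and honours PERF_EF_START, while ->start()/->stop() toggle a stopped state that the sample path checks before counting. A compact standalone model of that state machine; the constants and names below are illustrative, not the kernel's:

#define EF_START	0x01
#define HES_STOPPED	0x01

struct event { int state; long count; };

static void event_start(struct event *e) { e->state = 0; }
static void event_stop(struct event *e)  { e->state = HES_STOPPED; }

static int event_add(struct event *e, int flags)
{
	e->state = (flags & EF_START) ? 0 : HES_STOPPED;
	return 0;
}

static void event_fire(struct event *e)
{
	if (!(e->state & HES_STOPPED))	/* stopped events ignore samples */
		e->count++;
}

int main(void)
{
	struct event e = { 0, 0 };

	event_add(&e, 0);	/* added but left stopped */
	event_fire(&e);		/* ignored */
	event_start(&e);
	event_fire(&e);		/* counted */
	event_stop(&e);
	return e.count != 1;
}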
4298static void perf_swevent_disable(struct perf_event *event) 4581static void perf_swevent_del(struct perf_event *event, int flags)
4299{ 4582{
4300 hlist_del_rcu(&event->hlist_entry); 4583 hlist_del_rcu(&event->hlist_entry);
4301} 4584}
4302 4585
4303static void perf_swevent_void(struct perf_event *event) 4586static void perf_swevent_start(struct perf_event *event, int flags)
4304{
4305}
4306
4307static int perf_swevent_int(struct perf_event *event)
4308{
4309 return 0;
4310}
4311
4312static const struct pmu perf_ops_generic = {
4313 .enable = perf_swevent_enable,
4314 .disable = perf_swevent_disable,
4315 .start = perf_swevent_int,
4316 .stop = perf_swevent_void,
4317 .read = perf_swevent_read,
4318 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4319};
4320
4321/*
4322 * hrtimer based swevent callback
4323 */
4324
4325static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4326{
4327 enum hrtimer_restart ret = HRTIMER_RESTART;
4328 struct perf_sample_data data;
4329 struct pt_regs *regs;
4330 struct perf_event *event;
4331 u64 period;
4332
4333 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4334 event->pmu->read(event);
4335
4336 perf_sample_data_init(&data, 0);
4337 data.period = event->hw.last_period;
4338 regs = get_irq_regs();
4339
4340 if (regs && !perf_exclude_event(event, regs)) {
4341 if (!(event->attr.exclude_idle && current->pid == 0))
4342 if (perf_event_overflow(event, 0, &data, regs))
4343 ret = HRTIMER_NORESTART;
4344 }
4345
4346 period = max_t(u64, 10000, event->hw.sample_period);
4347 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4348
4349 return ret;
4350}
4351
4352static void perf_swevent_start_hrtimer(struct perf_event *event)
4353{ 4587{
4354 struct hw_perf_event *hwc = &event->hw; 4588 event->hw.state = 0;
4355
4356 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4357 hwc->hrtimer.function = perf_swevent_hrtimer;
4358 if (hwc->sample_period) {
4359 u64 period;
4360
4361 if (hwc->remaining) {
4362 if (hwc->remaining < 0)
4363 period = 10000;
4364 else
4365 period = hwc->remaining;
4366 hwc->remaining = 0;
4367 } else {
4368 period = max_t(u64, 10000, hwc->sample_period);
4369 }
4370 __hrtimer_start_range_ns(&hwc->hrtimer,
4371 ns_to_ktime(period), 0,
4372 HRTIMER_MODE_REL, 0);
4373 }
4374} 4589}
4375 4590
4376static void perf_swevent_cancel_hrtimer(struct perf_event *event) 4591static void perf_swevent_stop(struct perf_event *event, int flags)
4377{ 4592{
4378 struct hw_perf_event *hwc = &event->hw; 4593 event->hw.state = PERF_HES_STOPPED;
4379
4380 if (hwc->sample_period) {
4381 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4382 hwc->remaining = ktime_to_ns(remaining);
4383
4384 hrtimer_cancel(&hwc->hrtimer);
4385 }
4386} 4594}
4387 4595
4388/*
4389 * Software event: cpu wall time clock
4390 */
4391
4392static void cpu_clock_perf_event_update(struct perf_event *event)
4393{
4394 int cpu = raw_smp_processor_id();
4395 s64 prev;
4396 u64 now;
4397
4398 now = cpu_clock(cpu);
4399 prev = local64_xchg(&event->hw.prev_count, now);
4400 local64_add(now - prev, &event->count);
4401}
4402
4403static int cpu_clock_perf_event_enable(struct perf_event *event)
4404{
4405 struct hw_perf_event *hwc = &event->hw;
4406 int cpu = raw_smp_processor_id();
4407
4408 local64_set(&hwc->prev_count, cpu_clock(cpu));
4409 perf_swevent_start_hrtimer(event);
4410
4411 return 0;
4412}
4413
4414static void cpu_clock_perf_event_disable(struct perf_event *event)
4415{
4416 perf_swevent_cancel_hrtimer(event);
4417 cpu_clock_perf_event_update(event);
4418}
4419
4420static void cpu_clock_perf_event_read(struct perf_event *event)
4421{
4422 cpu_clock_perf_event_update(event);
4423}
4424
4425static const struct pmu perf_ops_cpu_clock = {
4426 .enable = cpu_clock_perf_event_enable,
4427 .disable = cpu_clock_perf_event_disable,
4428 .read = cpu_clock_perf_event_read,
4429};
4430
4431/*
4432 * Software event: task time clock
4433 */
4434
4435static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4436{
4437 u64 prev;
4438 s64 delta;
4439
4440 prev = local64_xchg(&event->hw.prev_count, now);
4441 delta = now - prev;
4442 local64_add(delta, &event->count);
4443}
4444
4445static int task_clock_perf_event_enable(struct perf_event *event)
4446{
4447 struct hw_perf_event *hwc = &event->hw;
4448 u64 now;
4449
4450 now = event->ctx->time;
4451
4452 local64_set(&hwc->prev_count, now);
4453
4454 perf_swevent_start_hrtimer(event);
4455
4456 return 0;
4457}
4458
4459static void task_clock_perf_event_disable(struct perf_event *event)
4460{
4461 perf_swevent_cancel_hrtimer(event);
4462 task_clock_perf_event_update(event, event->ctx->time);
4463
4464}
4465
4466static void task_clock_perf_event_read(struct perf_event *event)
4467{
4468 u64 time;
4469
4470 if (!in_nmi()) {
4471 update_context_time(event->ctx);
4472 time = event->ctx->time;
4473 } else {
4474 u64 now = perf_clock();
4475 u64 delta = now - event->ctx->timestamp;
4476 time = event->ctx->time + delta;
4477 }
4478
4479 task_clock_perf_event_update(event, time);
4480}
4481
4482static const struct pmu perf_ops_task_clock = {
4483 .enable = task_clock_perf_event_enable,
4484 .disable = task_clock_perf_event_disable,
4485 .read = task_clock_perf_event_read,
4486};
4487
4488/* Deref the hlist from the update side */ 4596/* Deref the hlist from the update side */
4489static inline struct swevent_hlist * 4597static inline struct swevent_hlist *
4490swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4598swevent_hlist_deref(struct swevent_htable *swhash)
4491{ 4599{
4492 return rcu_dereference_protected(cpuctx->swevent_hlist, 4600 return rcu_dereference_protected(swhash->swevent_hlist,
4493 lockdep_is_held(&cpuctx->hlist_mutex)); 4601 lockdep_is_held(&swhash->hlist_mutex));
4494} 4602}
4495 4603
4496static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4604static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4501,27 +4609,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4501 kfree(hlist); 4609 kfree(hlist);
4502} 4610}
4503 4611
4504static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4612static void swevent_hlist_release(struct swevent_htable *swhash)
4505{ 4613{
4506 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4614 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4507 4615
4508 if (!hlist) 4616 if (!hlist)
4509 return; 4617 return;
4510 4618
4511 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4619 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4512 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4620 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4513} 4621}
4514 4622
4515static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4623static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4516{ 4624{
4517 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4625 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4518 4626
4519 mutex_lock(&cpuctx->hlist_mutex); 4627 mutex_lock(&swhash->hlist_mutex);
4520 4628
4521 if (!--cpuctx->hlist_refcount) 4629 if (!--swhash->hlist_refcount)
4522 swevent_hlist_release(cpuctx); 4630 swevent_hlist_release(swhash);
4523 4631
4524 mutex_unlock(&cpuctx->hlist_mutex); 4632 mutex_unlock(&swhash->hlist_mutex);
4525} 4633}
4526 4634
4527static void swevent_hlist_put(struct perf_event *event) 4635static void swevent_hlist_put(struct perf_event *event)
@@ -4539,12 +4647,12 @@ static void swevent_hlist_put(struct perf_event *event)
4539 4647
4540static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4648static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4541{ 4649{
4542 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4650 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4543 int err = 0; 4651 int err = 0;
4544 4652
4545 mutex_lock(&cpuctx->hlist_mutex); 4653 mutex_lock(&swhash->hlist_mutex);
4546 4654
4547 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4655 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4548 struct swevent_hlist *hlist; 4656 struct swevent_hlist *hlist;
4549 4657
4550 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4658 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4552,11 +4660,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4552 err = -ENOMEM; 4660 err = -ENOMEM;
4553 goto exit; 4661 goto exit;
4554 } 4662 }
4555 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4663 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4556 } 4664 }
4557 cpuctx->hlist_refcount++; 4665 swhash->hlist_refcount++;
4558 exit: 4666exit:
4559 mutex_unlock(&cpuctx->hlist_mutex); 4667 mutex_unlock(&swhash->hlist_mutex);
4560 4668
4561 return err; 4669 return err;
4562} 4670}
@@ -4580,7 +4688,7 @@ static int swevent_hlist_get(struct perf_event *event)
4580 put_online_cpus(); 4688 put_online_cpus();
4581 4689
4582 return 0; 4690 return 0;
4583 fail: 4691fail:
4584 for_each_possible_cpu(cpu) { 4692 for_each_possible_cpu(cpu) {
4585 if (cpu == failed_cpu) 4693 if (cpu == failed_cpu)
4586 break; 4694 break;
@@ -4591,17 +4699,64 @@ static int swevent_hlist_get(struct perf_event *event)
4591 return err; 4699 return err;
4592} 4700}
4593 4701
4594#ifdef CONFIG_EVENT_TRACING 4702atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4703
4704static void sw_perf_event_destroy(struct perf_event *event)
4705{
4706 u64 event_id = event->attr.config;
4707
4708 WARN_ON(event->parent);
4709
4710 atomic_dec(&perf_swevent_enabled[event_id]);
4711 swevent_hlist_put(event);
4712}
4713
4714static int perf_swevent_init(struct perf_event *event)
4715{
4716 int event_id = event->attr.config;
4595 4717
4596static const struct pmu perf_ops_tracepoint = { 4718 if (event->attr.type != PERF_TYPE_SOFTWARE)
4597 .enable = perf_trace_enable, 4719 return -ENOENT;
4598 .disable = perf_trace_disable, 4720
4599 .start = perf_swevent_int, 4721 switch (event_id) {
4600 .stop = perf_swevent_void, 4722 case PERF_COUNT_SW_CPU_CLOCK:
4723 case PERF_COUNT_SW_TASK_CLOCK:
4724 return -ENOENT;
4725
4726 default:
4727 break;
4728 }
4729
4730 if (event_id > PERF_COUNT_SW_MAX)
4731 return -ENOENT;
4732
4733 if (!event->parent) {
4734 int err;
4735
4736 err = swevent_hlist_get(event);
4737 if (err)
4738 return err;
4739
4740 atomic_inc(&perf_swevent_enabled[event_id]);
4741 event->destroy = sw_perf_event_destroy;
4742 }
4743
4744 return 0;
4745}
4746
4747static struct pmu perf_swevent = {
4748 .task_ctx_nr = perf_sw_context,
4749
4750 .event_init = perf_swevent_init,
4751 .add = perf_swevent_add,
4752 .del = perf_swevent_del,
4753 .start = perf_swevent_start,
4754 .stop = perf_swevent_stop,
4601 .read = perf_swevent_read, 4755 .read = perf_swevent_read,
4602 .unthrottle = perf_swevent_void,
4603}; 4756};
4604 4757
4758#ifdef CONFIG_EVENT_TRACING
4759
4605static int perf_tp_filter_match(struct perf_event *event, 4760static int perf_tp_filter_match(struct perf_event *event,
4606 struct perf_sample_data *data) 4761 struct perf_sample_data *data)
4607{ 4762{
@@ -4645,7 +4800,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4645 4800
4646 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4801 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4647 if (perf_tp_event_match(event, &data, regs)) 4802 if (perf_tp_event_match(event, &data, regs))
4648 perf_swevent_add(event, count, 1, &data, regs); 4803 perf_swevent_event(event, count, 1, &data, regs);
4649 } 4804 }
4650 4805
4651 perf_swevent_put_recursion_context(rctx); 4806 perf_swevent_put_recursion_context(rctx);
@@ -4657,10 +4812,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4657 perf_trace_destroy(event); 4812 perf_trace_destroy(event);
4658} 4813}
4659 4814
4660static const struct pmu *tp_perf_event_init(struct perf_event *event) 4815static int perf_tp_event_init(struct perf_event *event)
4661{ 4816{
4662 int err; 4817 int err;
4663 4818
4819 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4820 return -ENOENT;
4821
4664 /* 4822 /*
4665 * Raw tracepoint data is a severe data leak, only allow root to 4823 * Raw tracepoint data is a severe data leak, only allow root to
4666 * have these. 4824 * have these.
@@ -4668,15 +4826,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4668 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4826 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4669 perf_paranoid_tracepoint_raw() && 4827 perf_paranoid_tracepoint_raw() &&
4670 !capable(CAP_SYS_ADMIN)) 4828 !capable(CAP_SYS_ADMIN))
4671 return ERR_PTR(-EPERM); 4829 return -EPERM;
4672 4830
4673 err = perf_trace_init(event); 4831 err = perf_trace_init(event);
4674 if (err) 4832 if (err)
4675 return NULL; 4833 return err;
4676 4834
4677 event->destroy = tp_perf_event_destroy; 4835 event->destroy = tp_perf_event_destroy;
4678 4836
4679 return &perf_ops_tracepoint; 4837 return 0;
4838}
4839
4840static struct pmu perf_tracepoint = {
4841 .task_ctx_nr = perf_sw_context,
4842
4843 .event_init = perf_tp_event_init,
4844 .add = perf_trace_add,
4845 .del = perf_trace_del,
4846 .start = perf_swevent_start,
4847 .stop = perf_swevent_stop,
4848 .read = perf_swevent_read,
4849};
4850
4851static inline void perf_tp_register(void)
4852{
4853 perf_pmu_register(&perf_tracepoint);
4680} 4854}
4681 4855
4682static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4856static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4704,9 +4878,8 @@ static void perf_event_free_filter(struct perf_event *event)
4704 4878
4705#else 4879#else
4706 4880
4707static const struct pmu *tp_perf_event_init(struct perf_event *event) 4881static inline void perf_tp_register(void)
4708{ 4882{
4709 return NULL;
4710} 4883}
4711 4884
4712static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4885static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4721,105 +4894,389 @@ static void perf_event_free_filter(struct perf_event *event)
4721#endif /* CONFIG_EVENT_TRACING */ 4894#endif /* CONFIG_EVENT_TRACING */
4722 4895
4723#ifdef CONFIG_HAVE_HW_BREAKPOINT 4896#ifdef CONFIG_HAVE_HW_BREAKPOINT
4724static void bp_perf_event_destroy(struct perf_event *event) 4897void perf_bp_event(struct perf_event *bp, void *data)
4725{ 4898{
4726 release_bp_slot(event); 4899 struct perf_sample_data sample;
4900 struct pt_regs *regs = data;
4901
4902 perf_sample_data_init(&sample, bp->attr.bp_addr);
4903
4904 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4905 perf_swevent_event(bp, 1, 1, &sample, regs);
4727} 4906}
4907#endif
4908
4909/*
4910 * hrtimer based swevent callback
4911 */
4728 4912
4729static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4913static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4730{ 4914{
4731 int err; 4915 enum hrtimer_restart ret = HRTIMER_RESTART;
4916 struct perf_sample_data data;
4917 struct pt_regs *regs;
4918 struct perf_event *event;
4919 u64 period;
4732 4920
4733 err = register_perf_hw_breakpoint(bp); 4921 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4734 if (err) 4922 event->pmu->read(event);
4735 return ERR_PTR(err);
4736 4923
4737 bp->destroy = bp_perf_event_destroy; 4924 perf_sample_data_init(&data, 0);
4925 data.period = event->hw.last_period;
4926 regs = get_irq_regs();
4927
4928 if (regs && !perf_exclude_event(event, regs)) {
4929 if (!(event->attr.exclude_idle && current->pid == 0))
4930 if (perf_event_overflow(event, 0, &data, regs))
4931 ret = HRTIMER_NORESTART;
4932 }
4738 4933
4739 return &perf_ops_bp; 4934 period = max_t(u64, 10000, event->hw.sample_period);
4935 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4936
4937 return ret;
4740} 4938}
4741 4939
4742void perf_bp_event(struct perf_event *bp, void *data) 4940static void perf_swevent_start_hrtimer(struct perf_event *event)
4743{ 4941{
4744 struct perf_sample_data sample; 4942 struct hw_perf_event *hwc = &event->hw;
4745 struct pt_regs *regs = data;
4746 4943
4747 perf_sample_data_init(&sample, bp->attr.bp_addr); 4944 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4945 hwc->hrtimer.function = perf_swevent_hrtimer;
4946 if (hwc->sample_period) {
4947 s64 period = local64_read(&hwc->period_left);
4748 4948
4749 if (!perf_exclude_event(bp, regs)) 4949 if (period) {
4750 perf_swevent_add(bp, 1, 1, &sample, regs); 4950 if (period < 0)
4951 period = 10000;
4952
4953 local64_set(&hwc->period_left, 0);
4954 } else {
4955 period = max_t(u64, 10000, hwc->sample_period);
4956 }
4957 __hrtimer_start_range_ns(&hwc->hrtimer,
4958 ns_to_ktime(period), 0,
4959 HRTIMER_MODE_REL_PINNED, 0);
4960 }
4751} 4961}
4752#else 4962
4753static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4963static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4754{ 4964{
4755 return NULL; 4965 struct hw_perf_event *hwc = &event->hw;
4966
4967 if (hwc->sample_period) {
4968 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4969 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4970
4971 hrtimer_cancel(&hwc->hrtimer);
4972 }
4756} 4973}
4757 4974
4758void perf_bp_event(struct perf_event *bp, void *regs) 4975/*
4976 * Software event: cpu wall time clock
4977 */
4978
4979static void cpu_clock_event_update(struct perf_event *event)
4759{ 4980{
4981 s64 prev;
4982 u64 now;
4983
4984 now = local_clock();
4985 prev = local64_xchg(&event->hw.prev_count, now);
4986 local64_add(now - prev, &event->count);
4760} 4987}
4761#endif
4762 4988
4763atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4989static void cpu_clock_event_start(struct perf_event *event, int flags)
4990{
4991 local64_set(&event->hw.prev_count, local_clock());
4992 perf_swevent_start_hrtimer(event);
4993}
4764 4994
4765static void sw_perf_event_destroy(struct perf_event *event) 4995static void cpu_clock_event_stop(struct perf_event *event, int flags)
4766{ 4996{
4767 u64 event_id = event->attr.config; 4997 perf_swevent_cancel_hrtimer(event);
4998 cpu_clock_event_update(event);
4999}
4768 5000
4769 WARN_ON(event->parent); 5001static int cpu_clock_event_add(struct perf_event *event, int flags)
5002{
5003 if (flags & PERF_EF_START)
5004 cpu_clock_event_start(event, flags);
4770 5005
4771 atomic_dec(&perf_swevent_enabled[event_id]); 5006 return 0;
4772 swevent_hlist_put(event);
4773} 5007}
4774 5008
4775static const struct pmu *sw_perf_event_init(struct perf_event *event) 5009static void cpu_clock_event_del(struct perf_event *event, int flags)
4776{ 5010{
4777 const struct pmu *pmu = NULL; 5011 cpu_clock_event_stop(event, flags);
4778 u64 event_id = event->attr.config; 5012}
5013
5014static void cpu_clock_event_read(struct perf_event *event)
5015{
5016 cpu_clock_event_update(event);
5017}
5018
5019static int cpu_clock_event_init(struct perf_event *event)
5020{
5021 if (event->attr.type != PERF_TYPE_SOFTWARE)
5022 return -ENOENT;
5023
5024 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5025 return -ENOENT;
5026
5027 return 0;
5028}
5029
5030static struct pmu perf_cpu_clock = {
5031 .task_ctx_nr = perf_sw_context,
5032
5033 .event_init = cpu_clock_event_init,
5034 .add = cpu_clock_event_add,
5035 .del = cpu_clock_event_del,
5036 .start = cpu_clock_event_start,
5037 .stop = cpu_clock_event_stop,
5038 .read = cpu_clock_event_read,
5039};
5040
5041/*
5042 * Software event: task time clock
5043 */
5044
5045static void task_clock_event_update(struct perf_event *event, u64 now)
5046{
5047 u64 prev;
5048 s64 delta;
5049
5050 prev = local64_xchg(&event->hw.prev_count, now);
5051 delta = now - prev;
5052 local64_add(delta, &event->count);
5053}
5054
5055static void task_clock_event_start(struct perf_event *event, int flags)
5056{
5057 local64_set(&event->hw.prev_count, event->ctx->time);
5058 perf_swevent_start_hrtimer(event);
5059}
5060
5061static void task_clock_event_stop(struct perf_event *event, int flags)
5062{
5063 perf_swevent_cancel_hrtimer(event);
5064 task_clock_event_update(event, event->ctx->time);
5065}
5066
5067static int task_clock_event_add(struct perf_event *event, int flags)
5068{
5069 if (flags & PERF_EF_START)
5070 task_clock_event_start(event, flags);
5071
5072 return 0;
5073}
5074
5075static void task_clock_event_del(struct perf_event *event, int flags)
5076{
5077 task_clock_event_stop(event, PERF_EF_UPDATE);
5078}
5079
5080static void task_clock_event_read(struct perf_event *event)
5081{
5082 u64 time;
5083
5084 if (!in_nmi()) {
5085 update_context_time(event->ctx);
5086 time = event->ctx->time;
5087 } else {
5088 u64 now = perf_clock();
5089 u64 delta = now - event->ctx->timestamp;
5090 time = event->ctx->time + delta;
5091 }
5092
5093 task_clock_event_update(event, time);
5094}
5095
5096static int task_clock_event_init(struct perf_event *event)
5097{
5098 if (event->attr.type != PERF_TYPE_SOFTWARE)
5099 return -ENOENT;
5100
5101 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5102 return -ENOENT;
5103
5104 return 0;
5105}
5106
5107static struct pmu perf_task_clock = {
5108 .task_ctx_nr = perf_sw_context,
5109
5110 .event_init = task_clock_event_init,
5111 .add = task_clock_event_add,
5112 .del = task_clock_event_del,
5113 .start = task_clock_event_start,
5114 .stop = task_clock_event_stop,
5115 .read = task_clock_event_read,
5116};
5117
5118static void perf_pmu_nop_void(struct pmu *pmu)
5119{
5120}
5121
5122static int perf_pmu_nop_int(struct pmu *pmu)
5123{
5124 return 0;
5125}
5126
5127static void perf_pmu_start_txn(struct pmu *pmu)
5128{
5129 perf_pmu_disable(pmu);
5130}
5131
5132static int perf_pmu_commit_txn(struct pmu *pmu)
5133{
5134 perf_pmu_enable(pmu);
5135 return 0;
5136}
4779 5137
5138static void perf_pmu_cancel_txn(struct pmu *pmu)
5139{
5140 perf_pmu_enable(pmu);
5141}
5142
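The stubs above let a pmu that only provides pmu_enable()/pmu_disable() still look transactional to the scheduler. A sketch of how the caller side is expected to drive these hooks when scheduling a whole group; struct txn_pmu and group_sched_in are illustrative stand-ins, not kernel symbols:

#include <stdio.h>

struct txn_pmu {
	void (*start_txn)(void);
	int  (*commit_txn)(void);
	void (*cancel_txn)(void);
	int  (*add)(int event);
};

static int group_sched_in(struct txn_pmu *pmu, const int *events, int n)
{
	int i;

	pmu->start_txn();
	for (i = 0; i < n; i++) {
		if (pmu->add(events[i])) {
			pmu->cancel_txn();	/* roll back the partial group */
			return -1;
		}
	}
	return pmu->commit_txn();	/* may still fail if over-committed */
}

/* Trivial backing implementation so the model links and runs. */
static void nop_void(void) { }
static int  nop_int(void)  { return 0; }
static int  add_ok(int event) { (void)event; return 0; }

int main(void)
{
	struct txn_pmu pmu = { nop_void, nop_int, nop_void, add_ok };
	int events[] = { 1, 2, 3 };

	printf("group_sched_in: %d\n", group_sched_in(&pmu, events, 3));
	return 0;
}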
5143/*
5144 * Ensures all contexts with the same task_ctx_nr have the same
5145 * pmu_cpu_context too.
5146 */
5147static void *find_pmu_context(int ctxn)
5148{
5149 struct pmu *pmu;
5150
5151 if (ctxn < 0)
5152 return NULL;
5153
5154 list_for_each_entry(pmu, &pmus, entry) {
5155 if (pmu->task_ctx_nr == ctxn)
5156 return pmu->pmu_cpu_context;
5157 }
5158
5159 return NULL;
5160}
5161
5162static void free_pmu_context(void * __percpu cpu_context)
5163{
5164 struct pmu *pmu;
5165
5166 mutex_lock(&pmus_lock);
4780 /* 5167 /*
4781 * Software events (currently) can't in general distinguish 5168 * Like a real lame refcount.
4782 * between user, kernel and hypervisor events.
4783 * However, context switches and cpu migrations are considered
4784 * to be kernel events, and page faults are never hypervisor
4785 * events.
4786 */ 5169 */
4787 switch (event_id) { 5170 list_for_each_entry(pmu, &pmus, entry) {
4788 case PERF_COUNT_SW_CPU_CLOCK: 5171 if (pmu->pmu_cpu_context == cpu_context)
4789 pmu = &perf_ops_cpu_clock; 5172 goto out;
5173 }
4790 5174
4791 break; 5175 free_percpu(cpu_context);
4792 case PERF_COUNT_SW_TASK_CLOCK: 5176out:
4793 /* 5177 mutex_unlock(&pmus_lock);
4794 * If the user instantiates this as a per-cpu event, 5178}
4795 * use the cpu_clock event instead.
4796 */
4797 if (event->ctx->task)
4798 pmu = &perf_ops_task_clock;
4799 else
4800 pmu = &perf_ops_cpu_clock;
4801 5179
4802 break; 5180int perf_pmu_register(struct pmu *pmu)
4803 case PERF_COUNT_SW_PAGE_FAULTS: 5181{
4804 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5182 int cpu, ret;
4805 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4806 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4807 case PERF_COUNT_SW_CPU_MIGRATIONS:
4808 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4809 case PERF_COUNT_SW_EMULATION_FAULTS:
4810 if (!event->parent) {
4811 int err;
4812
4813 err = swevent_hlist_get(event);
4814 if (err)
4815 return ERR_PTR(err);
4816 5183
4817 atomic_inc(&perf_swevent_enabled[event_id]); 5184 mutex_lock(&pmus_lock);
4818 event->destroy = sw_perf_event_destroy; 5185 ret = -ENOMEM;
5186 pmu->pmu_disable_count = alloc_percpu(int);
5187 if (!pmu->pmu_disable_count)
5188 goto unlock;
5189
5190 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5191 if (pmu->pmu_cpu_context)
5192 goto got_cpu_context;
5193
5194 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5195 if (!pmu->pmu_cpu_context)
5196 goto free_pdc;
5197
5198 for_each_possible_cpu(cpu) {
5199 struct perf_cpu_context *cpuctx;
5200
5201 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5202 __perf_event_init_context(&cpuctx->ctx);
5203 cpuctx->ctx.type = cpu_context;
5204 cpuctx->ctx.pmu = pmu;
5205 cpuctx->jiffies_interval = 1;
5206 INIT_LIST_HEAD(&cpuctx->rotation_list);
5207 }
5208
5209got_cpu_context:
5210 if (!pmu->start_txn) {
5211 if (pmu->pmu_enable) {
5212 /*
5213 * If we have pmu_enable/pmu_disable calls, install
5214 * transaction stubs that use that to try and batch
5215 * hardware accesses.
5216 */
5217 pmu->start_txn = perf_pmu_start_txn;
5218 pmu->commit_txn = perf_pmu_commit_txn;
5219 pmu->cancel_txn = perf_pmu_cancel_txn;
5220 } else {
5221 pmu->start_txn = perf_pmu_nop_void;
5222 pmu->commit_txn = perf_pmu_nop_int;
5223 pmu->cancel_txn = perf_pmu_nop_void;
5224 }
5225 }
5226
5227 if (!pmu->pmu_enable) {
5228 pmu->pmu_enable = perf_pmu_nop_void;
5229 pmu->pmu_disable = perf_pmu_nop_void;
5230 }
5231
5232 list_add_rcu(&pmu->entry, &pmus);
5233 ret = 0;
5234unlock:
5235 mutex_unlock(&pmus_lock);
5236
5237 return ret;
5238
5239free_pdc:
5240 free_percpu(pmu->pmu_disable_count);
5241 goto unlock;
5242}
5243
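perf_pmu_register(), together with perf_init_event() below, establishes the new discovery convention: each pmu's ->event_init() returns -ENOENT for event types it does not own, and the core keeps walking the pmu list until one claims the event or a hard error stops the search. A standalone model of that walk, with invented type numbers and names:

#include <errno.h>
#include <stddef.h>

struct event { int type; };
struct pmu { int type; int (*event_init)(struct pmu *, struct event *); };

static int generic_init(struct pmu *p, struct event *e)
{
	if (e->type != p->type)
		return -ENOENT;		/* not ours, try the next pmu */
	return 0;
}

static struct pmu pmus[] = {
	{ 0, generic_init },	/* e.g. hardware */
	{ 1, generic_init },	/* e.g. software */
};

static struct pmu *init_event(struct event *e)
{
	size_t i;

	for (i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
		int ret = pmus[i].event_init(&pmus[i], e);
		if (!ret)
			return &pmus[i];
		if (ret != -ENOENT)
			return NULL;	/* the pmu claimed it but failed */
	}
	return NULL;
}

int main(void)
{
	struct event e = { 1 };

	return init_event(&e) ? 0 : 1;
}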
5244void perf_pmu_unregister(struct pmu *pmu)
5245{
5246 mutex_lock(&pmus_lock);
5247 list_del_rcu(&pmu->entry);
5248 mutex_unlock(&pmus_lock);
5249
5250 /*
5251 * We dereference the pmu list under both SRCU and regular RCU, so
5252 * synchronize against both of those.
5253 */
5254 synchronize_srcu(&pmus_srcu);
5255 synchronize_rcu();
5256
5257 free_percpu(pmu->pmu_disable_count);
5258 free_pmu_context(pmu->pmu_cpu_context);
5259}
5260
5261struct pmu *perf_init_event(struct perf_event *event)
5262{
5263 struct pmu *pmu = NULL;
5264 int idx;
5265
5266 idx = srcu_read_lock(&pmus_srcu);
5267 list_for_each_entry_rcu(pmu, &pmus, entry) {
5268 int ret = pmu->event_init(event);
5269 if (!ret)
5270 goto unlock;
5271
5272 if (ret != -ENOENT) {
5273 pmu = ERR_PTR(ret);
5274 goto unlock;
4819 } 5275 }
4820 pmu = &perf_ops_generic;
4821 break;
4822 } 5276 }
5277 pmu = ERR_PTR(-ENOENT);
5278unlock:
5279 srcu_read_unlock(&pmus_srcu, idx);
4823 5280
4824 return pmu; 5281 return pmu;
4825} 5282}
@@ -4828,20 +5285,17 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4828 * Allocate and initialize a event structure 5285 * Allocate and initialize a event structure
4829 */ 5286 */
4830static struct perf_event * 5287static struct perf_event *
4831perf_event_alloc(struct perf_event_attr *attr, 5288perf_event_alloc(struct perf_event_attr *attr, int cpu,
4832 int cpu,
4833 struct perf_event_context *ctx,
4834 struct perf_event *group_leader, 5289 struct perf_event *group_leader,
4835 struct perf_event *parent_event, 5290 struct perf_event *parent_event,
4836 perf_overflow_handler_t overflow_handler, 5291 perf_overflow_handler_t overflow_handler)
4837 gfp_t gfpflags)
4838{ 5292{
4839 const struct pmu *pmu; 5293 struct pmu *pmu;
4840 struct perf_event *event; 5294 struct perf_event *event;
4841 struct hw_perf_event *hwc; 5295 struct hw_perf_event *hwc;
4842 long err; 5296 long err;
4843 5297
4844 event = kzalloc(sizeof(*event), gfpflags); 5298 event = kzalloc(sizeof(*event), GFP_KERNEL);
4845 if (!event) 5299 if (!event)
4846 return ERR_PTR(-ENOMEM); 5300 return ERR_PTR(-ENOMEM);
4847 5301
@@ -4866,7 +5320,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4866 event->attr = *attr; 5320 event->attr = *attr;
4867 event->group_leader = group_leader; 5321 event->group_leader = group_leader;
4868 event->pmu = NULL; 5322 event->pmu = NULL;
4869 event->ctx = ctx;
4870 event->oncpu = -1; 5323 event->oncpu = -1;
4871 5324
4872 event->parent = parent_event; 5325 event->parent = parent_event;
@@ -4900,29 +5353,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4900 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5353 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4901 goto done; 5354 goto done;
4902 5355
4903 switch (attr->type) { 5356 pmu = perf_init_event(event);
4904 case PERF_TYPE_RAW:
4905 case PERF_TYPE_HARDWARE:
4906 case PERF_TYPE_HW_CACHE:
4907 pmu = hw_perf_event_init(event);
4908 break;
4909 5357
4910 case PERF_TYPE_SOFTWARE:
4911 pmu = sw_perf_event_init(event);
4912 break;
4913
4914 case PERF_TYPE_TRACEPOINT:
4915 pmu = tp_perf_event_init(event);
4916 break;
4917
4918 case PERF_TYPE_BREAKPOINT:
4919 pmu = bp_perf_event_init(event);
4920 break;
4921
4922
4923 default:
4924 break;
4925 }
4926done: 5358done:
4927 err = 0; 5359 err = 0;
4928 if (!pmu) 5360 if (!pmu)
@@ -4947,6 +5379,13 @@ done:
4947 atomic_inc(&nr_comm_events); 5379 atomic_inc(&nr_comm_events);
4948 if (event->attr.task) 5380 if (event->attr.task)
4949 atomic_inc(&nr_task_events); 5381 atomic_inc(&nr_task_events);
5382 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5383 err = get_callchain_buffers();
5384 if (err) {
5385 free_event(event);
5386 return ERR_PTR(err);
5387 }
5388 }
4950 } 5389 }
4951 5390
4952 return event; 5391 return event;
@@ -5094,12 +5533,16 @@ SYSCALL_DEFINE5(perf_event_open,
5094 struct perf_event_attr __user *, attr_uptr, 5533 struct perf_event_attr __user *, attr_uptr,
5095 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5534 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5096{ 5535{
5097 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5536 struct perf_event *group_leader = NULL, *output_event = NULL;
5537 struct perf_event *event, *sibling;
5098 struct perf_event_attr attr; 5538 struct perf_event_attr attr;
5099 struct perf_event_context *ctx; 5539 struct perf_event_context *ctx;
5100 struct file *event_file = NULL; 5540 struct file *event_file = NULL;
5101 struct file *group_file = NULL; 5541 struct file *group_file = NULL;
5542 struct task_struct *task = NULL;
5543 struct pmu *pmu;
5102 int event_fd; 5544 int event_fd;
5545 int move_group = 0;
5103 int fput_needed = 0; 5546 int fput_needed = 0;
5104 int err; 5547 int err;
5105 5548
@@ -5125,20 +5568,11 @@ SYSCALL_DEFINE5(perf_event_open,
5125 if (event_fd < 0) 5568 if (event_fd < 0)
5126 return event_fd; 5569 return event_fd;
5127 5570
5128 /*
5129 * Get the target context (task or percpu):
5130 */
5131 ctx = find_get_context(pid, cpu);
5132 if (IS_ERR(ctx)) {
5133 err = PTR_ERR(ctx);
5134 goto err_fd;
5135 }
5136
5137 if (group_fd != -1) { 5571 if (group_fd != -1) {
5138 group_leader = perf_fget_light(group_fd, &fput_needed); 5572 group_leader = perf_fget_light(group_fd, &fput_needed);
5139 if (IS_ERR(group_leader)) { 5573 if (IS_ERR(group_leader)) {
5140 err = PTR_ERR(group_leader); 5574 err = PTR_ERR(group_leader);
5141 goto err_put_context; 5575 goto err_fd;
5142 } 5576 }
5143 group_file = group_leader->filp; 5577 group_file = group_leader->filp;
5144 if (flags & PERF_FLAG_FD_OUTPUT) 5578 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5147,6 +5581,58 @@ SYSCALL_DEFINE5(perf_event_open,
5147 group_leader = NULL; 5581 group_leader = NULL;
5148 } 5582 }
5149 5583
5584 event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
5585 if (IS_ERR(event)) {
5586 err = PTR_ERR(event);
5587 goto err_fd;
5588 }
5589
5590 /*
5591 * Special case software events and allow them to be part of
5592 * any hardware group.
5593 */
5594 pmu = event->pmu;
5595
5596 if (group_leader &&
5597 (is_software_event(event) != is_software_event(group_leader))) {
5598 if (is_software_event(event)) {
5599 /*
5600 * If event and group_leader are not both a software
5601 * event, and event is, then group leader is not.
5602 *
5603 * Allow the addition of software events to !software
5604 * groups, this is safe because software events never
5605 * fail to schedule.
5606 */
5607 pmu = group_leader->pmu;
5608 } else if (is_software_event(group_leader) &&
5609 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5610 /*
5611 * In case the group is a pure software group, and we
5612 * try to add a hardware event, move the whole group to
5613 * the hardware context.
5614 */
5615 move_group = 1;
5616 }
5617 }
5618
5619 if (pid != -1) {
5620 task = find_lively_task_by_vpid(pid);
5621 if (IS_ERR(task)) {
5622 err = PTR_ERR(task);
5623 goto err_group_fd;
5624 }
5625 }
5626
5627 /*
5628 * Get the target context (task or percpu):
5629 */
5630 ctx = find_get_context(pmu, task, cpu);
5631 if (IS_ERR(ctx)) {
5632 err = PTR_ERR(ctx);
5633 goto err_group_fd;
5634 }
5635
5150 /* 5636 /*
5151 * Look up the group leader (we will attach this event to it): 5637 * Look up the group leader (we will attach this event to it):
5152 */ 5638 */
@@ -5158,42 +5644,66 @@ SYSCALL_DEFINE5(perf_event_open,
5158 * becoming part of another group-sibling): 5644 * becoming part of another group-sibling):
5159 */ 5645 */
5160 if (group_leader->group_leader != group_leader) 5646 if (group_leader->group_leader != group_leader)
5161 goto err_put_context; 5647 goto err_context;
5162 /* 5648 /*
5163 * Do not allow to attach to a group in a different 5649 * Do not allow to attach to a group in a different
5164 * task or CPU context: 5650 * task or CPU context:
5165 */ 5651 */
5166 if (group_leader->ctx != ctx) 5652 if (move_group) {
5167 goto err_put_context; 5653 if (group_leader->ctx->type != ctx->type)
5654 goto err_context;
5655 } else {
5656 if (group_leader->ctx != ctx)
5657 goto err_context;
5658 }
5659
5168 /* 5660 /*
5169 * Only a group leader can be exclusive or pinned 5661 * Only a group leader can be exclusive or pinned
5170 */ 5662 */
5171 if (attr.exclusive || attr.pinned) 5663 if (attr.exclusive || attr.pinned)
5172 goto err_put_context; 5664 goto err_context;
5173 }
5174
5175 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5176 NULL, NULL, GFP_KERNEL);
5177 if (IS_ERR(event)) {
5178 err = PTR_ERR(event);
5179 goto err_put_context;
5180 } 5665 }
5181 5666
5182 if (output_event) { 5667 if (output_event) {
5183 err = perf_event_set_output(event, output_event); 5668 err = perf_event_set_output(event, output_event);
5184 if (err) 5669 if (err)
5185 goto err_free_put_context; 5670 goto err_context;
5186 } 5671 }
5187 5672
5188 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5673 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5189 if (IS_ERR(event_file)) { 5674 if (IS_ERR(event_file)) {
5190 err = PTR_ERR(event_file); 5675 err = PTR_ERR(event_file);
5191 goto err_free_put_context; 5676 goto err_context;
5677 }
5678
5679 if (move_group) {
5680 struct perf_event_context *gctx = group_leader->ctx;
5681
5682 mutex_lock(&gctx->mutex);
5683 perf_event_remove_from_context(group_leader);
5684 list_for_each_entry(sibling, &group_leader->sibling_list,
5685 group_entry) {
5686 perf_event_remove_from_context(sibling);
5687 put_ctx(gctx);
5688 }
5689 mutex_unlock(&gctx->mutex);
5690 put_ctx(gctx);
5192 } 5691 }
5193 5692
5194 event->filp = event_file; 5693 event->filp = event_file;
5195 WARN_ON_ONCE(ctx->parent_ctx); 5694 WARN_ON_ONCE(ctx->parent_ctx);
5196 mutex_lock(&ctx->mutex); 5695 mutex_lock(&ctx->mutex);
5696
5697 if (move_group) {
5698 perf_install_in_context(ctx, group_leader, cpu);
5699 get_ctx(ctx);
5700 list_for_each_entry(sibling, &group_leader->sibling_list,
5701 group_entry) {
5702 perf_install_in_context(ctx, sibling, cpu);
5703 get_ctx(ctx);
5704 }
5705 }
5706
5197 perf_install_in_context(ctx, event, cpu); 5707 perf_install_in_context(ctx, event, cpu);
5198 ++ctx->generation; 5708 ++ctx->generation;
5199 mutex_unlock(&ctx->mutex); 5709 mutex_unlock(&ctx->mutex);
@@ -5214,11 +5724,11 @@ SYSCALL_DEFINE5(perf_event_open,
5214 fd_install(event_fd, event_file); 5724 fd_install(event_fd, event_file);
5215 return event_fd; 5725 return event_fd;
5216 5726
5217err_free_put_context: 5727err_context:
5218 free_event(event);
5219err_put_context:
5220 fput_light(group_file, fput_needed);
5221 put_ctx(ctx); 5728 put_ctx(ctx);
5729err_group_fd:
5730 fput_light(group_file, fput_needed);
5731 free_event(event);
5222err_fd: 5732err_fd:
5223 put_unused_fd(event_fd); 5733 put_unused_fd(event_fd);
5224 return err; 5734 return err;
@@ -5229,32 +5739,31 @@ err_fd:
5229 * 5739 *
5230 * @attr: attributes of the counter to create 5740 * @attr: attributes of the counter to create
5231 * @cpu: cpu in which the counter is bound 5741 * @cpu: cpu in which the counter is bound
5232 * @pid: task to profile 5742 * @task: task to profile (NULL for percpu)
5233 */ 5743 */
5234struct perf_event * 5744struct perf_event *
5235perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5745perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5236 pid_t pid, 5746 struct task_struct *task,
5237 perf_overflow_handler_t overflow_handler) 5747 perf_overflow_handler_t overflow_handler)
5238{ 5748{
5239 struct perf_event *event;
5240 struct perf_event_context *ctx; 5749 struct perf_event_context *ctx;
5750 struct perf_event *event;
5241 int err; 5751 int err;
5242 5752
5243 /* 5753 /*
5244 * Get the target context (task or percpu): 5754 * Get the target context (task or percpu):
5245 */ 5755 */
5246 5756
5247 ctx = find_get_context(pid, cpu); 5757 event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
5248 if (IS_ERR(ctx)) {
5249 err = PTR_ERR(ctx);
5250 goto err_exit;
5251 }
5252
5253 event = perf_event_alloc(attr, cpu, ctx, NULL,
5254 NULL, overflow_handler, GFP_KERNEL);
5255 if (IS_ERR(event)) { 5758 if (IS_ERR(event)) {
5256 err = PTR_ERR(event); 5759 err = PTR_ERR(event);
5257 goto err_put_context; 5760 goto err;
5761 }
5762
5763 ctx = find_get_context(event->pmu, task, cpu);
5764 if (IS_ERR(ctx)) {
5765 err = PTR_ERR(ctx);
5766 goto err_free;
5258 } 5767 }
5259 5768
5260 event->filp = NULL; 5769 event->filp = NULL;
@@ -5272,112 +5781,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5272 5781
5273 return event; 5782 return event;
5274 5783
5275 err_put_context: 5784err_free:
5276 put_ctx(ctx); 5785 free_event(event);
5277 err_exit: 5786err:
5278 return ERR_PTR(err); 5787 return ERR_PTR(err);
5279} 5788}
5280EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5789EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5281 5790
5282/*
5283 * inherit an event from parent task to child task:
5284 */
5285static struct perf_event *
5286inherit_event(struct perf_event *parent_event,
5287 struct task_struct *parent,
5288 struct perf_event_context *parent_ctx,
5289 struct task_struct *child,
5290 struct perf_event *group_leader,
5291 struct perf_event_context *child_ctx)
5292{
5293 struct perf_event *child_event;
5294
5295 /*
5296 * Instead of creating recursive hierarchies of events,
5297 * we link inherited events back to the original parent,
5298 * which has a filp for sure, which we use as the reference
5299 * count:
5300 */
5301 if (parent_event->parent)
5302 parent_event = parent_event->parent;
5303
5304 child_event = perf_event_alloc(&parent_event->attr,
5305 parent_event->cpu, child_ctx,
5306 group_leader, parent_event,
5307 NULL, GFP_KERNEL);
5308 if (IS_ERR(child_event))
5309 return child_event;
5310 get_ctx(child_ctx);
5311
5312 /*
5313 * Make the child state follow the state of the parent event,
5314 * not its attr.disabled bit. We hold the parent's mutex,
5315 * so we won't race with perf_event_{en, dis}able_family.
5316 */
5317 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5318 child_event->state = PERF_EVENT_STATE_INACTIVE;
5319 else
5320 child_event->state = PERF_EVENT_STATE_OFF;
5321
5322 if (parent_event->attr.freq) {
5323 u64 sample_period = parent_event->hw.sample_period;
5324 struct hw_perf_event *hwc = &child_event->hw;
5325
5326 hwc->sample_period = sample_period;
5327 hwc->last_period = sample_period;
5328
5329 local64_set(&hwc->period_left, sample_period);
5330 }
5331
5332 child_event->overflow_handler = parent_event->overflow_handler;
5333
5334 /*
5335 * Link it up in the child's context:
5336 */
5337 add_event_to_ctx(child_event, child_ctx);
5338
5339 /*
5340 * Get a reference to the parent filp - we will fput it
5341 * when the child event exits. This is safe to do because
5342 * we are in the parent and we know that the filp still
5343 * exists and has a nonzero count:
5344 */
5345 atomic_long_inc(&parent_event->filp->f_count);
5346
5347 /*
5348 * Link this into the parent event's child list
5349 */
5350 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5351 mutex_lock(&parent_event->child_mutex);
5352 list_add_tail(&child_event->child_list, &parent_event->child_list);
5353 mutex_unlock(&parent_event->child_mutex);
5354
5355 return child_event;
5356}
5357
5358static int inherit_group(struct perf_event *parent_event,
5359 struct task_struct *parent,
5360 struct perf_event_context *parent_ctx,
5361 struct task_struct *child,
5362 struct perf_event_context *child_ctx)
5363{
5364 struct perf_event *leader;
5365 struct perf_event *sub;
5366 struct perf_event *child_ctr;
5367
5368 leader = inherit_event(parent_event, parent, parent_ctx,
5369 child, NULL, child_ctx);
5370 if (IS_ERR(leader))
5371 return PTR_ERR(leader);
5372 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5373 child_ctr = inherit_event(sub, parent, parent_ctx,
5374 child, leader, child_ctx);
5375 if (IS_ERR(child_ctr))
5376 return PTR_ERR(child_ctr);
5377 }
5378 return 0;
5379}
5380
5381static void sync_child_event(struct perf_event *child_event, 5791static void sync_child_event(struct perf_event *child_event,
5382 struct task_struct *child) 5792 struct task_struct *child)
5383{ 5793{
@@ -5434,16 +5844,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5434 } 5844 }
5435} 5845}
5436 5846
5437/* 5847static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5438 * When a child task exits, feed back event values to parent events.
5439 */
5440void perf_event_exit_task(struct task_struct *child)
5441{ 5848{
5442 struct perf_event *child_event, *tmp; 5849 struct perf_event *child_event, *tmp;
5443 struct perf_event_context *child_ctx; 5850 struct perf_event_context *child_ctx;
5444 unsigned long flags; 5851 unsigned long flags;
5445 5852
5446 if (likely(!child->perf_event_ctxp)) { 5853 if (likely(!child->perf_event_ctxp[ctxn])) {
5447 perf_event_task(child, NULL, 0); 5854 perf_event_task(child, NULL, 0);
5448 return; 5855 return;
5449 } 5856 }
@@ -5455,7 +5862,7 @@ void perf_event_exit_task(struct task_struct *child)
5455 * scheduled, so we are now safe from rescheduling changing 5862 * scheduled, so we are now safe from rescheduling changing
5456 * our context. 5863 * our context.
5457 */ 5864 */
5458 child_ctx = child->perf_event_ctxp; 5865 child_ctx = child->perf_event_ctxp[ctxn];
5459 __perf_event_task_sched_out(child_ctx); 5866 __perf_event_task_sched_out(child_ctx);
5460 5867
5461 /* 5868 /*
@@ -5464,7 +5871,7 @@ void perf_event_exit_task(struct task_struct *child)
5464 * incremented the context's refcount before we do put_ctx below. 5871 * incremented the context's refcount before we do put_ctx below.
5465 */ 5872 */
5466 raw_spin_lock(&child_ctx->lock); 5873 raw_spin_lock(&child_ctx->lock);
5467 child->perf_event_ctxp = NULL; 5874 child->perf_event_ctxp[ctxn] = NULL;
5468 /* 5875 /*
5469 * If this context is a clone; unclone it so it can't get 5876 * If this context is a clone; unclone it so it can't get
5470 * swapped to another process while we're removing all 5877 * swapped to another process while we're removing all
@@ -5517,6 +5924,17 @@ again:
5517 put_ctx(child_ctx); 5924 put_ctx(child_ctx);
5518} 5925}
5519 5926
5927/*
5928 * When a child task exits, feed back event values to parent events.
5929 */
5930void perf_event_exit_task(struct task_struct *child)
5931{
5932 int ctxn;
5933
5934 for_each_task_context_nr(ctxn)
5935 perf_event_exit_task_context(child, ctxn);
5936}
5937
5520static void perf_free_event(struct perf_event *event, 5938static void perf_free_event(struct perf_event *event,
5521 struct perf_event_context *ctx) 5939 struct perf_event_context *ctx)
5522{ 5940{
@@ -5538,48 +5956,165 @@ static void perf_free_event(struct perf_event *event,
5538 5956
5539/* 5957/*
5540 * free an unexposed, unused context as created by inheritance by 5958 * free an unexposed, unused context as created by inheritance by
5541 * init_task below, used by fork() in case of fail. 5959 * perf_event_init_task below, used by fork() in case of fail.
5542 */ 5960 */
5543void perf_event_free_task(struct task_struct *task) 5961void perf_event_free_task(struct task_struct *task)
5544{ 5962{
5545 struct perf_event_context *ctx = task->perf_event_ctxp; 5963 struct perf_event_context *ctx;
5546 struct perf_event *event, *tmp; 5964 struct perf_event *event, *tmp;
5965 int ctxn;
5547 5966
5548 if (!ctx) 5967 for_each_task_context_nr(ctxn) {
5549 return; 5968 ctx = task->perf_event_ctxp[ctxn];
5969 if (!ctx)
5970 continue;
5550 5971
5551 mutex_lock(&ctx->mutex); 5972 mutex_lock(&ctx->mutex);
5552again: 5973again:
5553 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5974 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5554 perf_free_event(event, ctx); 5975 group_entry)
5976 perf_free_event(event, ctx);
5555 5977
5556 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5978 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5557 group_entry) 5979 group_entry)
5558 perf_free_event(event, ctx); 5980 perf_free_event(event, ctx);
5559 5981
5560 if (!list_empty(&ctx->pinned_groups) || 5982 if (!list_empty(&ctx->pinned_groups) ||
5561 !list_empty(&ctx->flexible_groups)) 5983 !list_empty(&ctx->flexible_groups))
5562 goto again; 5984 goto again;
5563 5985
5564 mutex_unlock(&ctx->mutex); 5986 mutex_unlock(&ctx->mutex);
5565 5987
5566 put_ctx(ctx); 5988 put_ctx(ctx);
5989 }
5990}
5991
5992void perf_event_delayed_put(struct task_struct *task)
5993{
5994 int ctxn;
5995
5996 for_each_task_context_nr(ctxn)
5997 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
5998}
5999
6000/*
6001 * inherit an event from parent task to child task:
6002 */
6003static struct perf_event *
6004inherit_event(struct perf_event *parent_event,
6005 struct task_struct *parent,
6006 struct perf_event_context *parent_ctx,
6007 struct task_struct *child,
6008 struct perf_event *group_leader,
6009 struct perf_event_context *child_ctx)
6010{
6011 struct perf_event *child_event;
6012 unsigned long flags;
6013
6014 /*
6015 * Instead of creating recursive hierarchies of events,
6016 * we link inherited events back to the original parent,
6017 * which has a filp for sure, which we use as the reference
6018 * count:
6019 */
6020 if (parent_event->parent)
6021 parent_event = parent_event->parent;
6022
6023 child_event = perf_event_alloc(&parent_event->attr,
6024 parent_event->cpu,
6025 group_leader, parent_event,
6026 NULL);
6027 if (IS_ERR(child_event))
6028 return child_event;
6029 get_ctx(child_ctx);
6030
6031 /*
6032 * Make the child state follow the state of the parent event,
6033 * not its attr.disabled bit. We hold the parent's mutex,
6034 * so we won't race with perf_event_{en, dis}able_family.
6035 */
6036 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6037 child_event->state = PERF_EVENT_STATE_INACTIVE;
6038 else
6039 child_event->state = PERF_EVENT_STATE_OFF;
6040
6041 if (parent_event->attr.freq) {
6042 u64 sample_period = parent_event->hw.sample_period;
6043 struct hw_perf_event *hwc = &child_event->hw;
6044
6045 hwc->sample_period = sample_period;
6046 hwc->last_period = sample_period;
6047
6048 local64_set(&hwc->period_left, sample_period);
6049 }
6050
6051 child_event->ctx = child_ctx;
6052 child_event->overflow_handler = parent_event->overflow_handler;
6053
6054 /*
6055 * Link it up in the child's context:
6056 */
6057 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6058 add_event_to_ctx(child_event, child_ctx);
6059 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6060
6061 /*
6062 * Get a reference to the parent filp - we will fput it
6063 * when the child event exits. This is safe to do because
6064 * we are in the parent and we know that the filp still
6065 * exists and has a nonzero count:
6066 */
6067 atomic_long_inc(&parent_event->filp->f_count);
6068
6069 /*
6070 * Link this into the parent event's child list
6071 */
6072 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6073 mutex_lock(&parent_event->child_mutex);
6074 list_add_tail(&child_event->child_list, &parent_event->child_list);
6075 mutex_unlock(&parent_event->child_mutex);
6076
6077 return child_event;
6078}
6079
6080static int inherit_group(struct perf_event *parent_event,
6081 struct task_struct *parent,
6082 struct perf_event_context *parent_ctx,
6083 struct task_struct *child,
6084 struct perf_event_context *child_ctx)
6085{
6086 struct perf_event *leader;
6087 struct perf_event *sub;
6088 struct perf_event *child_ctr;
6089
6090 leader = inherit_event(parent_event, parent, parent_ctx,
6091 child, NULL, child_ctx);
6092 if (IS_ERR(leader))
6093 return PTR_ERR(leader);
6094 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6095 child_ctr = inherit_event(sub, parent, parent_ctx,
6096 child, leader, child_ctx);
6097 if (IS_ERR(child_ctr))
6098 return PTR_ERR(child_ctr);
6099 }
6100 return 0;
5567} 6101}
5568 6102
5569static int 6103static int
5570inherit_task_group(struct perf_event *event, struct task_struct *parent, 6104inherit_task_group(struct perf_event *event, struct task_struct *parent,
5571 struct perf_event_context *parent_ctx, 6105 struct perf_event_context *parent_ctx,
5572 struct task_struct *child, 6106 struct task_struct *child, int ctxn,
5573 int *inherited_all) 6107 int *inherited_all)
5574{ 6108{
5575 int ret; 6109 int ret;
5576 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6110 struct perf_event_context *child_ctx;
5577 6111
5578 if (!event->attr.inherit) { 6112 if (!event->attr.inherit) {
5579 *inherited_all = 0; 6113 *inherited_all = 0;
5580 return 0; 6114 return 0;
5581 } 6115 }
5582 6116
6117 child_ctx = child->perf_event_ctxp[ctxn];
5583 if (!child_ctx) { 6118 if (!child_ctx) {
5584 /* 6119 /*
5585 * This is executed from the parent task context, so 6120 * This is executed from the parent task context, so
@@ -5588,14 +6123,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5588 * child. 6123 * child.
5589 */ 6124 */
5590 6125
5591 child_ctx = kzalloc(sizeof(struct perf_event_context), 6126 child_ctx = alloc_perf_context(event->pmu, child);
5592 GFP_KERNEL);
5593 if (!child_ctx) 6127 if (!child_ctx)
5594 return -ENOMEM; 6128 return -ENOMEM;
5595 6129
5596 __perf_event_init_context(child_ctx, child); 6130 child->perf_event_ctxp[ctxn] = child_ctx;
5597 child->perf_event_ctxp = child_ctx;
5598 get_task_struct(child);
5599 } 6131 }
5600 6132
5601 ret = inherit_group(event, parent, parent_ctx, 6133 ret = inherit_group(event, parent, parent_ctx,
@@ -5607,11 +6139,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5607 return ret; 6139 return ret;
5608} 6140}
5609 6141
5610
5611/* 6142/*
5612 * Initialize the perf_event context in task_struct 6143 * Initialize the perf_event context in task_struct
5613 */ 6144 */
5614int perf_event_init_task(struct task_struct *child) 6145int perf_event_init_context(struct task_struct *child, int ctxn)
5615{ 6146{
5616 struct perf_event_context *child_ctx, *parent_ctx; 6147 struct perf_event_context *child_ctx, *parent_ctx;
5617 struct perf_event_context *cloned_ctx; 6148 struct perf_event_context *cloned_ctx;
@@ -5620,19 +6151,19 @@ int perf_event_init_task(struct task_struct *child)
5620 int inherited_all = 1; 6151 int inherited_all = 1;
5621 int ret = 0; 6152 int ret = 0;
5622 6153
5623 child->perf_event_ctxp = NULL; 6154 child->perf_event_ctxp[ctxn] = NULL;
5624 6155
5625 mutex_init(&child->perf_event_mutex); 6156 mutex_init(&child->perf_event_mutex);
5626 INIT_LIST_HEAD(&child->perf_event_list); 6157 INIT_LIST_HEAD(&child->perf_event_list);
5627 6158
5628 if (likely(!parent->perf_event_ctxp)) 6159 if (likely(!parent->perf_event_ctxp[ctxn]))
5629 return 0; 6160 return 0;
5630 6161
5631 /* 6162 /*
5632 * If the parent's context is a clone, pin it so it won't get 6163 * If the parent's context is a clone, pin it so it won't get
5633 * swapped under us. 6164 * swapped under us.
5634 */ 6165 */
5635 parent_ctx = perf_pin_task_context(parent); 6166 parent_ctx = perf_pin_task_context(parent, ctxn);
5636 6167
5637 /* 6168 /*
5638 * No need to check if parent_ctx != NULL here; since we saw 6169 * No need to check if parent_ctx != NULL here; since we saw
@@ -5652,20 +6183,20 @@ int perf_event_init_task(struct task_struct *child)
5652 * the list, not manipulating it: 6183 * the list, not manipulating it:
5653 */ 6184 */
5654 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6185 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5655 ret = inherit_task_group(event, parent, parent_ctx, child, 6186 ret = inherit_task_group(event, parent, parent_ctx,
5656 &inherited_all); 6187 child, ctxn, &inherited_all);
5657 if (ret) 6188 if (ret)
5658 break; 6189 break;
5659 } 6190 }
5660 6191
5661 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6192 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5662 ret = inherit_task_group(event, parent, parent_ctx, child, 6193 ret = inherit_task_group(event, parent, parent_ctx,
5663 &inherited_all); 6194 child, ctxn, &inherited_all);
5664 if (ret) 6195 if (ret)
5665 break; 6196 break;
5666 } 6197 }
5667 6198
5668 child_ctx = child->perf_event_ctxp; 6199 child_ctx = child->perf_event_ctxp[ctxn];
5669 6200
5670 if (child_ctx && inherited_all) { 6201 if (child_ctx && inherited_all) {
5671 /* 6202 /*
@@ -5694,63 +6225,98 @@ int perf_event_init_task(struct task_struct *child)
5694 return ret; 6225 return ret;
5695} 6226}
5696 6227
6228/*
6229 * Initialize the perf_event context in task_struct
6230 */
6231int perf_event_init_task(struct task_struct *child)
6232{
6233 int ctxn, ret;
6234
6235 for_each_task_context_nr(ctxn) {
6236 ret = perf_event_init_context(child, ctxn);
6237 if (ret)
6238 return ret;
6239 }
6240
6241 return 0;
6242}
6243
5697static void __init perf_event_init_all_cpus(void) 6244static void __init perf_event_init_all_cpus(void)
5698{ 6245{
6246 struct swevent_htable *swhash;
5699 int cpu; 6247 int cpu;
5700 struct perf_cpu_context *cpuctx;
5701 6248
5702 for_each_possible_cpu(cpu) { 6249 for_each_possible_cpu(cpu) {
5703 cpuctx = &per_cpu(perf_cpu_context, cpu); 6250 swhash = &per_cpu(swevent_htable, cpu);
5704 mutex_init(&cpuctx->hlist_mutex); 6251 mutex_init(&swhash->hlist_mutex);
5705 __perf_event_init_context(&cpuctx->ctx, NULL); 6252 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5706 } 6253 }
5707} 6254}
5708 6255
5709static void __cpuinit perf_event_init_cpu(int cpu) 6256static void __cpuinit perf_event_init_cpu(int cpu)
5710{ 6257{
5711 struct perf_cpu_context *cpuctx; 6258 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5712
5713 cpuctx = &per_cpu(perf_cpu_context, cpu);
5714
5715 spin_lock(&perf_resource_lock);
5716 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5717 spin_unlock(&perf_resource_lock);
5718 6259
5719 mutex_lock(&cpuctx->hlist_mutex); 6260 mutex_lock(&swhash->hlist_mutex);
5720 if (cpuctx->hlist_refcount > 0) { 6261 if (swhash->hlist_refcount > 0) {
5721 struct swevent_hlist *hlist; 6262 struct swevent_hlist *hlist;
5722 6263
5723 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6264 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5724 WARN_ON_ONCE(!hlist); 6265 WARN_ON(!hlist);
5725 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6266 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5726 } 6267 }
5727 mutex_unlock(&cpuctx->hlist_mutex); 6268 mutex_unlock(&swhash->hlist_mutex);
5728} 6269}
5729 6270
5730#ifdef CONFIG_HOTPLUG_CPU 6271#ifdef CONFIG_HOTPLUG_CPU
5731static void __perf_event_exit_cpu(void *info) 6272static void perf_pmu_rotate_stop(struct pmu *pmu)
5732{ 6273{
5733 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6274 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5734 struct perf_event_context *ctx = &cpuctx->ctx; 6275
6276 WARN_ON(!irqs_disabled());
6277
6278 list_del_init(&cpuctx->rotation_list);
6279}
6280
6281static void __perf_event_exit_context(void *__info)
6282{
6283 struct perf_event_context *ctx = __info;
5735 struct perf_event *event, *tmp; 6284 struct perf_event *event, *tmp;
5736 6285
6286 perf_pmu_rotate_stop(ctx->pmu);
6287
5737 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6288 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5738 __perf_event_remove_from_context(event); 6289 __perf_event_remove_from_context(event);
5739 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6290 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5740 __perf_event_remove_from_context(event); 6291 __perf_event_remove_from_context(event);
5741} 6292}
6293
6294static void perf_event_exit_cpu_context(int cpu)
6295{
6296 struct perf_event_context *ctx;
6297 struct pmu *pmu;
6298 int idx;
6299
6300 idx = srcu_read_lock(&pmus_srcu);
6301 list_for_each_entry_rcu(pmu, &pmus, entry) {
6302 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6303
6304 mutex_lock(&ctx->mutex);
6305 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6306 mutex_unlock(&ctx->mutex);
6307 }
6308 srcu_read_unlock(&pmus_srcu, idx);
6309}
6310
5742static void perf_event_exit_cpu(int cpu) 6311static void perf_event_exit_cpu(int cpu)
5743{ 6312{
5744 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6313 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5745 struct perf_event_context *ctx = &cpuctx->ctx;
5746 6314
5747 mutex_lock(&cpuctx->hlist_mutex); 6315 mutex_lock(&swhash->hlist_mutex);
5748 swevent_hlist_release(cpuctx); 6316 swevent_hlist_release(swhash);
5749 mutex_unlock(&cpuctx->hlist_mutex); 6317 mutex_unlock(&swhash->hlist_mutex);
5750 6318
5751 mutex_lock(&ctx->mutex); 6319 perf_event_exit_cpu_context(cpu);
5752 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5753 mutex_unlock(&ctx->mutex);
5754} 6320}
5755#else 6321#else
5756static inline void perf_event_exit_cpu(int cpu) { } 6322static inline void perf_event_exit_cpu(int cpu) { }
@@ -5780,118 +6346,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5780 return NOTIFY_OK; 6346 return NOTIFY_OK;
5781} 6347}
5782 6348
5783/*
5784 * This has to have a higher priority than migration_notifier in sched.c.
5785 */
5786static struct notifier_block __cpuinitdata perf_cpu_nb = {
5787 .notifier_call = perf_cpu_notify,
5788 .priority = 20,
5789};
5790
5791void __init perf_event_init(void) 6349void __init perf_event_init(void)
5792{ 6350{
5793 perf_event_init_all_cpus(); 6351 perf_event_init_all_cpus();
5794 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6352 init_srcu_struct(&pmus_srcu);
5795 (void *)(long)smp_processor_id()); 6353 perf_pmu_register(&perf_swevent);
5796 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6354 perf_pmu_register(&perf_cpu_clock);
5797 (void *)(long)smp_processor_id()); 6355 perf_pmu_register(&perf_task_clock);
5798 register_cpu_notifier(&perf_cpu_nb); 6356 perf_tp_register();
5799} 6357 perf_cpu_notifier(perf_cpu_notify);
5800
5801static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5802 struct sysdev_class_attribute *attr,
5803 char *buf)
5804{
5805 return sprintf(buf, "%d\n", perf_reserved_percpu);
5806}
5807
5808static ssize_t
5809perf_set_reserve_percpu(struct sysdev_class *class,
5810 struct sysdev_class_attribute *attr,
5811 const char *buf,
5812 size_t count)
5813{
5814 struct perf_cpu_context *cpuctx;
5815 unsigned long val;
5816 int err, cpu, mpt;
5817
5818 err = strict_strtoul(buf, 10, &val);
5819 if (err)
5820 return err;
5821 if (val > perf_max_events)
5822 return -EINVAL;
5823
5824 spin_lock(&perf_resource_lock);
5825 perf_reserved_percpu = val;
5826 for_each_online_cpu(cpu) {
5827 cpuctx = &per_cpu(perf_cpu_context, cpu);
5828 raw_spin_lock_irq(&cpuctx->ctx.lock);
5829 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5830 perf_max_events - perf_reserved_percpu);
5831 cpuctx->max_pertask = mpt;
5832 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5833 }
5834 spin_unlock(&perf_resource_lock);
5835
5836 return count;
5837}
5838
5839static ssize_t perf_show_overcommit(struct sysdev_class *class,
5840 struct sysdev_class_attribute *attr,
5841 char *buf)
5842{
5843 return sprintf(buf, "%d\n", perf_overcommit);
5844}
5845
5846static ssize_t
5847perf_set_overcommit(struct sysdev_class *class,
5848 struct sysdev_class_attribute *attr,
5849 const char *buf, size_t count)
5850{
5851 unsigned long val;
5852 int err;
5853
5854 err = strict_strtoul(buf, 10, &val);
5855 if (err)
5856 return err;
5857 if (val > 1)
5858 return -EINVAL;
5859
5860 spin_lock(&perf_resource_lock);
5861 perf_overcommit = val;
5862 spin_unlock(&perf_resource_lock);
5863
5864 return count;
5865}
5866
5867static SYSDEV_CLASS_ATTR(
5868 reserve_percpu,
5869 0644,
5870 perf_show_reserve_percpu,
5871 perf_set_reserve_percpu
5872 );
5873
5874static SYSDEV_CLASS_ATTR(
5875 overcommit,
5876 0644,
5877 perf_show_overcommit,
5878 perf_set_overcommit
5879 );
5880
5881static struct attribute *perfclass_attrs[] = {
5882 &attr_reserve_percpu.attr,
5883 &attr_overcommit.attr,
5884 NULL
5885};
5886
5887static struct attribute_group perfclass_attr_group = {
5888 .attrs = perfclass_attrs,
5889 .name = "perf_events",
5890};
5891
5892static int __init perf_event_sysfs_init(void)
5893{
5894 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5895 &perfclass_attr_group);
5896} 6358}
5897device_initcall(perf_event_sysfs_init);
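As a rough illustration of the reworked in-kernel counter API above (a hypothetical caller, not part of this change): perf_event_create_kernel_counter() now takes a struct task_struct pointer instead of a pid, with NULL selecting a per-CPU counter bound to @cpu. A minimal sketch:

	#include <linux/perf_event.h>

	/* Hypothetical helper, for illustration only. */
	static struct perf_event *example_cycle_counter(int cpu)
	{
		struct perf_event_attr attr = {
			.type   = PERF_TYPE_HARDWARE,
			.config = PERF_COUNT_HW_CPU_CYCLES,
			.size   = sizeof(attr),
		};

		/* NULL task: per-CPU counter; NULL handler: default overflow handling. */
		return perf_event_create_kernel_counter(&attr, cpu, NULL, NULL);
	}

On failure the call returns an ERR_PTR, matching the err path shown above.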
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb90832..c0d2067f3e0d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3584,7 +3584,7 @@ void scheduler_tick(void)
3584 curr->sched_class->task_tick(rq, curr, 0); 3584 curr->sched_class->task_tick(rq, curr, 0);
3585 raw_spin_unlock(&rq->lock); 3585 raw_spin_unlock(&rq->lock);
3586 3586
3587 perf_event_task_tick(curr); 3587 perf_event_task_tick();
3588 3588
3589#ifdef CONFIG_SMP 3589#ifdef CONFIG_SMP
3590 rq->idle_at_tick = idle_cpu(cpu); 3590 rq->idle_at_tick = idle_cpu(cpu);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..65fb077ea79c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
884 FTRACE_ENABLE_CALLS = (1 << 0), 884 FTRACE_ENABLE_CALLS = (1 << 0),
885 FTRACE_DISABLE_CALLS = (1 << 1), 885 FTRACE_DISABLE_CALLS = (1 << 1),
886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
887 FTRACE_ENABLE_MCOUNT = (1 << 3), 887 FTRACE_START_FUNC_RET = (1 << 3),
888 FTRACE_DISABLE_MCOUNT = (1 << 4), 888 FTRACE_STOP_FUNC_RET = (1 << 4),
889 FTRACE_START_FUNC_RET = (1 << 5),
890 FTRACE_STOP_FUNC_RET = (1 << 6),
891}; 889};
892 890
893static int ftrace_filtered; 891static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
1226 1224
1227static void ftrace_startup_sysctl(void) 1225static void ftrace_startup_sysctl(void)
1228{ 1226{
1229 int command = FTRACE_ENABLE_MCOUNT;
1230
1231 if (unlikely(ftrace_disabled)) 1227 if (unlikely(ftrace_disabled))
1232 return; 1228 return;
1233 1229
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
1235 saved_ftrace_func = NULL; 1231 saved_ftrace_func = NULL;
1236 /* ftrace_start_up is true if we want ftrace running */ 1232 /* ftrace_start_up is true if we want ftrace running */
1237 if (ftrace_start_up) 1233 if (ftrace_start_up)
1238 command |= FTRACE_ENABLE_CALLS; 1234 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1239
1240 ftrace_run_update_code(command);
1241} 1235}
1242 1236
1243static void ftrace_shutdown_sysctl(void) 1237static void ftrace_shutdown_sysctl(void)
1244{ 1238{
1245 int command = FTRACE_DISABLE_MCOUNT;
1246
1247 if (unlikely(ftrace_disabled)) 1239 if (unlikely(ftrace_disabled))
1248 return; 1240 return;
1249 1241
1250 /* ftrace_start_up is true if ftrace is running */ 1242 /* ftrace_start_up is true if ftrace is running */
1251 if (ftrace_start_up) 1243 if (ftrace_start_up)
1252 command |= FTRACE_DISABLE_CALLS; 1244 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1253
1254 ftrace_run_update_code(command);
1255} 1245}
1256 1246
1257static cycle_t ftrace_update_time; 1247static cycle_t ftrace_update_time;
@@ -1368,24 +1358,29 @@ enum {
1368#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1358#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1369 1359
1370struct ftrace_iterator { 1360struct ftrace_iterator {
1371 struct ftrace_page *pg; 1361 loff_t pos;
1372 int hidx; 1362 loff_t func_pos;
1373 int idx; 1363 struct ftrace_page *pg;
1374 unsigned flags; 1364 struct dyn_ftrace *func;
1375 struct trace_parser parser; 1365 struct ftrace_func_probe *probe;
1366 struct trace_parser parser;
1367 int hidx;
1368 int idx;
1369 unsigned flags;
1376}; 1370};
1377 1371
1378static void * 1372static void *
1379t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1373t_hash_next(struct seq_file *m, loff_t *pos)
1380{ 1374{
1381 struct ftrace_iterator *iter = m->private; 1375 struct ftrace_iterator *iter = m->private;
1382 struct hlist_node *hnd = v; 1376 struct hlist_node *hnd = NULL;
1383 struct hlist_head *hhd; 1377 struct hlist_head *hhd;
1384 1378
1385 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1386
1387 (*pos)++; 1379 (*pos)++;
1380 iter->pos = *pos;
1388 1381
1382 if (iter->probe)
1383 hnd = &iter->probe->node;
1389 retry: 1384 retry:
1390 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1385 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1391 return NULL; 1386 return NULL;
@@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1408 } 1403 }
1409 } 1404 }
1410 1405
1411 return hnd; 1406 if (WARN_ON_ONCE(!hnd))
1407 return NULL;
1408
1409 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1410
1411 return iter;
1412} 1412}
1413 1413
1414static void *t_hash_start(struct seq_file *m, loff_t *pos) 1414static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417 void *p = NULL; 1417 void *p = NULL;
1418 loff_t l; 1418 loff_t l;
1419 1419
1420 if (!(iter->flags & FTRACE_ITER_HASH)) 1420 if (iter->func_pos > *pos)
1421 *pos = 0; 1421 return NULL;
1422
1423 iter->flags |= FTRACE_ITER_HASH;
1424 1422
1425 iter->hidx = 0; 1423 iter->hidx = 0;
1426 for (l = 0; l <= *pos; ) { 1424 for (l = 0; l <= (*pos - iter->func_pos); ) {
1427 p = t_hash_next(m, p, &l); 1425 p = t_hash_next(m, &l);
1428 if (!p) 1426 if (!p)
1429 break; 1427 break;
1430 } 1428 }
1431 return p; 1429 if (!p)
1430 return NULL;
1431
1432 /* Only set this if we have an item */
1433 iter->flags |= FTRACE_ITER_HASH;
1434
1435 return iter;
1432} 1436}
1433 1437
1434static int t_hash_show(struct seq_file *m, void *v) 1438static int
1439t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1435{ 1440{
1436 struct ftrace_func_probe *rec; 1441 struct ftrace_func_probe *rec;
1437 struct hlist_node *hnd = v;
1438 1442
1439 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1443 rec = iter->probe;
1444 if (WARN_ON_ONCE(!rec))
1445 return -EIO;
1440 1446
1441 if (rec->ops->print) 1447 if (rec->ops->print)
1442 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1448 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1457 struct dyn_ftrace *rec = NULL; 1463 struct dyn_ftrace *rec = NULL;
1458 1464
1459 if (iter->flags & FTRACE_ITER_HASH) 1465 if (iter->flags & FTRACE_ITER_HASH)
1460 return t_hash_next(m, v, pos); 1466 return t_hash_next(m, pos);
1461 1467
1462 (*pos)++; 1468 (*pos)++;
1469 iter->pos = *pos;
1463 1470
1464 if (iter->flags & FTRACE_ITER_PRINTALL) 1471 if (iter->flags & FTRACE_ITER_PRINTALL)
1465 return NULL; 1472 return t_hash_start(m, pos);
1466 1473
1467 retry: 1474 retry:
1468 if (iter->idx >= iter->pg->index) { 1475 if (iter->idx >= iter->pg->index) {
@@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1491 } 1498 }
1492 } 1499 }
1493 1500
1494 return rec; 1501 if (!rec)
1502 return t_hash_start(m, pos);
1503
1504 iter->func_pos = *pos;
1505 iter->func = rec;
1506
1507 return iter;
1508}
1509
1510static void reset_iter_read(struct ftrace_iterator *iter)
1511{
1512 iter->pos = 0;
1513 iter->func_pos = 0;
1514 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
1495} 1515}
1496 1516
1497static void *t_start(struct seq_file *m, loff_t *pos) 1517static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1502,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1502 1522
1503 mutex_lock(&ftrace_lock); 1523 mutex_lock(&ftrace_lock);
1504 /* 1524 /*
1525 * If an lseek was done, then reset and start from beginning.
1526 */
1527 if (*pos < iter->pos)
1528 reset_iter_read(iter);
1529
1530 /*
1505 * For set_ftrace_filter reading, if we have the filter 1531 * For set_ftrace_filter reading, if we have the filter
1506 * off, we can short cut and just print out that all 1532 * off, we can short cut and just print out that all
1507 * functions are enabled. 1533 * functions are enabled.
@@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1518 if (iter->flags & FTRACE_ITER_HASH) 1544 if (iter->flags & FTRACE_ITER_HASH)
1519 return t_hash_start(m, pos); 1545 return t_hash_start(m, pos);
1520 1546
1547 /*
1548 * Unfortunately, we need to restart at ftrace_pages_start
1549 * every time we let go of the ftrace_mutex. This is because
1550 * those pointers can change without the lock.
1551 */
1521 iter->pg = ftrace_pages_start; 1552 iter->pg = ftrace_pages_start;
1522 iter->idx = 0; 1553 iter->idx = 0;
1523 for (l = 0; l <= *pos; ) { 1554 for (l = 0; l <= *pos; ) {
@@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1526 break; 1557 break;
1527 } 1558 }
1528 1559
1529 if (!p && iter->flags & FTRACE_ITER_FILTER) 1560 if (!p) {
1530 return t_hash_start(m, pos); 1561 if (iter->flags & FTRACE_ITER_FILTER)
1562 return t_hash_start(m, pos);
1531 1563
1532 return p; 1564 return NULL;
1565 }
1566
1567 return iter;
1533} 1568}
1534 1569
1535static void t_stop(struct seq_file *m, void *p) 1570static void t_stop(struct seq_file *m, void *p)
@@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
1540static int t_show(struct seq_file *m, void *v) 1575static int t_show(struct seq_file *m, void *v)
1541{ 1576{
1542 struct ftrace_iterator *iter = m->private; 1577 struct ftrace_iterator *iter = m->private;
1543 struct dyn_ftrace *rec = v; 1578 struct dyn_ftrace *rec;
1544 1579
1545 if (iter->flags & FTRACE_ITER_HASH) 1580 if (iter->flags & FTRACE_ITER_HASH)
1546 return t_hash_show(m, v); 1581 return t_hash_show(m, iter);
1547 1582
1548 if (iter->flags & FTRACE_ITER_PRINTALL) { 1583 if (iter->flags & FTRACE_ITER_PRINTALL) {
1549 seq_printf(m, "#### all functions enabled ####\n"); 1584 seq_printf(m, "#### all functions enabled ####\n");
1550 return 0; 1585 return 0;
1551 } 1586 }
1552 1587
1588 rec = iter->func;
1589
1553 if (!rec) 1590 if (!rec)
1554 return 0; 1591 return 0;
1555 1592
@@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = {
2418 .open = ftrace_filter_open, 2455 .open = ftrace_filter_open,
2419 .read = seq_read, 2456 .read = seq_read,
2420 .write = ftrace_filter_write, 2457 .write = ftrace_filter_write,
2421 .llseek = no_llseek, 2458 .llseek = ftrace_regex_lseek,
2422 .release = ftrace_filter_release, 2459 .release = ftrace_filter_release,
2423}; 2460};
2424 2461
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 492197e2f86c..4e2f03410377 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2606} 2606}
2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2608 2608
2609/*
2610 * The total number of entries in the ring buffer is the running counter
2611 * of entries entered into the ring buffer, minus the sum of
2612 * the entries read from the ring buffer and the number of
2613 * entries that were overwritten.
2614 */
2615static inline unsigned long
2616rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2617{
2618 return local_read(&cpu_buffer->entries) -
2619 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2620}
2621
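A quick worked example of the accounting described above (illustrative numbers only): if a CPU buffer has accumulated entries = 1000 with overrun = 50 and read = 200, rb_num_of_entries() yields 1000 - (50 + 200) = 750 entries still present and unread.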
2609/** 2622/**
2610 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2623 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2611 * @buffer: The ring buffer 2624 * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2614unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2627unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2615{ 2628{
2616 struct ring_buffer_per_cpu *cpu_buffer; 2629 struct ring_buffer_per_cpu *cpu_buffer;
2617 unsigned long ret;
2618 2630
2619 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2631 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2620 return 0; 2632 return 0;
2621 2633
2622 cpu_buffer = buffer->buffers[cpu]; 2634 cpu_buffer = buffer->buffers[cpu];
2623 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2624 - cpu_buffer->read;
2625 2635
2626 return ret; 2636 return rb_num_of_entries(cpu_buffer);
2627} 2637}
2628EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2629 2639
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684 /* if you care about this being correct, lock the buffer */ 2694 /* if you care about this being correct, lock the buffer */
2685 for_each_buffer_cpu(buffer, cpu) { 2695 for_each_buffer_cpu(buffer, cpu) {
2686 cpu_buffer = buffer->buffers[cpu]; 2696 cpu_buffer = buffer->buffers[cpu];
2687 entries += (local_read(&cpu_buffer->entries) - 2697 entries += rb_num_of_entries(cpu_buffer);
2688 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2689 } 2698 }
2690 2699
2691 return entries; 2700 return entries;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 31cc4cb0dbf2..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12static char *perf_trace_buf[4]; 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 13
14/* 14/*
15 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int total_ref_count;
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 struct hlist_head *list; 27 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 28 int ret = -ENOMEM;
29 int cpu; 29 int cpu;
30 30
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
42 tp_event->perf_events = list; 42 tp_event->perf_events = list;
43 43
44 if (!total_ref_count) { 44 if (!total_ref_count) {
45 char *buf; 45 char __percpu *buf;
46 int i; 46 int i;
47 47
48 for (i = 0; i < 4; i++) { 48 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
49 buf = (char *)alloc_percpu(perf_trace_t); 49 buf = (char __percpu *)alloc_percpu(perf_trace_t);
50 if (!buf) 50 if (!buf)
51 goto fail; 51 goto fail;
52 52
@@ -65,7 +65,7 @@ fail:
65 if (!total_ref_count) { 65 if (!total_ref_count) {
66 int i; 66 int i;
67 67
68 for (i = 0; i < 4; i++) { 68 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
69 free_percpu(perf_trace_buf[i]); 69 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL; 70 perf_trace_buf[i] = NULL;
71 } 71 }
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
101 return ret; 101 return ret;
102} 102}
103 103
104int perf_trace_enable(struct perf_event *p_event) 104int perf_trace_add(struct perf_event *p_event, int flags)
105{ 105{
106 struct ftrace_event_call *tp_event = p_event->tp_event; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
107 struct hlist_head __percpu *pcpu_list;
107 struct hlist_head *list; 108 struct hlist_head *list;
108 109
109 list = tp_event->perf_events; 110 pcpu_list = tp_event->perf_events;
110 if (WARN_ON_ONCE(!list)) 111 if (WARN_ON_ONCE(!pcpu_list))
111 return -EINVAL; 112 return -EINVAL;
112 113
113 list = this_cpu_ptr(list); 114 if (!(flags & PERF_EF_START))
115 p_event->hw.state = PERF_HES_STOPPED;
116
117 list = this_cpu_ptr(pcpu_list);
114 hlist_add_head_rcu(&p_event->hlist_entry, list); 118 hlist_add_head_rcu(&p_event->hlist_entry, list);
115 119
116 return 0; 120 return 0;
117} 121}
118 122
119void perf_trace_disable(struct perf_event *p_event) 123void perf_trace_del(struct perf_event *p_event, int flags)
120{ 124{
121 hlist_del_rcu(&p_event->hlist_entry); 125 hlist_del_rcu(&p_event->hlist_entry);
122} 126}
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event)
142 tp_event->perf_events = NULL; 146 tp_event->perf_events = NULL;
143 147
144 if (!--total_ref_count) { 148 if (!--total_ref_count) {
145 for (i = 0; i < 4; i++) { 149 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
146 free_percpu(perf_trace_buf[i]); 150 free_percpu(perf_trace_buf[i]);
147 perf_trace_buf[i] = NULL; 151 perf_trace_buf[i] = NULL;
148 } 152 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..398c0e8b332c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
600 600
601enum { 601enum {
602 FORMAT_HEADER = 1, 602 FORMAT_HEADER = 1,
603 FORMAT_PRINTFMT = 2, 603 FORMAT_FIELD_SEPERATOR = 2,
604 FORMAT_PRINTFMT = 3,
604}; 605};
605 606
606static void *f_next(struct seq_file *m, void *v, loff_t *pos) 607static void *f_next(struct seq_file *m, void *v, loff_t *pos)
607{ 608{
608 struct ftrace_event_call *call = m->private; 609 struct ftrace_event_call *call = m->private;
609 struct ftrace_event_field *field; 610 struct ftrace_event_field *field;
610 struct list_head *head; 611 struct list_head *common_head = &ftrace_common_fields;
612 struct list_head *head = trace_get_fields(call);
611 613
612 (*pos)++; 614 (*pos)++;
613 615
614 switch ((unsigned long)v) { 616 switch ((unsigned long)v) {
615 case FORMAT_HEADER: 617 case FORMAT_HEADER:
616 head = &ftrace_common_fields; 618 if (unlikely(list_empty(common_head)))
619 return NULL;
620
621 field = list_entry(common_head->prev,
622 struct ftrace_event_field, link);
623 return field;
617 624
625 case FORMAT_FIELD_SEPERATOR:
618 if (unlikely(list_empty(head))) 626 if (unlikely(list_empty(head)))
619 return NULL; 627 return NULL;
620 628
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
626 return NULL; 634 return NULL;
627 } 635 }
628 636
629 head = trace_get_fields(call);
630
631 /*
632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
636
637 field = v; 637 field = v;
638 /* 638 if (field->link.prev == common_head)
639 * If this is a common field, and at the end of the list, then 639 return (void *)FORMAT_FIELD_SEPERATOR;
640 * continue with main list. 640 else if (field->link.prev == head)
641 */
642 if (field->link.prev == &ftrace_common_fields) {
643 if (unlikely(list_empty(head)))
644 return NULL;
645 field = list_entry(head->prev, struct ftrace_event_field, link);
646 /* Set the LSB to notify f_show to print an extra newline */
647 field = (struct ftrace_event_field *)
648 ((unsigned long)field | 1);
649 return field;
650 }
651
652 /* If we are done tell f_show to print the format */
653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT; 641 return (void *)FORMAT_PRINTFMT;
655 642
656 field = list_entry(field->link.prev, struct ftrace_event_field, link); 643 field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
688 seq_printf(m, "format:\n"); 675 seq_printf(m, "format:\n");
689 return 0; 676 return 0;
690 677
678 case FORMAT_FIELD_SEPERATOR:
679 seq_putc(m, '\n');
680 return 0;
681
691 case FORMAT_PRINTFMT: 682 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n", 683 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt); 684 call->print_fmt);
694 return 0; 685 return 0;
695 } 686 }
696 687
697 /*
698 * To separate common fields from event fields, the
699 * LSB is set on the first event field. Clear it and
700 * print a newline if it is set.
701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v; 688 field = v;
708 689
709 /* 690 /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518e..02c708ae0d42 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -855,6 +871,92 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
855 return 0; 871 return 0;
856} 872}
857 873
874/*
875 * Entry check for irq code
876 *
877 * returns 1 if
878 * - we are inside irq code
879 * - we just entered irq code
880 *
881 * returns 0 if
882 * - funcgraph-interrupts option is set
883 * - we are not inside irq code
884 */
885static int
886check_irq_entry(struct trace_iterator *iter, u32 flags,
887 unsigned long addr, int depth)
888{
889 int cpu = iter->cpu;
890 struct fgraph_data *data = iter->private;
891 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
892
893 if (flags & TRACE_GRAPH_PRINT_IRQS)
894 return 0;
895
896 /*
897 * We are inside the irq code
898 */
899 if (*depth_irq >= 0)
900 return 1;
901
902 if ((addr < (unsigned long)__irqentry_text_start) ||
903 (addr >= (unsigned long)__irqentry_text_end))
904 return 0;
905
906 /*
907 * We are entering irq code.
908 */
909 *depth_irq = depth;
910 return 1;
911}
912
913/*
914 * Return check for irq code
915 *
916 * returns 1 if
917 * - we are inside irq code
918 * - we just left irq code
919 *
920 * returns 0 if
921 * - funcgraph-interrupts option is set
922 * - we are not inside irq code
923 */
924static int
925check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
926{
927 int cpu = iter->cpu;
928 struct fgraph_data *data = iter->private;
929 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
930
931 if (flags & TRACE_GRAPH_PRINT_IRQS)
932 return 0;
933
934 /*
935 * We are not inside the irq code.
936 */
937 if (*depth_irq == -1)
938 return 0;
939
940 /*
941 * We are inside the irq code, and this is the return entry.
942 * Let's not trace it and clear the entry depth, since
943 * we are out of irq code.
944 *
945 * This condition ensures that we 'leave the irq code' once
946 * we are out of the entry depth. Thus protecting us from
947 * the RETURN entry loss.
948 */
949 if (*depth_irq >= depth) {
950 *depth_irq = -1;
951 return 1;
952 }
953
954 /*
955 * We are inside the irq code, and this is not the entry.
956 */
957 return 1;
958}
959
858static enum print_line_t 960static enum print_line_t
859print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 961print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
860 struct trace_iterator *iter, u32 flags) 962 struct trace_iterator *iter, u32 flags)
@@ -865,6 +967,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
865 static enum print_line_t ret; 967 static enum print_line_t ret;
866 int cpu = iter->cpu; 968 int cpu = iter->cpu;
867 969
970 if (check_irq_entry(iter, flags, call->func, call->depth))
971 return TRACE_TYPE_HANDLED;
972
868 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 973 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
869 return TRACE_TYPE_PARTIAL_LINE; 974 return TRACE_TYPE_PARTIAL_LINE;
870 975
@@ -902,6 +1007,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
902 int ret; 1007 int ret;
903 int i; 1008 int i;
904 1009
1010 if (check_irq_return(iter, flags, trace->depth))
1011 return TRACE_TYPE_HANDLED;
1012
905 if (data) { 1013 if (data) {
906 struct fgraph_cpu_data *cpu_data; 1014 struct fgraph_cpu_data *cpu_data;
907 int cpu = iter->cpu; 1015 int cpu = iter->cpu;
@@ -1210,9 +1318,12 @@ void graph_trace_open(struct trace_iterator *iter)
1210 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1318 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1211 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1319 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1212 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1320 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1321 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1322
1213 *pid = -1; 1323 *pid = -1;
1214 *depth = 0; 1324 *depth = 0;
1215 *ignore = 0; 1325 *ignore = 0;
1326 *depth_irq = -1;
1216 } 1327 }
1217 1328
1218 iter->private = data; 1329 iter->private = data;
@@ -1235,6 +1346,14 @@ void graph_trace_close(struct trace_iterator *iter)
1235 } 1346 }
1236} 1347}
1237 1348
1349static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1350{
1351 if (bit == TRACE_GRAPH_PRINT_IRQS)
1352 ftrace_graph_skip_irqs = !set;
1353
1354 return 0;
1355}
1356
1238static struct trace_event_functions graph_functions = { 1357static struct trace_event_functions graph_functions = {
1239 .trace = print_graph_function_event, 1358 .trace = print_graph_function_event,
1240}; 1359};
@@ -1261,6 +1380,7 @@ static struct tracer graph_trace __read_mostly = {
1261 .print_line = print_graph_function, 1380 .print_line = print_graph_function,
1262 .print_header = print_graph_headers, 1381 .print_header = print_graph_headers,
1263 .flags = &tracer_flags, 1382 .flags = &tracer_flags,
1383 .set_flag = func_graph_set_flag,
1264#ifdef CONFIG_FTRACE_SELFTEST 1384#ifdef CONFIG_FTRACE_SELFTEST
1265 .selftest = trace_selftest_startup_function_graph, 1385 .selftest = trace_selftest_startup_function_graph,
1266#endif 1386#endif
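
With the set_flag hook wired up, the behaviour can be toggled at run time from user space. A small usage sketch in plain C; both the mount point (/sys/kernel/debug) and the option name (funcgraph-irqs) are assumptions rather than something shown in this hunk:

#include <stdio.h>

int main(void)
{
	/* Write '0' to hide functions called from hard-irq context,
	 * '1' (the default) to show them again. */
	FILE *f = fopen("/sys/kernel/debug/tracing/options/funcgraph-irqs", "w");

	if (!f) {
		perror("funcgraph-irqs");
		return 1;
	}
	fputs("0\n", f);
	fclose(f);
	return 0;
}
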
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
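
The reordering above initializes the per-CPU spinlocks and list heads before any tracepoint probe is registered, so a probe that fires immediately after registration can no longer touch an uninitialized lock. A minimal, self-contained sketch of the same "initialize, then register" pattern; my_cpu_stat and register_my_probe() are hypothetical names:

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/list.h>

struct my_cpu_stat {
	spinlock_t	 lock;
	struct list_head list;
};

static DEFINE_PER_CPU(struct my_cpu_stat, my_cpu_stat);

int register_my_probe(void);	/* placeholder for the tracepoint hookup */

static int __init my_early_init(void)
{
	int cpu;

	/* 1) Make the per-CPU state safe to use from the probe... */
	for_each_possible_cpu(cpu) {
		spin_lock_init(&per_cpu(my_cpu_stat, cpu).lock);
		INIT_LIST_HEAD(&per_cpu(my_cpu_stat, cpu).list);
	}

	/* 2) ...and only then expose it by registering the probe. */
	return register_my_probe();
}
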
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..d6073a50a6ca 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint __start___tracepoints[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (!elem->state && active) {
268 enable_jump_label(&elem->state);
269 elem->state = active;
270 } else if (elem->state && !active) {
271 disable_jump_label(&elem->state);
272 elem->state = active;
273 }
267} 274}
268 275
269/* 276/*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
277 if (elem->unregfunc && elem->state) 284 if (elem->unregfunc && elem->state)
278 elem->unregfunc(); 285 elem->unregfunc();
279 286
280 elem->state = 0; 287 if (elem->state) {
288 disable_jump_label(&elem->state);
289 elem->state = 0;
290 }
281 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
282} 292}
283 293
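
enable_jump_label()/disable_jump_label() come from the jump-label infrastructure added earlier in this series: each tracepoint call site branches over its slow path, and with arch support that branch is patched between a no-op and a jump, so disabled tracepoints are close to free. A simplified, self-contained sketch of the call-site pattern using the generic (non-patching) fallback form of JUMP_LABEL(); trace_foo(), __do_trace_foo() and foo_tracepoint_state are hypothetical stand-ins for what the real tracepoint macros generate:

#define unlikely(x)	__builtin_expect(!!(x), 0)

/* Generic fallback: a plain test of the key. With arch jump-label
 * support this instead emits a no-op that enable_jump_label()
 * rewrites into a jump to the label. */
#define JUMP_LABEL(key, label)			\
	do {					\
		if (unlikely(*(key)))		\
			goto label;		\
	} while (0)

static int foo_tracepoint_state;	/* plays the role of elem->state */

static void __do_trace_foo(int arg)
{
	(void)arg;			/* probe dispatch would go here */
}

static inline void trace_foo(int arg)
{
	JUMP_LABEL(&foo_tracepoint_state, do_trace);
	return;
do_trace:
	__do_trace_foo(arg);
}

Under this reading, the hunks above call enable_jump_label() only on a 0 -> 1 state transition and disable_jump_label() only on a 1 -> 0 transition, so each call site is patched (or the key flipped) exactly once per state change.
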
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9c3c52ecc1..dc8e16824b51 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog; 46static int __initdata no_watchdog;
48 47
49 48
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
187 return 0; 186 return 0;
188} 187}
189 188
190static int
191watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
192{
193 did_panic = 1;
194
195 return NOTIFY_DONE;
196}
197
198static struct notifier_block panic_block = {
199 .notifier_call = watchdog_panic,
200};
201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 189#ifdef CONFIG_HARDLOCKUP_DETECTOR
203static struct perf_event_attr wd_hw_attr = { 190static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 191 .type = PERF_TYPE_HARDWARE,
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
371 /* Try to register using hardware perf events */ 358 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 359 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(); 360 wd_attr->sample_period = hw_nmi_get_sample_period();
374 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); 361 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
375 if (!IS_ERR(event)) { 362 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 363 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 364 goto out_save;
378 } 365 }
379 366
380 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
381 return -1; 368 return PTR_ERR(event);
382 369
383 /* success path */ 370 /* success path */
384out_save: 371out_save:
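
The -1 -> NULL change above matches a prototype change made elsewhere in this series: perf_event_create_kernel_counter() now takes a struct task_struct pointer instead of a pid, with NULL meaning a counter bound to a CPU rather than to a task, and the failure path propagates the real PTR_ERR() instead of a bare -1. A minimal sketch of a caller under that assumed prototype; create_cpu_counter() and its parameters are hypothetical:

/* Assumes <linux/perf_event.h> and <linux/err.h>. */
static int create_cpu_counter(struct perf_event_attr *attr, int cpu,
			      perf_overflow_handler_t handler,
			      struct perf_event **out)
{
	struct perf_event *event;

	/* task == NULL: count on this CPU, not on a particular task. */
	event = perf_event_create_kernel_counter(attr, cpu, NULL, handler);
	if (IS_ERR(event))
		return PTR_ERR(event);	/* hand back the real error */

	*out = event;
	return 0;
}
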
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
422static int watchdog_enable(int cpu) 409static int watchdog_enable(int cpu)
423{ 410{
424 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err;
425 413
426 /* enable the perf event */ 414 /* enable the perf event */
427 if (watchdog_nmi_enable(cpu) != 0) 415 err = watchdog_nmi_enable(cpu);
428 return -1; 416 if (err)
417 return err;
429 418
430 /* create the watchdog thread */ 419 /* create the watchdog thread */
431 if (!p) { 420 if (!p) {
432 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
433 if (IS_ERR(p)) { 422 if (IS_ERR(p)) {
434 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
435 return -1; 424 return PTR_ERR(p);
436 } 425 }
437 kthread_bind(p, cpu); 426 kthread_bind(p, cpu);
438 per_cpu(watchdog_touch_ts, cpu) = 0; 427 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
484{ 473{
485 int cpu; 474 int cpu;
486 475
476 if (no_watchdog)
477 return;
478
487 for_each_online_cpu(cpu) 479 for_each_online_cpu(cpu)
488 watchdog_disable(cpu); 480 watchdog_disable(cpu);
489 481
@@ -526,17 +518,16 @@ static int __cpuinit
526cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 518cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
527{ 519{
528 int hotcpu = (unsigned long)hcpu; 520 int hotcpu = (unsigned long)hcpu;
521 int err = 0;
529 522
530 switch (action) { 523 switch (action) {
531 case CPU_UP_PREPARE: 524 case CPU_UP_PREPARE:
532 case CPU_UP_PREPARE_FROZEN: 525 case CPU_UP_PREPARE_FROZEN:
533 if (watchdog_prepare_cpu(hotcpu)) 526 err = watchdog_prepare_cpu(hotcpu);
534 return NOTIFY_BAD;
535 break; 527 break;
536 case CPU_ONLINE: 528 case CPU_ONLINE:
537 case CPU_ONLINE_FROZEN: 529 case CPU_ONLINE_FROZEN:
538 if (watchdog_enable(hotcpu)) 530 err = watchdog_enable(hotcpu);
539 return NOTIFY_BAD;
540 break; 531 break;
541#ifdef CONFIG_HOTPLUG_CPU 532#ifdef CONFIG_HOTPLUG_CPU
542 case CPU_UP_CANCELED: 533 case CPU_UP_CANCELED:
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
549 break; 540 break;
550#endif /* CONFIG_HOTPLUG_CPU */ 541#endif /* CONFIG_HOTPLUG_CPU */
551 } 542 }
552 return NOTIFY_OK; 543 return notifier_from_errno(err);
553} 544}
554 545
555static struct notifier_block __cpuinitdata cpu_nfb = { 546static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void)
565 return 0; 556 return 0;
566 557
567 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
568 WARN_ON(err == NOTIFY_BAD); 559 WARN_ON(notifier_to_errno(err));
569 560
570 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
571 register_cpu_notifier(&cpu_nfb); 562 register_cpu_notifier(&cpu_nfb);
572 563
573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
574
575 return 0; 564 return 0;
576} 565}
577early_initcall(spawn_watchdog_task); 566early_initcall(spawn_watchdog_task);
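
Returning notifier_from_errno(err) from cpu_callback() instead of a bare NOTIFY_BAD lets callers such as spawn_watchdog_task() recover the original errno with notifier_to_errno(). A self-contained user-space sketch of one encoding with that round-trip property; the in-tree helpers in include/linux/notifier.h behave like this as far as I can tell, but treat the exact constants as an assumption:

#include <assert.h>
#include <errno.h>

#define NOTIFY_OK	 0x0001
#define NOTIFY_STOP_MASK 0x8000

/* Pack a negative errno into a "stop" notifier return value. */
static int sketch_notifier_from_errno(int err)
{
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
	return NOTIFY_OK;
}

/* Unpack it again; plain NOTIFY_OK maps back to 0 (success). */
static int sketch_notifier_to_errno(int ret)
{
	ret &= ~NOTIFY_STOP_MASK;
	return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
}

int main(void)
{
	assert(sketch_notifier_to_errno(sketch_notifier_from_errno(-ENOMEM)) == -ENOMEM);
	assert(sketch_notifier_to_errno(sketch_notifier_from_errno(0)) == 0);
	return 0;
}
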