commit     6268464b370e234e0255330190f9bd5d19386ad7
tree       5742641092ce64227dd2086d78baaede57da1f80 /kernel
parent     7df01d96b295e400167e78061b81d4c91630b12d
parent     0fdf13606b67f830559abdaad15980c7f4f05ec4
author     Robert Richter <robert.richter@amd.com>  2010-10-15 06:45:00 -0400
committer  Robert Richter <robert.richter@amd.com>  2010-10-15 06:45:00 -0400

Merge remote branch 'tip/perf/core' into oprofile/core

Conflicts:
	arch/arm/oprofile/common.c
	kernel/perf_event.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                       |    2
-rw-r--r--  kernel/exit.c                         |    4
-rw-r--r--  kernel/fork.c                         |    2
-rw-r--r--  kernel/hw_breakpoint.c                |   67
-rw-r--r--  kernel/jump_label.c                   |  429
-rw-r--r--  kernel/kfifo.c                        |    2
-rw-r--r--  kernel/kprobes.c                      |   26
-rw-r--r--  kernel/module.c                       |   10
-rw-r--r--  kernel/perf_event.c                   | 2357
-rw-r--r--  kernel/sched.c                        |    2
-rw-r--r--  kernel/smp.c                          |   17
-rw-r--r--  kernel/test_kprobes.c                 |   12
-rw-r--r--  kernel/trace/Kconfig                  |    5
-rw-r--r--  kernel/trace/ftrace.c                 |  123
-rw-r--r--  kernel/trace/ring_buffer.c            |   21
-rw-r--r--  kernel/trace/trace_event_perf.c       |   28
-rw-r--r--  kernel/trace/trace_events.c           |   55
-rw-r--r--  kernel/trace/trace_functions_graph.c  |  131
-rw-r--r--  kernel/trace/trace_workqueue.c        |   10
-rw-r--r--  kernel/tracepoint.c                   |   14
-rw-r--r--  kernel/watchdog.c                     |   41
21 files changed, 2243 insertions(+), 1115 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..d52b473c99a1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o range.o
+	    async.o range.o jump_label.o
 obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db28..e2bdf37f9fde 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-#ifdef CONFIG_PERF_EVENTS
-	WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+	perf_event_delayed_put(tsk);
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index b7e9d60a675d..c445f8cc408d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -356,10 +356,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (IS_ERR(pol))
 			goto fail_nomem_policy;
 		vma_set_policy(tmp, pol);
+		tmp->vm_mm = mm;
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
 		tmp->vm_flags &= ~VM_LOCKED;
-		tmp->vm_mm = mm;
 		tmp->vm_next = tmp->vm_prev = NULL;
 		file = tmp->vm_file;
 		if (file) {
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c7c2aed9e2dc..3b714e839c10 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
 			    struct task_struct *tsk)
 {
-	return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
-						triggered);
+	return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+		bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
 
 		*pevent = bp;
 
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
 	.priority = 0x7fffffff
 };
 
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+	release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+	int err;
+
+	if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+		return -ENOENT;
+
+	err = register_perf_hw_breakpoint(bp);
+	if (err)
+		return err;
+
+	bp->destroy = bp_perf_event_destroy;
+
+	return 0;
+}
+
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+	if (!(flags & PERF_EF_START))
+		bp->hw.state = PERF_HES_STOPPED;
+
+	return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+	arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+	bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+	bp->hw.state = PERF_HES_STOPPED;
+}
+
+static struct pmu perf_breakpoint = {
+	.task_ctx_nr	= perf_sw_context, /* could eventually get its own */
+
+	.event_init	= hw_breakpoint_event_init,
+	.add		= hw_breakpoint_add,
+	.del		= hw_breakpoint_del,
+	.start		= hw_breakpoint_start,
+	.stop		= hw_breakpoint_stop,
+	.read		= hw_breakpoint_pmu_read,
+};
+
 static int __init init_hw_breakpoint(void)
 {
 	unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
 
 	constraints_initialized = 1;
 
+	perf_pmu_register(&perf_breakpoint);
+
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 
  err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
 core_initcall(init_hw_breakpoint);
 
 
-struct pmu perf_ops_bp = {
-	.enable		= arch_install_hw_breakpoint,
-	.disable	= arch_uninstall_hw_breakpoint,
-	.read		= hw_breakpoint_pmu_read,
-};
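The hunks above drop the old perf_ops_bp enable/disable pair in favour of a struct pmu built around event_init/add/del/start/stop and a single-argument perf_pmu_register() call. As orientation only (not part of the patch), a minimal PMU written against that post-patch shape might look like the sketch below; the my_* names are hypothetical and the PERF_TYPE_SOFTWARE claim is purely illustrative.

#include <linux/perf_event.h>

/* Hypothetical sketch; mirrors the shape of perf_breakpoint above. */
static int my_event_init(struct perf_event *event)
{
	/* claim only one event type, refuse everything else */
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;
	return 0;
}

static int my_add(struct perf_event *event, int flags)
{
	/* events may be added in a stopped state */
	if (!(flags & PERF_EF_START))
		event->hw.state = PERF_HES_STOPPED;
	return 0;
}

static void my_del(struct perf_event *event, int flags)   { }
static void my_start(struct perf_event *event, int flags) { event->hw.state = 0; }
static void my_stop(struct perf_event *event, int flags)  { event->hw.state = PERF_HES_STOPPED; }
static void my_read(struct perf_event *event)             { }

static struct pmu my_pmu = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= my_event_init,
	.add		= my_add,
	.del		= my_del,
	.start		= my_start,
	.stop		= my_stop,
	.read		= my_read,
};

static int __init my_pmu_init(void)
{
	/* same registration call init_hw_breakpoint() makes above */
	perf_pmu_register(&my_pmu);
	return 0;
}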
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..7be868bf25c6
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
+/*
+ * jump label support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/err.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+#define JUMP_LABEL_HASH_BITS 6
+#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
+static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
+
+/* mutex to protect coming/going of the jump_label table */
+static DEFINE_MUTEX(jump_label_mutex);
+
+struct jump_label_entry {
+	struct hlist_node hlist;
+	struct jump_entry *table;
+	int nr_entries;
+	/* hang modules off here */
+	struct hlist_head modules;
+	unsigned long key;
+};
+
+struct jump_label_module_entry {
+	struct hlist_node hlist;
+	struct jump_entry *table;
+	int nr_entries;
+	struct module *mod;
+};
+
+static int jump_label_cmp(const void *a, const void *b)
+{
+	const struct jump_entry *jea = a;
+	const struct jump_entry *jeb = b;
+
+	if (jea->key < jeb->key)
+		return -1;
+
+	if (jea->key > jeb->key)
+		return 1;
+
+	return 0;
+}
+
+static void
+sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
+{
+	unsigned long size;
+
+	size = (((unsigned long)stop - (unsigned long)start)
+					/ sizeof(struct jump_entry));
+	sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
+}
+
+static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct jump_label_entry *e;
+	u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+
+	head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (key == e->key)
+			return e;
+	}
+	return NULL;
+}
+
+static struct jump_label_entry *
+add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
+{
+	struct hlist_head *head;
+	struct jump_label_entry *e;
+	u32 hash;
+
+	e = get_jump_label_entry(key);
+	if (e)
+		return ERR_PTR(-EEXIST);
+
+	e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+
+	hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+	head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+	e->key = key;
+	e->table = table;
+	e->nr_entries = nr_entries;
+	INIT_HLIST_HEAD(&(e->modules));
+	hlist_add_head(&e->hlist, head);
+	return e;
+}
+
+static int
+build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
+{
+	struct jump_entry *iter, *iter_begin;
+	struct jump_label_entry *entry;
+	int count;
+
+	sort_jump_label_entries(start, stop);
+	iter = start;
+	while (iter < stop) {
+		entry = get_jump_label_entry(iter->key);
+		if (!entry) {
+			iter_begin = iter;
+			count = 0;
+			while ((iter < stop) &&
+				(iter->key == iter_begin->key)) {
+				iter++;
+				count++;
+			}
+			entry = add_jump_label_entry(iter_begin->key,
+							count, iter_begin);
+			if (IS_ERR(entry))
+				return PTR_ERR(entry);
+		} else {
+			WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/***
+ * jump_label_update - update jump label text
+ * @key - key value associated with a jump label
+ * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
+ *
+ * Will enable/disable the jump for jump label @key, depending on the
+ * value of @type.
+ *
+ */
+
+void jump_label_update(unsigned long key, enum jump_label_type type)
+{
+	struct jump_entry *iter;
+	struct jump_label_entry *entry;
+	struct hlist_node *module_node;
+	struct jump_label_module_entry *e_module;
+	int count;
+
+	mutex_lock(&jump_label_mutex);
+	entry = get_jump_label_entry((jump_label_t)key);
+	if (entry) {
+		count = entry->nr_entries;
+		iter = entry->table;
+		while (count--) {
+			if (kernel_text_address(iter->code))
+				arch_jump_label_transform(iter, type);
+			iter++;
+		}
+		/* enable/disable jump labels in modules */
+		hlist_for_each_entry(e_module, module_node, &(entry->modules),
+							hlist) {
+			count = e_module->nr_entries;
+			iter = e_module->table;
+			while (count--) {
+				if (kernel_text_address(iter->code))
+					arch_jump_label_transform(iter, type);
+				iter++;
+			}
+		}
+	}
+	mutex_unlock(&jump_label_mutex);
+}
+
+static int addr_conflict(struct jump_entry *entry, void *start, void *end)
+{
+	if (entry->code <= (unsigned long)end &&
+		entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+		return 1;
+
+	return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int module_conflict(void *start, void *end)
+{
+	struct hlist_head *head;
+	struct hlist_node *node, *node_next, *module_node, *module_node_next;
+	struct jump_label_entry *e;
+	struct jump_label_module_entry *e_module;
+	struct jump_entry *iter;
+	int i, count;
+	int conflict = 0;
+
+	for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+		head = &jump_label_table[i];
+		hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+			hlist_for_each_entry_safe(e_module, module_node,
+						module_node_next,
+						&(e->modules), hlist) {
+				count = e_module->nr_entries;
+				iter = e_module->table;
+				while (count--) {
+					if (addr_conflict(iter, start, end)) {
+						conflict = 1;
+						goto out;
+					}
+					iter++;
+				}
+			}
+		}
+	}
+out:
+	return conflict;
+}
+
+#endif
+
+/***
+ * jump_label_text_reserved - check if addr range is reserved
+ * @start: start text addr
+ * @end: end text addr
+ *
+ * checks if the text addr located between @start and @end
+ * overlaps with any of the jump label patch addresses. Code
+ * that wants to modify kernel text should first verify that
+ * it does not overlap with any of the jump label addresses.
+ *
+ * returns 1 if there is an overlap, 0 otherwise
+ */
+int jump_label_text_reserved(void *start, void *end)
+{
+	struct jump_entry *iter;
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __stop___jump_table;
+	int conflict = 0;
+
+	mutex_lock(&jump_label_mutex);
+	iter = iter_start;
+	while (iter < iter_stop) {
+		if (addr_conflict(iter, start, end)) {
+			conflict = 1;
+			goto out;
+		}
+		iter++;
+	}
+
+	/* now check modules */
+#ifdef CONFIG_MODULES
+	conflict = module_conflict(start, end);
+#endif
+out:
+	mutex_unlock(&jump_label_mutex);
+	return conflict;
+}
+
+static __init int init_jump_label(void)
+{
+	int ret;
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __stop___jump_table;
+	struct jump_entry *iter;
+
+	mutex_lock(&jump_label_mutex);
+	ret = build_jump_label_hashtable(__start___jump_table,
+					 __stop___jump_table);
+	iter = iter_start;
+	while (iter < iter_stop) {
+		arch_jump_label_text_poke_early(iter->code);
+		iter++;
+	}
+	mutex_unlock(&jump_label_mutex);
+	return ret;
+}
+early_initcall(init_jump_label);
+
+#ifdef CONFIG_MODULES
+
+static struct jump_label_module_entry *
+add_jump_label_module_entry(struct jump_label_entry *entry,
+			    struct jump_entry *iter_begin,
+			    int count, struct module *mod)
+{
+	struct jump_label_module_entry *e;
+
+	e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+	e->mod = mod;
+	e->nr_entries = count;
+	e->table = iter_begin;
+	hlist_add_head(&e->hlist, &entry->modules);
+	return e;
+}
+
+static int add_jump_label_module(struct module *mod)
+{
+	struct jump_entry *iter, *iter_begin;
+	struct jump_label_entry *entry;
+	struct jump_label_module_entry *module_entry;
+	int count;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return 0;
+
+	sort_jump_label_entries(mod->jump_entries,
+				mod->jump_entries + mod->num_jump_entries);
+	iter = mod->jump_entries;
+	while (iter < mod->jump_entries + mod->num_jump_entries) {
+		entry = get_jump_label_entry(iter->key);
+		iter_begin = iter;
+		count = 0;
+		while ((iter < mod->jump_entries + mod->num_jump_entries) &&
+			(iter->key == iter_begin->key)) {
+			iter++;
+			count++;
+		}
+		if (!entry) {
+			entry = add_jump_label_entry(iter_begin->key, 0, NULL);
+			if (IS_ERR(entry))
+				return PTR_ERR(entry);
+		}
+		module_entry = add_jump_label_module_entry(entry, iter_begin,
+							   count, mod);
+		if (IS_ERR(module_entry))
+			return PTR_ERR(module_entry);
+	}
+	return 0;
+}
+
+static void remove_jump_label_module(struct module *mod)
+{
+	struct hlist_head *head;
+	struct hlist_node *node, *node_next, *module_node, *module_node_next;
+	struct jump_label_entry *e;
+	struct jump_label_module_entry *e_module;
+	int i;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return;
+
+	for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+		head = &jump_label_table[i];
+		hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+			hlist_for_each_entry_safe(e_module, module_node,
+						  module_node_next,
+						  &(e->modules), hlist) {
+				if (e_module->mod == mod) {
+					hlist_del(&e_module->hlist);
+					kfree(e_module);
+				}
+			}
+			if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
+				hlist_del(&e->hlist);
+				kfree(e);
+			}
+		}
+	}
+}
+
+static int
+jump_label_module_notify(struct notifier_block *self, unsigned long val,
+			 void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		mutex_lock(&jump_label_mutex);
+		ret = add_jump_label_module(mod);
+		if (ret)
+			remove_jump_label_module(mod);
+		mutex_unlock(&jump_label_mutex);
+		break;
+	case MODULE_STATE_GOING:
+		mutex_lock(&jump_label_mutex);
+		remove_jump_label_module(mod);
+		mutex_unlock(&jump_label_mutex);
+		break;
+	}
+	return ret;
+}
+
+/***
+ * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
+ * @mod: module to patch
+ *
+ * Allow for run-time selection of the optimal nops. Before the module
+ * loads patch these with arch_get_jump_label_nop(), which is specified by
+ * the arch specific jump label code.
+ */
+void jump_label_apply_nops(struct module *mod)
+{
+	struct jump_entry *iter;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return;
+
+	iter = mod->jump_entries;
+	while (iter < mod->jump_entries + mod->num_jump_entries) {
+		arch_jump_label_text_poke_early(iter->code);
+		iter++;
+	}
+}
+
+struct notifier_block jump_label_module_nb = {
+	.notifier_call = jump_label_module_notify,
+	.priority = 0,
+};
+
+static __init int init_jump_label_module(void)
+{
+	return register_module_notifier(&jump_label_module_nb);
+}
+early_initcall(init_jump_label_module);
+
+#endif /* CONFIG_MODULES */
+
+#endif
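jump_label_text_reserved() above is the hook other text-patching code is expected to call before it modifies kernel text; register_kprobe() starts doing exactly that later in this diff. As a minimal sketch of such a caller (not part of the patch; my_can_patch_text() is a hypothetical helper):

#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/types.h>

/* Return 1 if [addr, addr+len) is safe to patch, 0 otherwise. */
static int my_can_patch_text(void *addr, size_t len)
{
	/* must be real kernel text */
	if (!kernel_text_address((unsigned long)addr))
		return 0;

	/* must not overlap a jump label patch site */
	if (jump_label_text_reserved(addr, addr + len - 1))
		return 0;

	return 1;
}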
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 6b5580c57644..01a0700e873f 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -365,8 +365,6 @@ static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
 	n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
 	n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
 
-	if (n)
-		sg_mark_end(sgl + n - 1);
 	return n;
 }
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..ec4210c6501e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
 #include <linux/memory.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/jump_label.h>
 
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p)
  * Return an optimized kprobe whose optimizing code replaces
  * instructions including addr (exclude breakpoint).
  */
-struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
 {
 	int i;
 	struct kprobe *p = NULL;
@@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 
 void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 			 struct hlist_head **head, unsigned long *flags)
+__acquires(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
 	spinlock_t *hlist_lock;
@@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 
 static void __kprobes kretprobe_table_lock(unsigned long hash,
 	unsigned long *flags)
+__acquires(hlist_lock)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
 
 void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
 	unsigned long *flags)
+__releases(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
 	spinlock_t *hlist_lock;
@@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
 	spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
-void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+static void __kprobes kretprobe_table_unlock(unsigned long hash,
+	unsigned long *flags)
+__releases(hlist_lock)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p)
 	preempt_disable();
 	if (!kernel_text_address((unsigned long) p->addr) ||
 	    in_kprobes_functions((unsigned long) p->addr) ||
-	    ftrace_text_reserved(p->addr, p->addr)) {
+	    ftrace_text_reserved(p->addr, p->addr) ||
+	    jump_label_text_reserved(p->addr, p->addr)) {
 		preempt_enable();
 		return -EINVAL;
 	}
@@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
 	if (num <= 0)
 		return -EINVAL;
 	for (i = 0; i < num; i++) {
-		unsigned long addr;
+		unsigned long addr, offset;
 		jp = jps[i];
 		addr = arch_deref_entry_point(jp->entry);
 
-		if (!kernel_text_address(addr))
-			ret = -EINVAL;
-		else {
-			/* Todo: Verify probepoint is a function entry point */
+		/* Verify probepoint is a function entry point */
+		if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
+		    offset == 0) {
 			jp->kp.pre_handler = setjmp_pre_handler;
 			jp->kp.break_handler = longjmp_break_handler;
 			ret = register_kprobe(&jp->kp);
-		}
+		} else
+			ret = -EINVAL;
+
 		if (ret < 0) {
 			if (i > 0)
 				unregister_jprobes(jps, i);
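The register_jprobes() change above replaces the bare kernel_text_address() test with a real function-entry check via kallsyms_lookup_size_offset(). In isolation, that test amounts to the following sketch (my_is_function_entry() is a hypothetical name, not part of the patch):

#include <linux/kallsyms.h>
#include <linux/types.h>

/* An address qualifies only if kallsyms resolves it at offset 0 of its symbol. */
static bool my_is_function_entry(unsigned long addr)
{
	unsigned long offset;

	if (!kallsyms_lookup_size_offset(addr, NULL, &offset))
		return false;

	return offset == 0;
}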
diff --git a/kernel/module.c b/kernel/module.c
index d0b5f8db11b4..2df46301a7a4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
 #include <linux/async.h>
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
+#include <linux/jump_label.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
@@ -1537,6 +1538,7 @@ static int __unlink_module(void *_mod)
 {
 	struct module *mod = _mod;
 	list_del(&mod->list);
+	module_bug_cleanup(mod);
 	return 0;
 }
 
@@ -2308,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 					 sizeof(*mod->tracepoints),
 					 &mod->num_tracepoints);
 #endif
+#ifdef HAVE_JUMP_LABEL
+	mod->jump_entries = section_objs(info, "__jump_table",
+					sizeof(*mod->jump_entries),
+					&mod->num_jump_entries);
+#endif
 #ifdef CONFIG_EVENT_TRACING
 	mod->trace_events = section_objs(info, "_ftrace_events",
 					 sizeof(*mod->trace_events),
@@ -2625,6 +2632,7 @@ static struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto ddebug;
 
+	module_bug_finalize(info.hdr, info.sechdrs, mod);
 	list_add_rcu(&mod->list, &modules);
 	mutex_unlock(&module_mutex);
 
@@ -2650,6 +2658,8 @@ static struct module *load_module(void __user *umod,
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	module_bug_cleanup(mod);
+
  ddebug:
 	if (!mod->taints)
 		dynamic_debug_remove(info.debug);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fc512684423f..1ec3916ffef0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
-/*
- * Each CPU has a list of per CPU events:
- */
-static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
-int perf_max_events __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
 static atomic_t nr_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
 /*
  * perf event paranoia level:
  *  -1 - not paranoid at all
@@ -67,22 +61,6 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_event_id;
 
-/*
- * Lock for (sysadmin-configurable) event reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
-
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
-{
-	return NULL;
-}
-
-void __weak hw_perf_disable(void)		{ barrier(); }
-void __weak hw_perf_enable(void)		{ barrier(); }
-
 void __weak perf_event_print_debug(void)	{ }
 
 extern __weak const char *perf_pmu_name(void)
@@ -90,18 +68,36 @@ extern __weak const char *perf_pmu_name(void)
 	return "pmu";
 }
 
-static DEFINE_PER_CPU(int, perf_disable_count);
+void perf_pmu_disable(struct pmu *pmu)
+{
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!(*count)++)
+		pmu->pmu_disable(pmu);
+}
 
-void perf_disable(void)
+void perf_pmu_enable(struct pmu *pmu)
 {
-	if (!__get_cpu_var(perf_disable_count)++)
-		hw_perf_disable();
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!--(*count))
+		pmu->pmu_enable(pmu);
 }
 
-void perf_enable(void)
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_pmu_rotate_start(struct pmu *pmu)
 {
-	if (!--__get_cpu_var(perf_disable_count))
-		hw_perf_enable();
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+	struct list_head *head = &__get_cpu_var(rotation_list);
+
+	WARN_ON(!irqs_disabled());
+
+	if (list_empty(&cpuctx->rotation_list))
+		list_add(&cpuctx->rotation_list, head);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -156,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
  * the context could get moved to another task.
  */
 static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
 	struct perf_event_context *ctx;
 
 	rcu_read_lock();
- retry:
-	ctx = rcu_dereference(task->perf_event_ctxp);
+retry:
+	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 	if (ctx) {
 		/*
 		 * If this context is a clone of another, it might
@@ -175,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 		 * can't get swapped on us any more.
 		 */
 		raw_spin_lock_irqsave(&ctx->lock, *flags);
-		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 			goto retry;
 		}
@@ -194,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
  * can't get swapped to another task. This also increments its
  * reference count so that the context can't get freed.
  */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
 {
 	struct perf_event_context *ctx;
 	unsigned long flags;
 
-	ctx = perf_lock_task_context(task, &flags);
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -307,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	}
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
+	if (!ctx->nr_events)
+		perf_pmu_rotate_start(ctx->pmu);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
@@ -441,7 +440,7 @@ event_sched_out(struct perf_event *event,
 		event->state = PERF_EVENT_STATE_OFF;
 	}
 	event->tstamp_stopped = ctx->time;
-	event->pmu->disable(event);
+	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
 	if (!is_software_event(event))
@@ -471,6 +470,12 @@ group_sched_out(struct perf_event *group_event,
 	cpuctx->exclusive = 0;
 }
 
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -479,9 +484,9 @@ group_sched_out(struct perf_event *group_event,
  */
 static void __perf_event_remove_from_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -492,27 +497,11 @@ static void __perf_event_remove_from_context(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level.
-	 */
-	perf_disable();
 
 	event_sched_out(event, cpuctx, ctx);
 
 	list_del_event(event, ctx);
 
-	if (!ctx->task) {
-		/*
-		 * Allow more per task events with respect to the
-		 * reservation:
-		 */
-		cpuctx->max_pertask =
-			min(perf_max_events - ctx->nr_events,
-			    perf_max_events - perf_reserved_percpu);
-	}
-
-	perf_enable();
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -577,8 +566,8 @@ retry:
 static void __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a per-task event, need to check whether this
@@ -633,7 +622,7 @@ void perf_event_disable(struct perf_event *event)
 		return;
 	}
 
- retry:
+retry:
 	task_oncpu_function_call(task, __perf_event_disable, event);
 
 	raw_spin_lock_irq(&ctx->lock);
@@ -672,7 +661,7 @@ event_sched_in(struct perf_event *event,
 	 */
 	smp_wmb();
 
-	if (event->pmu->enable(event)) {
+	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
 		return -EAGAIN;
@@ -696,22 +685,15 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	const struct pmu *pmu = group_event->pmu;
-	bool txn = false;
+	struct pmu *pmu = group_event->pmu;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	/* Check if group transaction availabe */
-	if (pmu->start_txn)
-		txn = true;
-
-	if (txn)
-		pmu->start_txn(pmu);
+	pmu->start_txn(pmu);
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
-		if (txn)
-			pmu->cancel_txn(pmu);
+		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
 
@@ -725,7 +707,7 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!txn || !pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu))
 		return 0;
 
 group_error:
@@ -740,8 +722,7 @@ group_error:
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
-	if (txn)
-		pmu->cancel_txn(pmu);
+	pmu->cancel_txn(pmu);
 
 	return -EAGAIN;
 }
@@ -794,10 +775,10 @@ static void add_event_to_ctx(struct perf_event *event,
  */
 static void __perf_install_in_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
 	/*
@@ -817,12 +798,6 @@ static void __perf_install_in_context(void *info)
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level. NOP for non NMI based events.
-	 */
-	perf_disable();
-
 	add_event_to_ctx(event, ctx);
 
 	if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -860,12 +835,7 @@ static void __perf_install_in_context(void *info)
 		}
 	}
 
-	if (!err && !ctx->task && cpuctx->max_pertask)
-		cpuctx->max_pertask--;
-
- unlock:
-	perf_enable();
-
+unlock:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -888,6 +858,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
+	event->ctx = ctx;
+
 	if (!task) {
 		/*
 		 * Per cpu events are installed via an smp call and
@@ -936,10 +908,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	event->tstamp_enabled = ctx->time - event->total_time_enabled;
-	list_for_each_entry(sub, &event->sibling_list, group_entry)
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 			sub->tstamp_enabled =
 				ctx->time - sub->total_time_enabled;
+		}
+	}
 }
 
 /*
@@ -948,9 +922,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 static void __perf_event_enable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
 	/*
@@ -984,12 +958,10 @@ static void __perf_event_enable(void *info)
 	if (!group_can_go_on(event, cpuctx, 1)) {
 		err = -EEXIST;
 	} else {
-		perf_disable();
 		if (event == leader)
 			err = group_sched_in(event, cpuctx, ctx);
 		else
 			err = event_sched_in(event, cpuctx, ctx);
-		perf_enable();
 	}
 
 	if (err) {
@@ -1005,7 +977,7 @@ static void __perf_event_enable(void *info)
 		}
 	}
 
- unlock:
+unlock:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1046,7 +1018,7 @@ void perf_event_enable(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_ERROR)
 		event->state = PERF_EVENT_STATE_OFF;
 
- retry:
+retry:
 	raw_spin_unlock_irq(&ctx->lock);
 	task_oncpu_function_call(task, __perf_event_enable, event);
 
@@ -1066,7 +1038,7 @@ void perf_event_enable(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_OFF)
 		__perf_event_mark_enabled(event, ctx);
 
- out:
+out:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -1097,26 +1069,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	struct perf_event *event;
 
 	raw_spin_lock(&ctx->lock);
+	perf_pmu_disable(ctx->pmu);
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
 
-	perf_disable();
 	if (!ctx->nr_active)
-		goto out_enable;
+		goto out;
 
-	if (event_type & EVENT_PINNED)
+	if (event_type & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
+	}
 
-	if (event_type & EVENT_FLEXIBLE)
+	if (event_type & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-
- out_enable:
-	perf_enable();
- out:
+	}
+out:
+	perf_pmu_enable(ctx->pmu);
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1214,34 +1186,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next)
+void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+				  struct task_struct *next)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
+	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+	if (likely(!ctx))
+		return;
 
-	if (likely(!ctx || !cpuctx->task_ctx))
+	cpuctx = __get_cpu_context(ctx);
+	if (!cpuctx->task_ctx)
 		return;
 
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
-	next_ctx = next->perf_event_ctxp;
+	next_ctx = next->perf_event_ctxp[ctxn];
 	if (parent && next_ctx &&
 	    rcu_dereference(next_ctx->parent_ctx) == parent) {
 		/*
@@ -1260,8 +1223,8 @@ void perf_event_task_sched_out(struct task_struct *task,
 		 * XXX do we need a memory barrier of sorts
 		 * wrt to rcu_dereference() of perf_event_ctxp
 		 */
-		task->perf_event_ctxp = next_ctx;
-		next->perf_event_ctxp = ctx;
+		task->perf_event_ctxp[ctxn] = next_ctx;
+		next->perf_event_ctxp[ctxn] = ctx;
 		ctx->task = next;
 		next_ctx->task = task;
 		do_switch = 0;
@@ -1279,10 +1242,35 @@ void perf_event_task_sched_out(struct task_struct *task,
 	}
 }
 
+#define for_each_task_context_nr(ctxn)					\
+	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+			       struct task_struct *next)
+{
+	int ctxn;
+
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+	for_each_task_context_nr(ctxn)
+		perf_event_context_sched_out(task, ctxn, next);
+}
+
 static void task_ctx_sched_out(struct perf_event_context *ctx,
 			       enum event_type_t event_type)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	if (!cpuctx->task_ctx)
 		return;
@@ -1355,9 +1343,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
-		if (group_can_go_on(event, cpuctx, can_add_hw))
+		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
+		}
 	}
 }
 
@@ -1373,8 +1362,6 @@ ctx_sched_in(struct perf_event_context *ctx,
 
 	ctx->timestamp = perf_clock();
 
-	perf_disable();
-
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -1386,8 +1373,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (event_type & EVENT_FLEXIBLE)
 		ctx_flexible_sched_in(ctx, cpuctx);
 
-	perf_enable();
- out:
+out:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1399,43 +1385,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
-static void task_ctx_sched_in(struct task_struct *task,
+static void task_ctx_sched_in(struct perf_event_context *ctx,
 			      enum event_type_t event_type)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_cpu_context *cpuctx;
 
-	if (likely(!ctx))
-		return;
+	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
+
 	ctx_sched_in(ctx, cpuctx, event_type);
 	cpuctx->task_ctx = ctx;
 }
-/*
- * Called from scheduler to add the events of the current task
- * with interrupts disabled.
- *
- * We restore the event value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * keep the event running.
- */
-void perf_event_task_sched_in(struct task_struct *task)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
 
-	if (likely(!ctx))
-		return;
+void perf_event_context_sched_in(struct perf_event_context *ctx)
+{
+	struct perf_cpu_context *cpuctx;
 
+	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	perf_disable();
-
+	perf_pmu_disable(ctx->pmu);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -1449,7 +1420,37 @@ void perf_event_task_sched_in(struct task_struct *task)
 
 	cpuctx->task_ctx = ctx;
 
-	perf_enable();
+	/*
+	 * Since these rotations are per-cpu, we need to ensure the
+	 * cpu-context we got scheduled on is actually rotating.
+	 */
+	perf_pmu_rotate_start(ctx->pmu);
+	perf_pmu_enable(ctx->pmu);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void perf_event_task_sched_in(struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (likely(!ctx))
+			continue;
+
+		perf_event_context_sched_in(ctx);
+	}
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1529,22 +1530,6 @@ do { \
 	return div64_u64(dividend, divisor);
 }
 
-static void perf_event_stop(struct perf_event *event)
-{
-	if (!event->pmu->stop)
-		return event->pmu->disable(event);
-
-	return event->pmu->stop(event);
-}
-
-static int perf_event_start(struct perf_event *event)
-{
-	if (!event->pmu->start)
-		return event->pmu->enable(event);
-
-	return event->pmu->start(event);
-}
-
 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -1564,15 +1549,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 	hwc->sample_period = sample_period;
 
 	if (local64_read(&hwc->period_left) > 8*sample_period) {
-		perf_disable();
-		perf_event_stop(event);
+		event->pmu->stop(event, PERF_EF_UPDATE);
 		local64_set(&hwc->period_left, 0);
-		perf_event_start(event);
-		perf_enable();
+		event->pmu->start(event, PERF_EF_RELOAD);
 	}
 }
 
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
@@ -1597,23 +1580,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
-			perf_disable();
-			event->pmu->unthrottle(event);
-			perf_enable();
+			event->pmu->start(event, 0);
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
-		perf_disable();
 		event->pmu->read(event);
 		now = local64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
 		hwc->freq_count_stamp = now;
 
 		if (delta > 0)
-			perf_adjust_period(event, TICK_NSEC, delta);
-		perf_enable();
+			perf_adjust_period(event, period, delta);
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1631,32 +1610,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	raw_spin_unlock(&ctx->lock);
 }
 
-void perf_event_task_tick(struct task_struct *curr)
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-	int rotate = 0;
-
-	if (!atomic_read(&nr_events))
-		return;
+	u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
+	struct perf_event_context *ctx = NULL;
+	int rotate = 0, remove = 1;
 
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	if (cpuctx->ctx.nr_events &&
-	    cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-		rotate = 1;
+	if (cpuctx->ctx.nr_events) {
+		remove = 0;
+		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+			rotate = 1;
+	}
 
-	ctx = curr->perf_event_ctxp;
-	if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
-		rotate = 1;
+	ctx = cpuctx->task_ctx;
+	if (ctx && ctx->nr_events) {
+		remove = 0;
+		if (ctx->nr_events != ctx->nr_active)
+			rotate = 1;
+	}
 
-	perf_ctx_adjust_freq(&cpuctx->ctx);
+	perf_pmu_disable(cpuctx->ctx.pmu);
+	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
 	if (ctx)
-		perf_ctx_adjust_freq(ctx);
+		perf_ctx_adjust_freq(ctx, interval);
 
 	if (!rotate)
-		return;
+		goto done;
 
-	perf_disable();
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1667,8 +1652,27 @@ void perf_event_task_tick(struct task_struct *curr)
1667 1652
1668 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1653 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1669 if (ctx) 1654 if (ctx)
1670 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1655 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1671 perf_enable(); 1656
1657done:
1658 if (remove)
1659 list_del_init(&cpuctx->rotation_list);
1660
1661 perf_pmu_enable(cpuctx->ctx.pmu);
1662}
1663
1664void perf_event_task_tick(void)
1665{
1666 struct list_head *head = &__get_cpu_var(rotation_list);
1667 struct perf_cpu_context *cpuctx, *tmp;
1668
1669 WARN_ON(!irqs_disabled());
1670
1671 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1672 if (cpuctx->jiffies_interval == 1 ||
1673 !(jiffies % cpuctx->jiffies_interval))
1674 perf_rotate_context(cpuctx);
1675 }
1672} 1676}
1673 1677
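perf_event_task_tick() above walks the per-cpu rotation_list and rotates a context only when its jiffies_interval divides the current jiffies value, so slower PMUs can rotate less often than once per tick. A small standalone C model of that gating, with hypothetical names:

/* Sketch: the "rotate every N ticks" gate used in perf_event_task_tick(). */
#include <stdio.h>

struct fake_cpuctx {
        const char *name;
        unsigned long jiffies_interval;  /* 1 = rotate on every tick */
};

static void maybe_rotate(struct fake_cpuctx *ctx, unsigned long jiffies)
{
        if (ctx->jiffies_interval == 1 || !(jiffies % ctx->jiffies_interval))
                printf("tick %lu: rotate %s\n", jiffies, ctx->name);
}

int main(void)
{
        struct fake_cpuctx fast = { "fast-pmu", 1 };
        struct fake_cpuctx slow = { "slow-pmu", 4 };

        for (unsigned long jiffies = 1; jiffies <= 8; jiffies++) {
                maybe_rotate(&fast, jiffies);
                maybe_rotate(&slow, jiffies);
        }
        return 0;
}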
1674static int event_enable_on_exec(struct perf_event *event, 1678static int event_enable_on_exec(struct perf_event *event,
@@ -1690,20 +1694,18 @@ static int event_enable_on_exec(struct perf_event *event,
1690 * Enable all of a task's events that have been marked enable-on-exec. 1694 * Enable all of a task's events that have been marked enable-on-exec.
1691 * This expects task == current. 1695 * This expects task == current.
1692 */ 1696 */
1693static void perf_event_enable_on_exec(struct task_struct *task) 1697static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1694{ 1698{
1695 struct perf_event_context *ctx;
1696 struct perf_event *event; 1699 struct perf_event *event;
1697 unsigned long flags; 1700 unsigned long flags;
1698 int enabled = 0; 1701 int enabled = 0;
1699 int ret; 1702 int ret;
1700 1703
1701 local_irq_save(flags); 1704 local_irq_save(flags);
1702 ctx = task->perf_event_ctxp;
1703 if (!ctx || !ctx->nr_events) 1705 if (!ctx || !ctx->nr_events)
1704 goto out; 1706 goto out;
1705 1707
1706 __perf_event_task_sched_out(ctx); 1708 task_ctx_sched_out(ctx, EVENT_ALL);
1707 1709
1708 raw_spin_lock(&ctx->lock); 1710 raw_spin_lock(&ctx->lock);
1709 1711
@@ -1727,8 +1729,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1727 1729
1728 raw_spin_unlock(&ctx->lock); 1730 raw_spin_unlock(&ctx->lock);
1729 1731
1730 perf_event_task_sched_in(task); 1732 perf_event_context_sched_in(ctx);
1731 out: 1733out:
1732 local_irq_restore(flags); 1734 local_irq_restore(flags);
1733} 1735}
1734 1736
@@ -1737,9 +1739,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1737 */ 1739 */
1738static void __perf_event_read(void *info) 1740static void __perf_event_read(void *info)
1739{ 1741{
1740 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1741 struct perf_event *event = info; 1742 struct perf_event *event = info;
1742 struct perf_event_context *ctx = event->ctx; 1743 struct perf_event_context *ctx = event->ctx;
1744 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1743 1745
1744 /* 1746 /*
1745 * If this is a task context, we need to check whether it is 1747 * If this is a task context, we need to check whether it is
@@ -1787,11 +1789,219 @@ static u64 perf_event_read(struct perf_event *event)
1787} 1789}
1788 1790
1789/* 1791/*
1790 * Initialize the perf_event context in a task_struct: 1792 * Callchain support
1791 */ 1793 */
1794
1795struct callchain_cpus_entries {
1796 struct rcu_head rcu_head;
1797 struct perf_callchain_entry *cpu_entries[0];
1798};
1799
1800static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1801static atomic_t nr_callchain_events;
1802static DEFINE_MUTEX(callchain_mutex);
1803struct callchain_cpus_entries *callchain_cpus_entries;
1804
1805
1806__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1807 struct pt_regs *regs)
1808{
1809}
1810
1811__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1812 struct pt_regs *regs)
1813{
1814}
1815
1816static void release_callchain_buffers_rcu(struct rcu_head *head)
1817{
1818 struct callchain_cpus_entries *entries;
1819 int cpu;
1820
1821 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1822
1823 for_each_possible_cpu(cpu)
1824 kfree(entries->cpu_entries[cpu]);
1825
1826 kfree(entries);
1827}
1828
1829static void release_callchain_buffers(void)
1830{
1831 struct callchain_cpus_entries *entries;
1832
1833 entries = callchain_cpus_entries;
1834 rcu_assign_pointer(callchain_cpus_entries, NULL);
1835 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1836}
1837
1838static int alloc_callchain_buffers(void)
1839{
1840 int cpu;
1841 int size;
1842 struct callchain_cpus_entries *entries;
1843
1844 /*
1845 * We can't use the percpu allocation API for data that can be
1846 * accessed from NMI. Use a temporary manual per cpu allocation
1847 * until that gets sorted out.
1848 */
1849 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1850 num_possible_cpus();
1851
1852 entries = kzalloc(size, GFP_KERNEL);
1853 if (!entries)
1854 return -ENOMEM;
1855
1856 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1857
1858 for_each_possible_cpu(cpu) {
1859 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1860 cpu_to_node(cpu));
1861 if (!entries->cpu_entries[cpu])
1862 goto fail;
1863 }
1864
1865 rcu_assign_pointer(callchain_cpus_entries, entries);
1866
1867 return 0;
1868
1869fail:
1870 for_each_possible_cpu(cpu)
1871 kfree(entries->cpu_entries[cpu]);
1872 kfree(entries);
1873
1874 return -ENOMEM;
1875}
1876
1877static int get_callchain_buffers(void)
1878{
1879 int err = 0;
1880 int count;
1881
1882 mutex_lock(&callchain_mutex);
1883
1884 count = atomic_inc_return(&nr_callchain_events);
1885 if (WARN_ON_ONCE(count < 1)) {
1886 err = -EINVAL;
1887 goto exit;
1888 }
1889
1890 if (count > 1) {
1891 /* If the allocation failed, give up */
1892 if (!callchain_cpus_entries)
1893 err = -ENOMEM;
1894 goto exit;
1895 }
1896
1897 err = alloc_callchain_buffers();
1898 if (err)
1899 release_callchain_buffers();
1900exit:
1901 mutex_unlock(&callchain_mutex);
1902
1903 return err;
1904}
1905
1906static void put_callchain_buffers(void)
1907{
1908 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1909 release_callchain_buffers();
1910 mutex_unlock(&callchain_mutex);
1911 }
1912}
1913
1914static int get_recursion_context(int *recursion)
1915{
1916 int rctx;
1917
1918 if (in_nmi())
1919 rctx = 3;
1920 else if (in_irq())
1921 rctx = 2;
1922 else if (in_softirq())
1923 rctx = 1;
1924 else
1925 rctx = 0;
1926
1927 if (recursion[rctx])
1928 return -1;
1929
1930 recursion[rctx]++;
1931 barrier();
1932
1933 return rctx;
1934}
1935
1936static inline void put_recursion_context(int *recursion, int rctx)
1937{
1938 barrier();
1939 recursion[rctx]--;
1940}
1941
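get_recursion_context()/put_recursion_context() above keep one recursion counter per execution context (task, softirq, hardirq, NMI), so a callchain capture that interrupts another capture in the same context bails out instead of recursing. A standalone model of that guard, with the in_nmi()/in_irq()/in_softirq() probes faked as plain arguments:

/* Sketch: four recursion slots, one per execution context. */
#include <stdio.h>

#define NR_CONTEXTS 4

static int recursion[NR_CONTEXTS];

static int get_ctx_slot(int in_nmi, int in_irq, int in_softirq)
{
        int rctx = in_nmi ? 3 : in_irq ? 2 : in_softirq ? 1 : 0;

        if (recursion[rctx])
                return -1;              /* already inside this context: bail out */
        recursion[rctx]++;
        return rctx;
}

static void put_ctx_slot(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int first  = get_ctx_slot(0, 1, 0);     /* "hardirq" slot */
        int nested = get_ctx_slot(0, 1, 0);     /* same slot again: rejected */

        printf("first=%d nested=%d\n", first, nested);
        if (first >= 0)
                put_ctx_slot(first);
        return 0;
}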
1942static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1943{
1944 int cpu;
1945 struct callchain_cpus_entries *entries;
1946
1947 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
1948 if (*rctx == -1)
1949 return NULL;
1950
1951 entries = rcu_dereference(callchain_cpus_entries);
1952 if (!entries)
1953 return NULL;
1954
1955 cpu = smp_processor_id();
1956
1957 return &entries->cpu_entries[cpu][*rctx];
1958}
1959
1792static void 1960static void
1793__perf_event_init_context(struct perf_event_context *ctx, 1961put_callchain_entry(int rctx)
1794 struct task_struct *task) 1962{
1963 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
1964}
1965
1966static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1967{
1968 int rctx;
1969 struct perf_callchain_entry *entry;
1970
1971
1972 entry = get_callchain_entry(&rctx);
1973 if (rctx == -1)
1974 return NULL;
1975
1976 if (!entry)
1977 goto exit_put;
1978
1979 entry->nr = 0;
1980
1981 if (!user_mode(regs)) {
1982 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
1983 perf_callchain_kernel(entry, regs);
1984 if (current->mm)
1985 regs = task_pt_regs(current);
1986 else
1987 regs = NULL;
1988 }
1989
1990 if (regs) {
1991 perf_callchain_store(entry, PERF_CONTEXT_USER);
1992 perf_callchain_user(entry, regs);
1993 }
1994
1995exit_put:
1996 put_callchain_entry(rctx);
1997
1998 return entry;
1999}
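perf_callchain() above records the kernel side of the stack first, prefixed with a PERF_CONTEXT_KERNEL marker, then the user side prefixed with PERF_CONTEXT_USER, switching regs to task_pt_regs() when the trap came from kernel mode. A toy model of that entry layout; the marker values and helper names here are stand-ins, not the kernel's definitions:

/* Sketch: how one callchain entry is laid out (markers, then ips). */
#include <stdio.h>
#include <stdint.h>

#define FAKE_CONTEXT_KERNEL ((uint64_t)-128)   /* stand-in marker values */
#define FAKE_CONTEXT_USER   ((uint64_t)-512)
#define MAX_STACK_DEPTH     16

struct fake_callchain {
        uint64_t nr;
        uint64_t ip[MAX_STACK_DEPTH];
};

static void chain_store(struct fake_callchain *c, uint64_t ip)
{
        if (c->nr < MAX_STACK_DEPTH)
                c->ip[c->nr++] = ip;
}

int main(void)
{
        struct fake_callchain chain = { 0 };

        chain_store(&chain, FAKE_CONTEXT_KERNEL);    /* kernel side first */
        chain_store(&chain, 0xffffffff81000010ULL);
        chain_store(&chain, 0xffffffff81000200ULL);

        chain_store(&chain, FAKE_CONTEXT_USER);      /* then the user side */
        chain_store(&chain, 0x400123ULL);

        for (uint64_t i = 0; i < chain.nr; i++)
                printf("%2llu: %#llx\n", (unsigned long long)i,
                       (unsigned long long)chain.ip[i]);
        return 0;
}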
2000
2001/*
2002 * Initialize the perf_event context in a task_struct:
2003 */
2004static void __perf_event_init_context(struct perf_event_context *ctx)
1795{ 2005{
1796 raw_spin_lock_init(&ctx->lock); 2006 raw_spin_lock_init(&ctx->lock);
1797 mutex_init(&ctx->mutex); 2007 mutex_init(&ctx->mutex);
@@ -1799,45 +2009,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1799 INIT_LIST_HEAD(&ctx->flexible_groups); 2009 INIT_LIST_HEAD(&ctx->flexible_groups);
1800 INIT_LIST_HEAD(&ctx->event_list); 2010 INIT_LIST_HEAD(&ctx->event_list);
1801 atomic_set(&ctx->refcount, 1); 2011 atomic_set(&ctx->refcount, 1);
1802 ctx->task = task;
1803} 2012}
1804 2013
1805static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2014static struct perf_event_context *
2015alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1806{ 2016{
1807 struct perf_event_context *ctx; 2017 struct perf_event_context *ctx;
1808 struct perf_cpu_context *cpuctx;
1809 struct task_struct *task;
1810 unsigned long flags;
1811 int err;
1812
1813 if (pid == -1 && cpu != -1) {
1814 /* Must be root to operate on a CPU event: */
1815 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1816 return ERR_PTR(-EACCES);
1817 2018
1818 if (cpu < 0 || cpu >= nr_cpumask_bits) 2019 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1819 return ERR_PTR(-EINVAL); 2020 if (!ctx)
2021 return NULL;
1820 2022
1821 /* 2023 __perf_event_init_context(ctx);
1822 * We could be clever and allow to attach a event to an 2024 if (task) {
1823 * offline CPU and activate it when the CPU comes up, but 2025 ctx->task = task;
1824 * that's for later. 2026 get_task_struct(task);
1825 */ 2027 }
1826 if (!cpu_online(cpu)) 2028 ctx->pmu = pmu;
1827 return ERR_PTR(-ENODEV);
1828 2029
1829 cpuctx = &per_cpu(perf_cpu_context, cpu); 2030 return ctx;
1830 ctx = &cpuctx->ctx; 2031}
1831 get_ctx(ctx);
1832 2032
1833 return ctx; 2033static struct task_struct *
1834 } 2034find_lively_task_by_vpid(pid_t vpid)
2035{
2036 struct task_struct *task;
2037 int err;
1835 2038
1836 rcu_read_lock(); 2039 rcu_read_lock();
1837 if (!pid) 2040 if (!vpid)
1838 task = current; 2041 task = current;
1839 else 2042 else
1840 task = find_task_by_vpid(pid); 2043 task = find_task_by_vpid(vpid);
1841 if (task) 2044 if (task)
1842 get_task_struct(task); 2045 get_task_struct(task);
1843 rcu_read_unlock(); 2046 rcu_read_unlock();
@@ -1857,35 +2060,79 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1857 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2060 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1858 goto errout; 2061 goto errout;
1859 2062
1860 retry: 2063 return task;
1861 ctx = perf_lock_task_context(task, &flags); 2064errout:
2065 put_task_struct(task);
2066 return ERR_PTR(err);
2067
2068}
2069
2070static struct perf_event_context *
2071find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2072{
2073 struct perf_event_context *ctx;
2074 struct perf_cpu_context *cpuctx;
2075 unsigned long flags;
2076 int ctxn, err;
2077
2078 if (!task && cpu != -1) {
2079 /* Must be root to operate on a CPU event: */
2080 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2081 return ERR_PTR(-EACCES);
2082
2083 if (cpu < 0 || cpu >= nr_cpumask_bits)
2084 return ERR_PTR(-EINVAL);
2085
2086 /*
2087 * We could be clever and allow to attach a event to an
2088 * offline CPU and activate it when the CPU comes up, but
2089 * that's for later.
2090 */
2091 if (!cpu_online(cpu))
2092 return ERR_PTR(-ENODEV);
2093
2094 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2095 ctx = &cpuctx->ctx;
2096 get_ctx(ctx);
2097
2098 return ctx;
2099 }
2100
2101 err = -EINVAL;
2102 ctxn = pmu->task_ctx_nr;
2103 if (ctxn < 0)
2104 goto errout;
2105
2106retry:
2107 ctx = perf_lock_task_context(task, ctxn, &flags);
1862 if (ctx) { 2108 if (ctx) {
1863 unclone_ctx(ctx); 2109 unclone_ctx(ctx);
1864 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2110 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1865 } 2111 }
1866 2112
1867 if (!ctx) { 2113 if (!ctx) {
1868 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2114 ctx = alloc_perf_context(pmu, task);
1869 err = -ENOMEM; 2115 err = -ENOMEM;
1870 if (!ctx) 2116 if (!ctx)
1871 goto errout; 2117 goto errout;
1872 __perf_event_init_context(ctx, task); 2118
1873 get_ctx(ctx); 2119 get_ctx(ctx);
1874 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2120
2121 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1875 /* 2122 /*
1876 * We raced with some other task; use 2123 * We raced with some other task; use
1877 * the context they set. 2124 * the context they set.
1878 */ 2125 */
2126 put_task_struct(task);
1879 kfree(ctx); 2127 kfree(ctx);
1880 goto retry; 2128 goto retry;
1881 } 2129 }
1882 get_task_struct(task);
1883 } 2130 }
1884 2131
1885 put_task_struct(task); 2132 put_task_struct(task);
1886 return ctx; 2133 return ctx;
1887 2134
1888 errout: 2135errout:
1889 put_task_struct(task); 2136 put_task_struct(task);
1890 return ERR_PTR(err); 2137 return ERR_PTR(err);
1891} 2138}
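find_get_context() above installs a freshly allocated context with cmpxchg() and, if another thread won the race, frees its own copy and retries with the winner's. The sketch below reproduces that pattern in userspace with the GCC/Clang __sync_val_compare_and_swap() builtin; all names are hypothetical.

/* Sketch: lock-free "install or adopt" of a per-task context pointer. */
#include <stdio.h>
#include <stdlib.h>

struct fake_ctx { int id; };

static struct fake_ctx *task_ctx;       /* stands in for task->perf_event_ctxp[ctxn] */

static struct fake_ctx *install_ctx(int id)
{
        struct fake_ctx *ctx;

retry:
        if (task_ctx)
                return task_ctx;                /* somebody already installed one */

        ctx = malloc(sizeof(*ctx));
        if (!ctx)
                return NULL;
        ctx->id = id;

        if (__sync_val_compare_and_swap(&task_ctx, (struct fake_ctx *)NULL, ctx) != NULL) {
                free(ctx);                      /* lost the race: use the winner's */
                goto retry;
        }
        return ctx;
}

int main(void)
{
        struct fake_ctx *a = install_ctx(1);
        struct fake_ctx *b = install_ctx(2);    /* finds the one already in place */

        printf("a->id=%d b->id=%d same=%d\n", a->id, b->id, a == b);
        free(task_ctx);
        return 0;
}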
@@ -1918,6 +2165,8 @@ static void free_event(struct perf_event *event)
1918 atomic_dec(&nr_comm_events); 2165 atomic_dec(&nr_comm_events);
1919 if (event->attr.task) 2166 if (event->attr.task)
1920 atomic_dec(&nr_task_events); 2167 atomic_dec(&nr_task_events);
2168 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2169 put_callchain_buffers();
1921 } 2170 }
1922 2171
1923 if (event->buffer) { 2172 if (event->buffer) {
@@ -1928,7 +2177,9 @@ static void free_event(struct perf_event *event)
1928 if (event->destroy) 2177 if (event->destroy)
1929 event->destroy(event); 2178 event->destroy(event);
1930 2179
1931 put_ctx(event->ctx); 2180 if (event->ctx)
2181 put_ctx(event->ctx);
2182
1932 call_rcu(&event->rcu_head, free_event_rcu); 2183 call_rcu(&event->rcu_head, free_event_rcu);
1933} 2184}
1934 2185
@@ -2349,6 +2600,9 @@ int perf_event_task_disable(void)
2349 2600
2350static int perf_event_index(struct perf_event *event) 2601static int perf_event_index(struct perf_event *event)
2351{ 2602{
2603 if (event->hw.state & PERF_HES_STOPPED)
2604 return 0;
2605
2352 if (event->state != PERF_EVENT_STATE_ACTIVE) 2606 if (event->state != PERF_EVENT_STATE_ACTIVE)
2353 return 0; 2607 return 0;
2354 2608
@@ -2961,16 +3215,6 @@ void perf_event_do_pending(void)
2961} 3215}
2962 3216
2963/* 3217/*
2964 * Callchain support -- arch specific
2965 */
2966
2967__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2968{
2969 return NULL;
2970}
2971
2972
2973/*
2974 * We assume there is only KVM supporting the callbacks. 3218 * We assume there is only KVM supporting the callbacks.
2975 * Later on, we might change it to a list if there is 3219 * Later on, we might change it to a list if there is
2976 * another virtualization implementation supporting the callbacks. 3220 * another virtualization implementation supporting the callbacks.
@@ -3076,7 +3320,7 @@ again:
3076 if (handle->wakeup != local_read(&buffer->wakeup)) 3320 if (handle->wakeup != local_read(&buffer->wakeup))
3077 perf_output_wakeup(handle); 3321 perf_output_wakeup(handle);
3078 3322
3079 out: 3323out:
3080 preempt_enable(); 3324 preempt_enable();
3081} 3325}
3082 3326
@@ -3464,14 +3708,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3464 struct perf_output_handle handle; 3708 struct perf_output_handle handle;
3465 struct perf_event_header header; 3709 struct perf_event_header header;
3466 3710
3711 /* protect the callchain buffers */
3712 rcu_read_lock();
3713
3467 perf_prepare_sample(&header, data, event, regs); 3714 perf_prepare_sample(&header, data, event, regs);
3468 3715
3469 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3716 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3470 return; 3717 goto exit;
3471 3718
3472 perf_output_sample(&handle, &header, data, event); 3719 perf_output_sample(&handle, &header, data, event);
3473 3720
3474 perf_output_end(&handle); 3721 perf_output_end(&handle);
3722
3723exit:
3724 rcu_read_unlock();
3475} 3725}
3476 3726
3477/* 3727/*
@@ -3585,16 +3835,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3585static void perf_event_task_event(struct perf_task_event *task_event) 3835static void perf_event_task_event(struct perf_task_event *task_event)
3586{ 3836{
3587 struct perf_cpu_context *cpuctx; 3837 struct perf_cpu_context *cpuctx;
3588 struct perf_event_context *ctx = task_event->task_ctx; 3838 struct perf_event_context *ctx;
3839 struct pmu *pmu;
3840 int ctxn;
3589 3841
3590 rcu_read_lock(); 3842 rcu_read_lock();
3591 cpuctx = &get_cpu_var(perf_cpu_context); 3843 list_for_each_entry_rcu(pmu, &pmus, entry) {
3592 perf_event_task_ctx(&cpuctx->ctx, task_event); 3844 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3593 if (!ctx) 3845 perf_event_task_ctx(&cpuctx->ctx, task_event);
3594 ctx = rcu_dereference(current->perf_event_ctxp); 3846
3595 if (ctx) 3847 ctx = task_event->task_ctx;
3596 perf_event_task_ctx(ctx, task_event); 3848 if (!ctx) {
3597 put_cpu_var(perf_cpu_context); 3849 ctxn = pmu->task_ctx_nr;
3850 if (ctxn < 0)
3851 goto next;
3852 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3853 }
3854 if (ctx)
3855 perf_event_task_ctx(ctx, task_event);
3856next:
3857 put_cpu_ptr(pmu->pmu_cpu_context);
3858 }
3598 rcu_read_unlock(); 3859 rcu_read_unlock();
3599} 3860}
3600 3861
@@ -3699,8 +3960,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3699{ 3960{
3700 struct perf_cpu_context *cpuctx; 3961 struct perf_cpu_context *cpuctx;
3701 struct perf_event_context *ctx; 3962 struct perf_event_context *ctx;
3702 unsigned int size;
3703 char comm[TASK_COMM_LEN]; 3963 char comm[TASK_COMM_LEN];
3964 unsigned int size;
3965 struct pmu *pmu;
3966 int ctxn;
3704 3967
3705 memset(comm, 0, sizeof(comm)); 3968 memset(comm, 0, sizeof(comm));
3706 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3969 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3712,21 +3975,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3712 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3975 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3713 3976
3714 rcu_read_lock(); 3977 rcu_read_lock();
3715 cpuctx = &get_cpu_var(perf_cpu_context); 3978 list_for_each_entry_rcu(pmu, &pmus, entry) {
3716 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3979 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3717 ctx = rcu_dereference(current->perf_event_ctxp); 3980 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3718 if (ctx) 3981
3719 perf_event_comm_ctx(ctx, comm_event); 3982 ctxn = pmu->task_ctx_nr;
3720 put_cpu_var(perf_cpu_context); 3983 if (ctxn < 0)
3984 goto next;
3985
3986 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3987 if (ctx)
3988 perf_event_comm_ctx(ctx, comm_event);
3989next:
3990 put_cpu_ptr(pmu->pmu_cpu_context);
3991 }
3721 rcu_read_unlock(); 3992 rcu_read_unlock();
3722} 3993}
3723 3994
3724void perf_event_comm(struct task_struct *task) 3995void perf_event_comm(struct task_struct *task)
3725{ 3996{
3726 struct perf_comm_event comm_event; 3997 struct perf_comm_event comm_event;
3998 struct perf_event_context *ctx;
3999 int ctxn;
4000
4001 for_each_task_context_nr(ctxn) {
4002 ctx = task->perf_event_ctxp[ctxn];
4003 if (!ctx)
4004 continue;
3727 4005
3728 if (task->perf_event_ctxp) 4006 perf_event_enable_on_exec(ctx);
3729 perf_event_enable_on_exec(task); 4007 }
3730 4008
3731 if (!atomic_read(&nr_comm_events)) 4009 if (!atomic_read(&nr_comm_events))
3732 return; 4010 return;
@@ -3828,6 +4106,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3828 char tmp[16]; 4106 char tmp[16];
3829 char *buf = NULL; 4107 char *buf = NULL;
3830 const char *name; 4108 const char *name;
4109 struct pmu *pmu;
4110 int ctxn;
3831 4111
3832 memset(tmp, 0, sizeof(tmp)); 4112 memset(tmp, 0, sizeof(tmp));
3833 4113
@@ -3880,12 +4160,23 @@ got_name:
3880 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4160 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3881 4161
3882 rcu_read_lock(); 4162 rcu_read_lock();
3883 cpuctx = &get_cpu_var(perf_cpu_context); 4163 list_for_each_entry_rcu(pmu, &pmus, entry) {
3884 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4164 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3885 ctx = rcu_dereference(current->perf_event_ctxp); 4165 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3886 if (ctx) 4166 vma->vm_flags & VM_EXEC);
3887 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4167
3888 put_cpu_var(perf_cpu_context); 4168 ctxn = pmu->task_ctx_nr;
4169 if (ctxn < 0)
4170 goto next;
4171
4172 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4173 if (ctx) {
4174 perf_event_mmap_ctx(ctx, mmap_event,
4175 vma->vm_flags & VM_EXEC);
4176 }
4177next:
4178 put_cpu_ptr(pmu->pmu_cpu_context);
4179 }
3889 rcu_read_unlock(); 4180 rcu_read_unlock();
3890 4181
3891 kfree(buf); 4182 kfree(buf);
@@ -3967,8 +4258,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3967 struct hw_perf_event *hwc = &event->hw; 4258 struct hw_perf_event *hwc = &event->hw;
3968 int ret = 0; 4259 int ret = 0;
3969 4260
3970 throttle = (throttle && event->pmu->unthrottle != NULL);
3971
3972 if (!throttle) { 4261 if (!throttle) {
3973 hwc->interrupts++; 4262 hwc->interrupts++;
3974 } else { 4263 } else {
@@ -4036,6 +4325,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4036 * Generic software event infrastructure 4325 * Generic software event infrastructure
4037 */ 4326 */
4038 4327
4328struct swevent_htable {
4329 struct swevent_hlist *swevent_hlist;
4330 struct mutex hlist_mutex;
4331 int hlist_refcount;
4332
4333 /* Recursion avoidance in each contexts */
4334 int recursion[PERF_NR_CONTEXTS];
4335};
4336
4337static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
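The new swevent_htable gathers everything the software-event path needs per CPU: the hashed list of active events plus the recursion counters that used to live in perf_cpu_context. A toy model of resolving a bucket by (type, event_id); the hash below is a stand-in, not the kernel's hashing:

/* Sketch: per-cpu hash table keyed by (type, event_id), one bucket count
 * standing in for each hlist head. */
#include <stdio.h>
#include <stdint.h>

#define SWEVENT_HLIST_BITS 4
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)

struct fake_swhash {
        int bucket_len[SWEVENT_HLIST_SIZE];     /* one per hlist head */
};

static unsigned int swevent_bucket(uint64_t type, uint32_t event_id)
{
        uint64_t val = (type << 32) | event_id;

        return (unsigned int)(val % SWEVENT_HLIST_SIZE);   /* stand-in hash */
}

int main(void)
{
        struct fake_swhash swhash = { { 0 } };  /* one of these would exist per CPU */

        swhash.bucket_len[swevent_bucket(1, 3)]++;   /* "add" two events */
        swhash.bucket_len[swevent_bucket(1, 4)]++;

        for (int i = 0; i < SWEVENT_HLIST_SIZE; i++)
                if (swhash.bucket_len[i])
                        printf("bucket %d: %d event(s)\n", i, swhash.bucket_len[i]);
        return 0;
}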
4338
4039/* 4339/*
4040 * We directly increment event->count and keep a second value in 4340 * We directly increment event->count and keep a second value in
4041 * event->hw.period_left to count intervals. This period event 4341 * event->hw.period_left to count intervals. This period event
@@ -4093,7 +4393,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4093 } 4393 }
4094} 4394}
4095 4395
4096static void perf_swevent_add(struct perf_event *event, u64 nr, 4396static void perf_swevent_event(struct perf_event *event, u64 nr,
4097 int nmi, struct perf_sample_data *data, 4397 int nmi, struct perf_sample_data *data,
4098 struct pt_regs *regs) 4398 struct pt_regs *regs)
4099{ 4399{
@@ -4119,6 +4419,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4119static int perf_exclude_event(struct perf_event *event, 4419static int perf_exclude_event(struct perf_event *event,
4120 struct pt_regs *regs) 4420 struct pt_regs *regs)
4121{ 4421{
4422 if (event->hw.state & PERF_HES_STOPPED)
4423 return 0;
4424
4122 if (regs) { 4425 if (regs) {
4123 if (event->attr.exclude_user && user_mode(regs)) 4426 if (event->attr.exclude_user && user_mode(regs))
4124 return 1; 4427 return 1;
@@ -4165,11 +4468,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4165 4468
4166/* For the read side: events when they trigger */ 4469/* For the read side: events when they trigger */
4167static inline struct hlist_head * 4470static inline struct hlist_head *
4168find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4471find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4169{ 4472{
4170 struct swevent_hlist *hlist; 4473 struct swevent_hlist *hlist;
4171 4474
4172 hlist = rcu_dereference(ctx->swevent_hlist); 4475 hlist = rcu_dereference(swhash->swevent_hlist);
4173 if (!hlist) 4476 if (!hlist)
4174 return NULL; 4477 return NULL;
4175 4478
@@ -4178,7 +4481,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4178 4481
4179/* For the event head insertion and removal in the hlist */ 4482/* For the event head insertion and removal in the hlist */
4180static inline struct hlist_head * 4483static inline struct hlist_head *
4181find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4484find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4182{ 4485{
4183 struct swevent_hlist *hlist; 4486 struct swevent_hlist *hlist;
4184 u32 event_id = event->attr.config; 4487 u32 event_id = event->attr.config;
@@ -4189,7 +4492,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4189 * and release. Which makes the protected version suitable here. 4492 * and release. Which makes the protected version suitable here.
4190 * The context lock guarantees that. 4493 * The context lock guarantees that.
4191 */ 4494 */
4192 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4495 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4193 lockdep_is_held(&event->ctx->lock)); 4496 lockdep_is_held(&event->ctx->lock));
4194 if (!hlist) 4497 if (!hlist)
4195 return NULL; 4498 return NULL;
@@ -4202,23 +4505,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4202 struct perf_sample_data *data, 4505 struct perf_sample_data *data,
4203 struct pt_regs *regs) 4506 struct pt_regs *regs)
4204{ 4507{
4205 struct perf_cpu_context *cpuctx; 4508 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4206 struct perf_event *event; 4509 struct perf_event *event;
4207 struct hlist_node *node; 4510 struct hlist_node *node;
4208 struct hlist_head *head; 4511 struct hlist_head *head;
4209 4512
4210 cpuctx = &__get_cpu_var(perf_cpu_context);
4211
4212 rcu_read_lock(); 4513 rcu_read_lock();
4213 4514 head = find_swevent_head_rcu(swhash, type, event_id);
4214 head = find_swevent_head_rcu(cpuctx, type, event_id);
4215
4216 if (!head) 4515 if (!head)
4217 goto end; 4516 goto end;
4218 4517
4219 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4518 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4220 if (perf_swevent_match(event, type, event_id, data, regs)) 4519 if (perf_swevent_match(event, type, event_id, data, regs))
4221 perf_swevent_add(event, nr, nmi, data, regs); 4520 perf_swevent_event(event, nr, nmi, data, regs);
4222 } 4521 }
4223end: 4522end:
4224 rcu_read_unlock(); 4523 rcu_read_unlock();
@@ -4226,33 +4525,17 @@ end:
4226 4525
4227int perf_swevent_get_recursion_context(void) 4526int perf_swevent_get_recursion_context(void)
4228{ 4527{
4229 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4528 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4230 int rctx;
4231 4529
4232 if (in_nmi()) 4530 return get_recursion_context(swhash->recursion);
4233 rctx = 3;
4234 else if (in_irq())
4235 rctx = 2;
4236 else if (in_softirq())
4237 rctx = 1;
4238 else
4239 rctx = 0;
4240
4241 if (cpuctx->recursion[rctx])
4242 return -1;
4243
4244 cpuctx->recursion[rctx]++;
4245 barrier();
4246
4247 return rctx;
4248} 4531}
4249EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4532EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4250 4533
4251void inline perf_swevent_put_recursion_context(int rctx) 4534void inline perf_swevent_put_recursion_context(int rctx)
4252{ 4535{
4253 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4536 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4254 barrier(); 4537
4255 cpuctx->recursion[rctx]--; 4538 put_recursion_context(swhash->recursion, rctx);
4256} 4539}
4257 4540
4258void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4541void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4278,20 +4561,20 @@ static void perf_swevent_read(struct perf_event *event)
4278{ 4561{
4279} 4562}
4280 4563
4281static int perf_swevent_enable(struct perf_event *event) 4564static int perf_swevent_add(struct perf_event *event, int flags)
4282{ 4565{
4566 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4283 struct hw_perf_event *hwc = &event->hw; 4567 struct hw_perf_event *hwc = &event->hw;
4284 struct perf_cpu_context *cpuctx;
4285 struct hlist_head *head; 4568 struct hlist_head *head;
4286 4569
4287 cpuctx = &__get_cpu_var(perf_cpu_context);
4288
4289 if (hwc->sample_period) { 4570 if (hwc->sample_period) {
4290 hwc->last_period = hwc->sample_period; 4571 hwc->last_period = hwc->sample_period;
4291 perf_swevent_set_period(event); 4572 perf_swevent_set_period(event);
4292 } 4573 }
4293 4574
4294 head = find_swevent_head(cpuctx, event); 4575 hwc->state = !(flags & PERF_EF_START);
4576
4577 head = find_swevent_head(swhash, event);
4295 if (WARN_ON_ONCE(!head)) 4578 if (WARN_ON_ONCE(!head))
4296 return -EINVAL; 4579 return -EINVAL;
4297 4580
@@ -4300,202 +4583,27 @@ static int perf_swevent_enable(struct perf_event *event)
4300 return 0; 4583 return 0;
4301} 4584}
4302 4585
4303static void perf_swevent_disable(struct perf_event *event) 4586static void perf_swevent_del(struct perf_event *event, int flags)
4304{ 4587{
4305 hlist_del_rcu(&event->hlist_entry); 4588 hlist_del_rcu(&event->hlist_entry);
4306} 4589}
4307 4590
4308static void perf_swevent_void(struct perf_event *event) 4591static void perf_swevent_start(struct perf_event *event, int flags)
4309{ 4592{
4593 event->hw.state = 0;
4310} 4594}
4311 4595
4312static int perf_swevent_int(struct perf_event *event) 4596static void perf_swevent_stop(struct perf_event *event, int flags)
4313{ 4597{
4314 return 0; 4598 event->hw.state = PERF_HES_STOPPED;
4315}
4316
4317static const struct pmu perf_ops_generic = {
4318 .enable = perf_swevent_enable,
4319 .disable = perf_swevent_disable,
4320 .start = perf_swevent_int,
4321 .stop = perf_swevent_void,
4322 .read = perf_swevent_read,
4323 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4324};
4325
4326/*
4327 * hrtimer based swevent callback
4328 */
4329
4330static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4331{
4332 enum hrtimer_restart ret = HRTIMER_RESTART;
4333 struct perf_sample_data data;
4334 struct pt_regs *regs;
4335 struct perf_event *event;
4336 u64 period;
4337
4338 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4339 event->pmu->read(event);
4340
4341 perf_sample_data_init(&data, 0);
4342 data.period = event->hw.last_period;
4343 regs = get_irq_regs();
4344
4345 if (regs && !perf_exclude_event(event, regs)) {
4346 if (!(event->attr.exclude_idle && current->pid == 0))
4347 if (perf_event_overflow(event, 0, &data, regs))
4348 ret = HRTIMER_NORESTART;
4349 }
4350
4351 period = max_t(u64, 10000, event->hw.sample_period);
4352 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4353
4354 return ret;
4355} 4599}
4356 4600
4357static void perf_swevent_start_hrtimer(struct perf_event *event)
4358{
4359 struct hw_perf_event *hwc = &event->hw;
4360
4361 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4362 hwc->hrtimer.function = perf_swevent_hrtimer;
4363 if (hwc->sample_period) {
4364 u64 period;
4365
4366 if (hwc->remaining) {
4367 if (hwc->remaining < 0)
4368 period = 10000;
4369 else
4370 period = hwc->remaining;
4371 hwc->remaining = 0;
4372 } else {
4373 period = max_t(u64, 10000, hwc->sample_period);
4374 }
4375 __hrtimer_start_range_ns(&hwc->hrtimer,
4376 ns_to_ktime(period), 0,
4377 HRTIMER_MODE_REL, 0);
4378 }
4379}
4380
4381static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4382{
4383 struct hw_perf_event *hwc = &event->hw;
4384
4385 if (hwc->sample_period) {
4386 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4387 hwc->remaining = ktime_to_ns(remaining);
4388
4389 hrtimer_cancel(&hwc->hrtimer);
4390 }
4391}
4392
4393/*
4394 * Software event: cpu wall time clock
4395 */
4396
4397static void cpu_clock_perf_event_update(struct perf_event *event)
4398{
4399 int cpu = raw_smp_processor_id();
4400 s64 prev;
4401 u64 now;
4402
4403 now = cpu_clock(cpu);
4404 prev = local64_xchg(&event->hw.prev_count, now);
4405 local64_add(now - prev, &event->count);
4406}
4407
4408static int cpu_clock_perf_event_enable(struct perf_event *event)
4409{
4410 struct hw_perf_event *hwc = &event->hw;
4411 int cpu = raw_smp_processor_id();
4412
4413 local64_set(&hwc->prev_count, cpu_clock(cpu));
4414 perf_swevent_start_hrtimer(event);
4415
4416 return 0;
4417}
4418
4419static void cpu_clock_perf_event_disable(struct perf_event *event)
4420{
4421 perf_swevent_cancel_hrtimer(event);
4422 cpu_clock_perf_event_update(event);
4423}
4424
4425static void cpu_clock_perf_event_read(struct perf_event *event)
4426{
4427 cpu_clock_perf_event_update(event);
4428}
4429
4430static const struct pmu perf_ops_cpu_clock = {
4431 .enable = cpu_clock_perf_event_enable,
4432 .disable = cpu_clock_perf_event_disable,
4433 .read = cpu_clock_perf_event_read,
4434};
4435
4436/*
4437 * Software event: task time clock
4438 */
4439
4440static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4441{
4442 u64 prev;
4443 s64 delta;
4444
4445 prev = local64_xchg(&event->hw.prev_count, now);
4446 delta = now - prev;
4447 local64_add(delta, &event->count);
4448}
4449
4450static int task_clock_perf_event_enable(struct perf_event *event)
4451{
4452 struct hw_perf_event *hwc = &event->hw;
4453 u64 now;
4454
4455 now = event->ctx->time;
4456
4457 local64_set(&hwc->prev_count, now);
4458
4459 perf_swevent_start_hrtimer(event);
4460
4461 return 0;
4462}
4463
4464static void task_clock_perf_event_disable(struct perf_event *event)
4465{
4466 perf_swevent_cancel_hrtimer(event);
4467 task_clock_perf_event_update(event, event->ctx->time);
4468
4469}
4470
4471static void task_clock_perf_event_read(struct perf_event *event)
4472{
4473 u64 time;
4474
4475 if (!in_nmi()) {
4476 update_context_time(event->ctx);
4477 time = event->ctx->time;
4478 } else {
4479 u64 now = perf_clock();
4480 u64 delta = now - event->ctx->timestamp;
4481 time = event->ctx->time + delta;
4482 }
4483
4484 task_clock_perf_event_update(event, time);
4485}
4486
4487static const struct pmu perf_ops_task_clock = {
4488 .enable = task_clock_perf_event_enable,
4489 .disable = task_clock_perf_event_disable,
4490 .read = task_clock_perf_event_read,
4491};
4492
4493/* Deref the hlist from the update side */ 4601/* Deref the hlist from the update side */
4494static inline struct swevent_hlist * 4602static inline struct swevent_hlist *
4495swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4603swevent_hlist_deref(struct swevent_htable *swhash)
4496{ 4604{
4497 return rcu_dereference_protected(cpuctx->swevent_hlist, 4605 return rcu_dereference_protected(swhash->swevent_hlist,
4498 lockdep_is_held(&cpuctx->hlist_mutex)); 4606 lockdep_is_held(&swhash->hlist_mutex));
4499} 4607}
4500 4608
4501static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4609static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4506,27 +4614,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4506 kfree(hlist); 4614 kfree(hlist);
4507} 4615}
4508 4616
4509static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4617static void swevent_hlist_release(struct swevent_htable *swhash)
4510{ 4618{
4511 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4619 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4512 4620
4513 if (!hlist) 4621 if (!hlist)
4514 return; 4622 return;
4515 4623
4516 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4624 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4517 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4625 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4518} 4626}
4519 4627
4520static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4628static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4521{ 4629{
4522 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4630 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4523 4631
4524 mutex_lock(&cpuctx->hlist_mutex); 4632 mutex_lock(&swhash->hlist_mutex);
4525 4633
4526 if (!--cpuctx->hlist_refcount) 4634 if (!--swhash->hlist_refcount)
4527 swevent_hlist_release(cpuctx); 4635 swevent_hlist_release(swhash);
4528 4636
4529 mutex_unlock(&cpuctx->hlist_mutex); 4637 mutex_unlock(&swhash->hlist_mutex);
4530} 4638}
4531 4639
4532static void swevent_hlist_put(struct perf_event *event) 4640static void swevent_hlist_put(struct perf_event *event)
@@ -4544,12 +4652,12 @@ static void swevent_hlist_put(struct perf_event *event)
4544 4652
4545static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4653static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4546{ 4654{
4547 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4655 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4548 int err = 0; 4656 int err = 0;
4549 4657
4550 mutex_lock(&cpuctx->hlist_mutex); 4658 mutex_lock(&swhash->hlist_mutex);
4551 4659
4552 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4660 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4553 struct swevent_hlist *hlist; 4661 struct swevent_hlist *hlist;
4554 4662
4555 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4663 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4557,11 +4665,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4557 err = -ENOMEM; 4665 err = -ENOMEM;
4558 goto exit; 4666 goto exit;
4559 } 4667 }
4560 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4668 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4561 } 4669 }
4562 cpuctx->hlist_refcount++; 4670 swhash->hlist_refcount++;
4563 exit: 4671exit:
4564 mutex_unlock(&cpuctx->hlist_mutex); 4672 mutex_unlock(&swhash->hlist_mutex);
4565 4673
4566 return err; 4674 return err;
4567} 4675}
@@ -4585,7 +4693,7 @@ static int swevent_hlist_get(struct perf_event *event)
4585 put_online_cpus(); 4693 put_online_cpus();
4586 4694
4587 return 0; 4695 return 0;
4588 fail: 4696fail:
4589 for_each_possible_cpu(cpu) { 4697 for_each_possible_cpu(cpu) {
4590 if (cpu == failed_cpu) 4698 if (cpu == failed_cpu)
4591 break; 4699 break;
@@ -4596,17 +4704,64 @@ static int swevent_hlist_get(struct perf_event *event)
4596 return err; 4704 return err;
4597} 4705}
4598 4706
4599#ifdef CONFIG_EVENT_TRACING 4707atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4708
4709static void sw_perf_event_destroy(struct perf_event *event)
4710{
4711 u64 event_id = event->attr.config;
4712
4713 WARN_ON(event->parent);
4714
4715 atomic_dec(&perf_swevent_enabled[event_id]);
4716 swevent_hlist_put(event);
4717}
4718
4719static int perf_swevent_init(struct perf_event *event)
4720{
4721 int event_id = event->attr.config;
4722
4723 if (event->attr.type != PERF_TYPE_SOFTWARE)
4724 return -ENOENT;
4725
4726 switch (event_id) {
4727 case PERF_COUNT_SW_CPU_CLOCK:
4728 case PERF_COUNT_SW_TASK_CLOCK:
4729 return -ENOENT;
4730
4731 default:
4732 break;
4733 }
4734
4735 if (event_id > PERF_COUNT_SW_MAX)
4736 return -ENOENT;
4737
4738 if (!event->parent) {
4739 int err;
4740
4741 err = swevent_hlist_get(event);
4742 if (err)
4743 return err;
4744
4745 atomic_inc(&perf_swevent_enabled[event_id]);
4746 event->destroy = sw_perf_event_destroy;
4747 }
4748
4749 return 0;
4750}
4751
4752static struct pmu perf_swevent = {
4753 .task_ctx_nr = perf_sw_context,
4600 4754
4601static const struct pmu perf_ops_tracepoint = { 4755 .event_init = perf_swevent_init,
4602 .enable = perf_trace_enable, 4756 .add = perf_swevent_add,
4603 .disable = perf_trace_disable, 4757 .del = perf_swevent_del,
4604 .start = perf_swevent_int, 4758 .start = perf_swevent_start,
4605 .stop = perf_swevent_void, 4759 .stop = perf_swevent_stop,
4606 .read = perf_swevent_read, 4760 .read = perf_swevent_read,
4607 .unthrottle = perf_swevent_void,
4608}; 4761};
4609 4762
4763#ifdef CONFIG_EVENT_TRACING
4764
4610static int perf_tp_filter_match(struct perf_event *event, 4765static int perf_tp_filter_match(struct perf_event *event,
4611 struct perf_sample_data *data) 4766 struct perf_sample_data *data)
4612{ 4767{
@@ -4650,7 +4805,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4650 4805
4651 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4806 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4652 if (perf_tp_event_match(event, &data, regs)) 4807 if (perf_tp_event_match(event, &data, regs))
4653 perf_swevent_add(event, count, 1, &data, regs); 4808 perf_swevent_event(event, count, 1, &data, regs);
4654 } 4809 }
4655 4810
4656 perf_swevent_put_recursion_context(rctx); 4811 perf_swevent_put_recursion_context(rctx);
@@ -4662,10 +4817,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4662 perf_trace_destroy(event); 4817 perf_trace_destroy(event);
4663} 4818}
4664 4819
4665static const struct pmu *tp_perf_event_init(struct perf_event *event) 4820static int perf_tp_event_init(struct perf_event *event)
4666{ 4821{
4667 int err; 4822 int err;
4668 4823
4824 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4825 return -ENOENT;
4826
4669 /* 4827 /*
4670 * Raw tracepoint data is a severe data leak, only allow root to 4828 * Raw tracepoint data is a severe data leak, only allow root to
4671 * have these. 4829 * have these.
@@ -4673,15 +4831,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4673 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4831 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4674 perf_paranoid_tracepoint_raw() && 4832 perf_paranoid_tracepoint_raw() &&
4675 !capable(CAP_SYS_ADMIN)) 4833 !capable(CAP_SYS_ADMIN))
4676 return ERR_PTR(-EPERM); 4834 return -EPERM;
4677 4835
4678 err = perf_trace_init(event); 4836 err = perf_trace_init(event);
4679 if (err) 4837 if (err)
4680 return NULL; 4838 return err;
4681 4839
4682 event->destroy = tp_perf_event_destroy; 4840 event->destroy = tp_perf_event_destroy;
4683 4841
4684 return &perf_ops_tracepoint; 4842 return 0;
4843}
4844
4845static struct pmu perf_tracepoint = {
4846 .task_ctx_nr = perf_sw_context,
4847
4848 .event_init = perf_tp_event_init,
4849 .add = perf_trace_add,
4850 .del = perf_trace_del,
4851 .start = perf_swevent_start,
4852 .stop = perf_swevent_stop,
4853 .read = perf_swevent_read,
4854};
4855
4856static inline void perf_tp_register(void)
4857{
4858 perf_pmu_register(&perf_tracepoint);
4685} 4859}
4686 4860
4687static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4861static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4709,9 +4883,8 @@ static void perf_event_free_filter(struct perf_event *event)
4709 4883
4710#else 4884#else
4711 4885
4712static const struct pmu *tp_perf_event_init(struct perf_event *event) 4886static inline void perf_tp_register(void)
4713{ 4887{
4714 return NULL;
4715} 4888}
4716 4889
4717static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4890static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4726,105 +4899,389 @@ static void perf_event_free_filter(struct perf_event *event)
4726#endif /* CONFIG_EVENT_TRACING */ 4899#endif /* CONFIG_EVENT_TRACING */
4727 4900
4728#ifdef CONFIG_HAVE_HW_BREAKPOINT 4901#ifdef CONFIG_HAVE_HW_BREAKPOINT
4729static void bp_perf_event_destroy(struct perf_event *event) 4902void perf_bp_event(struct perf_event *bp, void *data)
4730{ 4903{
4731 release_bp_slot(event); 4904 struct perf_sample_data sample;
4905 struct pt_regs *regs = data;
4906
4907 perf_sample_data_init(&sample, bp->attr.bp_addr);
4908
4909 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4910 perf_swevent_event(bp, 1, 1, &sample, regs);
4732} 4911}
4912#endif
4733 4913
4734static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4914/*
4915 * hrtimer based swevent callback
4916 */
4917
4918static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4735{ 4919{
4736 int err; 4920 enum hrtimer_restart ret = HRTIMER_RESTART;
4921 struct perf_sample_data data;
4922 struct pt_regs *regs;
4923 struct perf_event *event;
4924 u64 period;
4737 4925
4738 err = register_perf_hw_breakpoint(bp); 4926 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4739 if (err) 4927 event->pmu->read(event);
4740 return ERR_PTR(err);
4741 4928
4742 bp->destroy = bp_perf_event_destroy; 4929 perf_sample_data_init(&data, 0);
4930 data.period = event->hw.last_period;
4931 regs = get_irq_regs();
4932
4933 if (regs && !perf_exclude_event(event, regs)) {
4934 if (!(event->attr.exclude_idle && current->pid == 0))
4935 if (perf_event_overflow(event, 0, &data, regs))
4936 ret = HRTIMER_NORESTART;
4937 }
4743 4938
4744 return &perf_ops_bp; 4939 period = max_t(u64, 10000, event->hw.sample_period);
4940 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4941
4942 return ret;
4745} 4943}
4746 4944
4747void perf_bp_event(struct perf_event *bp, void *data) 4945static void perf_swevent_start_hrtimer(struct perf_event *event)
4748{ 4946{
4749 struct perf_sample_data sample; 4947 struct hw_perf_event *hwc = &event->hw;
4750 struct pt_regs *regs = data;
4751 4948
4752 perf_sample_data_init(&sample, bp->attr.bp_addr); 4949 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4950 hwc->hrtimer.function = perf_swevent_hrtimer;
4951 if (hwc->sample_period) {
4952 s64 period = local64_read(&hwc->period_left);
4953
4954 if (period) {
4955 if (period < 0)
4956 period = 10000;
4753 4957
4754 if (!perf_exclude_event(bp, regs)) 4958 local64_set(&hwc->period_left, 0);
4755 perf_swevent_add(bp, 1, 1, &sample, regs); 4959 } else {
4960 period = max_t(u64, 10000, hwc->sample_period);
4961 }
4962 __hrtimer_start_range_ns(&hwc->hrtimer,
4963 ns_to_ktime(period), 0,
4964 HRTIMER_MODE_REL_PINNED, 0);
4965 }
4756} 4966}
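perf_swevent_start_hrtimer() above now resumes from hwc->period_left when a previous stop left part of a period outstanding, clamps an overdue (negative) leftover to 10000ns, and otherwise arms the timer for max(10000ns, sample_period). The period pick in isolation, as a runnable sketch with hypothetical names:

/* Sketch: choosing the next hrtimer period for a software event. */
#include <stdio.h>
#include <stdint.h>

static uint64_t pick_timer_period(int64_t period_left, uint64_t sample_period)
{
        if (period_left) {
                if (period_left < 0)
                        return 10000;           /* overdue: fire again soon */
                return (uint64_t)period_left;   /* resume the interrupted period */
        }
        return sample_period > 10000 ? sample_period : 10000;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)pick_timer_period(0, 5000));       /* 10000 */
        printf("%llu\n", (unsigned long long)pick_timer_period(-3, 250000));    /* 10000 */
        printf("%llu\n", (unsigned long long)pick_timer_period(42000, 250000)); /* 42000 */
        return 0;
}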
4757#else 4967
4758static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4968static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4759{ 4969{
4760 return NULL; 4970 struct hw_perf_event *hwc = &event->hw;
4971
4972 if (hwc->sample_period) {
4973 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4974 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4975
4976 hrtimer_cancel(&hwc->hrtimer);
4977 }
4761} 4978}
4762 4979
4763void perf_bp_event(struct perf_event *bp, void *regs) 4980/*
4981 * Software event: cpu wall time clock
4982 */
4983
4984static void cpu_clock_event_update(struct perf_event *event)
4764{ 4985{
4986 s64 prev;
4987 u64 now;
4988
4989 now = local_clock();
4990 prev = local64_xchg(&event->hw.prev_count, now);
4991 local64_add(now - prev, &event->count);
4765} 4992}
4766#endif
4767 4993
4768atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4994static void cpu_clock_event_start(struct perf_event *event, int flags)
4995{
4996 local64_set(&event->hw.prev_count, local_clock());
4997 perf_swevent_start_hrtimer(event);
4998}
4769 4999
4770static void sw_perf_event_destroy(struct perf_event *event) 5000static void cpu_clock_event_stop(struct perf_event *event, int flags)
4771{ 5001{
4772 u64 event_id = event->attr.config; 5002 perf_swevent_cancel_hrtimer(event);
5003 cpu_clock_event_update(event);
5004}
4773 5005
4774 WARN_ON(event->parent); 5006static int cpu_clock_event_add(struct perf_event *event, int flags)
5007{
5008 if (flags & PERF_EF_START)
5009 cpu_clock_event_start(event, flags);
4775 5010
4776 atomic_dec(&perf_swevent_enabled[event_id]); 5011 return 0;
4777 swevent_hlist_put(event);
4778} 5012}
4779 5013
4780static const struct pmu *sw_perf_event_init(struct perf_event *event) 5014static void cpu_clock_event_del(struct perf_event *event, int flags)
4781{ 5015{
4782 const struct pmu *pmu = NULL; 5016 cpu_clock_event_stop(event, flags);
4783 u64 event_id = event->attr.config; 5017}
5018
5019static void cpu_clock_event_read(struct perf_event *event)
5020{
5021 cpu_clock_event_update(event);
5022}
5023
5024static int cpu_clock_event_init(struct perf_event *event)
5025{
5026 if (event->attr.type != PERF_TYPE_SOFTWARE)
5027 return -ENOENT;
5028
5029 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5030 return -ENOENT;
4784 5031
5032 return 0;
5033}
5034
5035static struct pmu perf_cpu_clock = {
5036 .task_ctx_nr = perf_sw_context,
5037
5038 .event_init = cpu_clock_event_init,
5039 .add = cpu_clock_event_add,
5040 .del = cpu_clock_event_del,
5041 .start = cpu_clock_event_start,
5042 .stop = cpu_clock_event_stop,
5043 .read = cpu_clock_event_read,
5044};
5045
5046/*
5047 * Software event: task time clock
5048 */
5049
5050static void task_clock_event_update(struct perf_event *event, u64 now)
5051{
5052 u64 prev;
5053 s64 delta;
5054
5055 prev = local64_xchg(&event->hw.prev_count, now);
5056 delta = now - prev;
5057 local64_add(delta, &event->count);
5058}
5059
5060static void task_clock_event_start(struct perf_event *event, int flags)
5061{
5062 local64_set(&event->hw.prev_count, event->ctx->time);
5063 perf_swevent_start_hrtimer(event);
5064}
5065
5066static void task_clock_event_stop(struct perf_event *event, int flags)
5067{
5068 perf_swevent_cancel_hrtimer(event);
5069 task_clock_event_update(event, event->ctx->time);
5070}
5071
5072static int task_clock_event_add(struct perf_event *event, int flags)
5073{
5074 if (flags & PERF_EF_START)
5075 task_clock_event_start(event, flags);
5076
5077 return 0;
5078}
5079
5080static void task_clock_event_del(struct perf_event *event, int flags)
5081{
5082 task_clock_event_stop(event, PERF_EF_UPDATE);
5083}
5084
5085static void task_clock_event_read(struct perf_event *event)
5086{
5087 u64 time;
5088
5089 if (!in_nmi()) {
5090 update_context_time(event->ctx);
5091 time = event->ctx->time;
5092 } else {
5093 u64 now = perf_clock();
5094 u64 delta = now - event->ctx->timestamp;
5095 time = event->ctx->time + delta;
5096 }
5097
5098 task_clock_event_update(event, time);
5099}
5100
5101static int task_clock_event_init(struct perf_event *event)
5102{
5103 if (event->attr.type != PERF_TYPE_SOFTWARE)
5104 return -ENOENT;
5105
5106 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5107 return -ENOENT;
5108
5109 return 0;
5110}
5111
5112static struct pmu perf_task_clock = {
5113 .task_ctx_nr = perf_sw_context,
5114
5115 .event_init = task_clock_event_init,
5116 .add = task_clock_event_add,
5117 .del = task_clock_event_del,
5118 .start = task_clock_event_start,
5119 .stop = task_clock_event_stop,
5120 .read = task_clock_event_read,
5121};
5122
5123static void perf_pmu_nop_void(struct pmu *pmu)
5124{
5125}
5126
5127static int perf_pmu_nop_int(struct pmu *pmu)
5128{
5129 return 0;
5130}
5131
5132static void perf_pmu_start_txn(struct pmu *pmu)
5133{
5134 perf_pmu_disable(pmu);
5135}
5136
5137static int perf_pmu_commit_txn(struct pmu *pmu)
5138{
5139 perf_pmu_enable(pmu);
5140 return 0;
5141}
5142
5143static void perf_pmu_cancel_txn(struct pmu *pmu)
5144{
5145 perf_pmu_enable(pmu);
5146}
5147
5148/*
5149 * Ensures all contexts with the same task_ctx_nr have the same
5150 * pmu_cpu_context too.
5151 */
5152static void *find_pmu_context(int ctxn)
5153{
5154 struct pmu *pmu;
5155
5156 if (ctxn < 0)
5157 return NULL;
5158
5159 list_for_each_entry(pmu, &pmus, entry) {
5160 if (pmu->task_ctx_nr == ctxn)
5161 return pmu->pmu_cpu_context;
5162 }
5163
5164 return NULL;
5165}
5166
5167static void free_pmu_context(void * __percpu cpu_context)
5168{
5169 struct pmu *pmu;
5170
5171 mutex_lock(&pmus_lock);
4785 /* 5172 /*
4786 * Software events (currently) can't in general distinguish 5173 * Like a real lame refcount.
4787 * between user, kernel and hypervisor events.
4788 * However, context switches and cpu migrations are considered
4789 * to be kernel events, and page faults are never hypervisor
4790 * events.
4791 */ 5174 */
4792 switch (event_id) { 5175 list_for_each_entry(pmu, &pmus, entry) {
4793 case PERF_COUNT_SW_CPU_CLOCK: 5176 if (pmu->pmu_cpu_context == cpu_context)
4794 pmu = &perf_ops_cpu_clock; 5177 goto out;
5178 }
4795 5179
4796 break; 5180 free_percpu(cpu_context);
4797 case PERF_COUNT_SW_TASK_CLOCK: 5181out:
4798 /* 5182 mutex_unlock(&pmus_lock);
4799 * If the user instantiates this as a per-cpu event, 5183}
4800 * use the cpu_clock event instead.
4801 */
4802 if (event->ctx->task)
4803 pmu = &perf_ops_task_clock;
4804 else
4805 pmu = &perf_ops_cpu_clock;
4806 5184
4807 break; 5185int perf_pmu_register(struct pmu *pmu)
4808 case PERF_COUNT_SW_PAGE_FAULTS: 5186{
4809 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5187 int cpu, ret;
4810 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4811 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4812 case PERF_COUNT_SW_CPU_MIGRATIONS:
4813 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4814 case PERF_COUNT_SW_EMULATION_FAULTS:
4815 if (!event->parent) {
4816 int err;
4817
4818 err = swevent_hlist_get(event);
4819 if (err)
4820 return ERR_PTR(err);
4821 5188
4822 atomic_inc(&perf_swevent_enabled[event_id]); 5189 mutex_lock(&pmus_lock);
4823 event->destroy = sw_perf_event_destroy; 5190 ret = -ENOMEM;
5191 pmu->pmu_disable_count = alloc_percpu(int);
5192 if (!pmu->pmu_disable_count)
5193 goto unlock;
5194
5195 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5196 if (pmu->pmu_cpu_context)
5197 goto got_cpu_context;
5198
5199 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5200 if (!pmu->pmu_cpu_context)
5201 goto free_pdc;
5202
5203 for_each_possible_cpu(cpu) {
5204 struct perf_cpu_context *cpuctx;
5205
5206 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5207 __perf_event_init_context(&cpuctx->ctx);
5208 cpuctx->ctx.type = cpu_context;
5209 cpuctx->ctx.pmu = pmu;
5210 cpuctx->jiffies_interval = 1;
5211 INIT_LIST_HEAD(&cpuctx->rotation_list);
5212 }
5213
5214got_cpu_context:
5215 if (!pmu->start_txn) {
5216 if (pmu->pmu_enable) {
5217 /*
5218 * If we have pmu_enable/pmu_disable calls, install
5219 * transaction stubs that use that to try and batch
5220 * hardware accesses.
5221 */
5222 pmu->start_txn = perf_pmu_start_txn;
5223 pmu->commit_txn = perf_pmu_commit_txn;
5224 pmu->cancel_txn = perf_pmu_cancel_txn;
5225 } else {
5226 pmu->start_txn = perf_pmu_nop_void;
5227 pmu->commit_txn = perf_pmu_nop_int;
5228 pmu->cancel_txn = perf_pmu_nop_void;
5229 }
5230 }
5231
5232 if (!pmu->pmu_enable) {
5233 pmu->pmu_enable = perf_pmu_nop_void;
5234 pmu->pmu_disable = perf_pmu_nop_void;
5235 }
5236
5237 list_add_rcu(&pmu->entry, &pmus);
5238 ret = 0;
5239unlock:
5240 mutex_unlock(&pmus_lock);
5241
5242 return ret;
5243
5244free_pdc:
5245 free_percpu(pmu->pmu_disable_count);
5246 goto unlock;
5247}
5248
5249void perf_pmu_unregister(struct pmu *pmu)
5250{
5251 mutex_lock(&pmus_lock);
5252 list_del_rcu(&pmu->entry);
5253 mutex_unlock(&pmus_lock);
5254
5255 /*
5256 * We dereference the pmu list under both SRCU and regular RCU, so
5257 * synchronize against both of those.
5258 */
5259 synchronize_srcu(&pmus_srcu);
5260 synchronize_rcu();
5261
5262 free_percpu(pmu->pmu_disable_count);
5263 free_pmu_context(pmu->pmu_cpu_context);
5264}
5265
5266struct pmu *perf_init_event(struct perf_event *event)
5267{
5268 struct pmu *pmu = NULL;
5269 int idx;
5270
5271 idx = srcu_read_lock(&pmus_srcu);
5272 list_for_each_entry_rcu(pmu, &pmus, entry) {
5273 int ret = pmu->event_init(event);
5274 if (!ret)
5275 goto unlock;
5276
5277 if (ret != -ENOENT) {
5278 pmu = ERR_PTR(ret);
5279 goto unlock;
4824 } 5280 }
4825 pmu = &perf_ops_generic;
4826 break;
4827 } 5281 }
5282 pmu = ERR_PTR(-ENOENT);
5283unlock:
5284 srcu_read_unlock(&pmus_srcu, idx);
4828 5285
4829 return pmu; 5286 return pmu;
4830} 5287}
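perf_init_event() above replaces the old switch on attr->type with a probe loop: every registered pmu gets to look at the event, -ENOENT means "not mine, try the next one", and any other error ends the search. A standalone model of that convention, with hypothetical toy_* names:

/* Sketch: probing a list of pmus until one claims the event. */
#include <stdio.h>
#include <errno.h>

struct toy_event { int type; };

struct toy_pmu {
        const char *name;
        int (*event_init)(struct toy_event *ev);
};

static int sw_init(struct toy_event *ev) { return ev->type == 1 ? 0 : -ENOENT; }
static int tp_init(struct toy_event *ev) { return ev->type == 2 ? 0 : -ENOENT; }

static const struct toy_pmu pmus[] = {
        { "software",   sw_init },
        { "tracepoint", tp_init },
};

static const struct toy_pmu *toy_init_event(struct toy_event *ev)
{
        for (unsigned int i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
                int ret = pmus[i].event_init(ev);

                if (!ret)
                        return &pmus[i];        /* this pmu claimed the event */
                if (ret != -ENOENT)
                        return NULL;            /* real error: stop searching */
        }
        return NULL;
}

int main(void)
{
        struct toy_event ev = { .type = 2 };
        const struct toy_pmu *pmu = toy_init_event(&ev);

        printf("matched: %s\n", pmu ? pmu->name : "(none)");
        return 0;
}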
@@ -4833,20 +5290,17 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4833 * Allocate and initialize a event structure 5290 * Allocate and initialize a event structure
4834 */ 5291 */
4835static struct perf_event * 5292static struct perf_event *
4836perf_event_alloc(struct perf_event_attr *attr, 5293perf_event_alloc(struct perf_event_attr *attr, int cpu,
4837 int cpu,
4838 struct perf_event_context *ctx,
4839 struct perf_event *group_leader, 5294 struct perf_event *group_leader,
4840 struct perf_event *parent_event, 5295 struct perf_event *parent_event,
4841 perf_overflow_handler_t overflow_handler, 5296 perf_overflow_handler_t overflow_handler)
4842 gfp_t gfpflags)
4843{ 5297{
4844 const struct pmu *pmu; 5298 struct pmu *pmu;
4845 struct perf_event *event; 5299 struct perf_event *event;
4846 struct hw_perf_event *hwc; 5300 struct hw_perf_event *hwc;
4847 long err; 5301 long err;
4848 5302
4849 event = kzalloc(sizeof(*event), gfpflags); 5303 event = kzalloc(sizeof(*event), GFP_KERNEL);
4850 if (!event) 5304 if (!event)
4851 return ERR_PTR(-ENOMEM); 5305 return ERR_PTR(-ENOMEM);
4852 5306
@@ -4871,7 +5325,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4871 event->attr = *attr; 5325 event->attr = *attr;
4872 event->group_leader = group_leader; 5326 event->group_leader = group_leader;
4873 event->pmu = NULL; 5327 event->pmu = NULL;
4874 event->ctx = ctx;
4875 event->oncpu = -1; 5328 event->oncpu = -1;
4876 5329
4877 event->parent = parent_event; 5330 event->parent = parent_event;
@@ -4905,29 +5358,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4905 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5358 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4906 goto done; 5359 goto done;
4907 5360
4908 switch (attr->type) { 5361 pmu = perf_init_event(event);
4909 case PERF_TYPE_RAW:
4910 case PERF_TYPE_HARDWARE:
4911 case PERF_TYPE_HW_CACHE:
4912 pmu = hw_perf_event_init(event);
4913 break;
4914 5362
4915 case PERF_TYPE_SOFTWARE:
4916 pmu = sw_perf_event_init(event);
4917 break;
4918
4919 case PERF_TYPE_TRACEPOINT:
4920 pmu = tp_perf_event_init(event);
4921 break;
4922
4923 case PERF_TYPE_BREAKPOINT:
4924 pmu = bp_perf_event_init(event);
4925 break;
4926
4927
4928 default:
4929 break;
4930 }
4931done: 5363done:
4932 err = 0; 5364 err = 0;
4933 if (!pmu) 5365 if (!pmu)
@@ -4952,6 +5384,13 @@ done:
4952 atomic_inc(&nr_comm_events); 5384 atomic_inc(&nr_comm_events);
4953 if (event->attr.task) 5385 if (event->attr.task)
4954 atomic_inc(&nr_task_events); 5386 atomic_inc(&nr_task_events);
5387 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5388 err = get_callchain_buffers();
5389 if (err) {
5390 free_event(event);
5391 return ERR_PTR(err);
5392 }
5393 }
4955 } 5394 }
4956 5395
4957 return event; 5396 return event;
@@ -5099,12 +5538,16 @@ SYSCALL_DEFINE5(perf_event_open,
5099 struct perf_event_attr __user *, attr_uptr, 5538 struct perf_event_attr __user *, attr_uptr,
5100 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5539 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5101{ 5540{
5102 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5541 struct perf_event *group_leader = NULL, *output_event = NULL;
5542 struct perf_event *event, *sibling;
5103 struct perf_event_attr attr; 5543 struct perf_event_attr attr;
5104 struct perf_event_context *ctx; 5544 struct perf_event_context *ctx;
5105 struct file *event_file = NULL; 5545 struct file *event_file = NULL;
5106 struct file *group_file = NULL; 5546 struct file *group_file = NULL;
5547 struct task_struct *task = NULL;
5548 struct pmu *pmu;
5107 int event_fd; 5549 int event_fd;
5550 int move_group = 0;
5108 int fput_needed = 0; 5551 int fput_needed = 0;
5109 int err; 5552 int err;
5110 5553
@@ -5130,20 +5573,11 @@ SYSCALL_DEFINE5(perf_event_open,
5130 if (event_fd < 0) 5573 if (event_fd < 0)
5131 return event_fd; 5574 return event_fd;
5132 5575
5133 /*
5134 * Get the target context (task or percpu):
5135 */
5136 ctx = find_get_context(pid, cpu);
5137 if (IS_ERR(ctx)) {
5138 err = PTR_ERR(ctx);
5139 goto err_fd;
5140 }
5141
5142 if (group_fd != -1) { 5576 if (group_fd != -1) {
5143 group_leader = perf_fget_light(group_fd, &fput_needed); 5577 group_leader = perf_fget_light(group_fd, &fput_needed);
5144 if (IS_ERR(group_leader)) { 5578 if (IS_ERR(group_leader)) {
5145 err = PTR_ERR(group_leader); 5579 err = PTR_ERR(group_leader);
5146 goto err_put_context; 5580 goto err_fd;
5147 } 5581 }
5148 group_file = group_leader->filp; 5582 group_file = group_leader->filp;
5149 if (flags & PERF_FLAG_FD_OUTPUT) 5583 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5152,6 +5586,58 @@ SYSCALL_DEFINE5(perf_event_open,
5152 group_leader = NULL; 5586 group_leader = NULL;
5153 } 5587 }
5154 5588
5589 event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
5590 if (IS_ERR(event)) {
5591 err = PTR_ERR(event);
5592 goto err_fd;
5593 }
5594
5595 /*
5596 * Special case software events and allow them to be part of
5597 * any hardware group.
5598 */
5599 pmu = event->pmu;
5600
5601 if (group_leader &&
5602 (is_software_event(event) != is_software_event(group_leader))) {
5603 if (is_software_event(event)) {
5604 /*
5605 * If event and group_leader are not both a software
5606 * event, and event is, then group leader is not.
5607 *
5608 * Allow the addition of software events to !software
5609 * groups, this is safe because software events never
5610 * fail to schedule.
5611 */
5612 pmu = group_leader->pmu;
5613 } else if (is_software_event(group_leader) &&
5614 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5615 /*
5616 * In case the group is a pure software group, and we
5617 * try to add a hardware event, move the whole group to
5618 * the hardware context.
5619 */
5620 move_group = 1;
5621 }
5622 }
5623
5624 if (pid != -1) {
5625 task = find_lively_task_by_vpid(pid);
5626 if (IS_ERR(task)) {
5627 err = PTR_ERR(task);
5628 goto err_group_fd;
5629 }
5630 }
5631
5632 /*
5633 * Get the target context (task or percpu):
5634 */
5635 ctx = find_get_context(pmu, task, cpu);
5636 if (IS_ERR(ctx)) {
5637 err = PTR_ERR(ctx);
5638 goto err_group_fd;
5639 }
5640
5155 /* 5641 /*
5156 * Look up the group leader (we will attach this event to it): 5642 * Look up the group leader (we will attach this event to it):
5157 */ 5643 */
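
The software-event special case earlier in this hunk boils down to a small decision rule: a software event may join any group, and a group that so far consists only of software events is migrated to the hardware context when its first hardware event arrives. As a sketch of how that rule could be written as a helper inside this file (pick_group_pmu is illustrative, not a function this patch adds):

static struct pmu *pick_group_pmu(struct perf_event *event,
				  struct perf_event *group_leader,
				  int *move_group)
{
	*move_group = 0;

	if (!group_leader ||
	    is_software_event(event) == is_software_event(group_leader))
		return event->pmu;

	if (is_software_event(event))
		return group_leader->pmu;	/* tag along with the hw group */

	if (is_software_event(group_leader) &&
	    (group_leader->group_flags & PERF_GROUP_SOFTWARE))
		*move_group = 1;		/* pull the sw group over to hw */

	return event->pmu;
}
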
@@ -5163,42 +5649,66 @@ SYSCALL_DEFINE5(perf_event_open,
5163 * becoming part of another group-sibling): 5649 * becoming part of another group-sibling):
5164 */ 5650 */
5165 if (group_leader->group_leader != group_leader) 5651 if (group_leader->group_leader != group_leader)
5166 goto err_put_context; 5652 goto err_context;
5167 /* 5653 /*
5168 * Do not allow to attach to a group in a different 5654 * Do not allow to attach to a group in a different
5169 * task or CPU context: 5655 * task or CPU context:
5170 */ 5656 */
5171 if (group_leader->ctx != ctx) 5657 if (move_group) {
5172 goto err_put_context; 5658 if (group_leader->ctx->type != ctx->type)
5659 goto err_context;
5660 } else {
5661 if (group_leader->ctx != ctx)
5662 goto err_context;
5663 }
5664
5173 /* 5665 /*
5174 * Only a group leader can be exclusive or pinned 5666 * Only a group leader can be exclusive or pinned
5175 */ 5667 */
5176 if (attr.exclusive || attr.pinned) 5668 if (attr.exclusive || attr.pinned)
5177 goto err_put_context; 5669 goto err_context;
5178 }
5179
5180 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5181 NULL, NULL, GFP_KERNEL);
5182 if (IS_ERR(event)) {
5183 err = PTR_ERR(event);
5184 goto err_put_context;
5185 } 5670 }
5186 5671
5187 if (output_event) { 5672 if (output_event) {
5188 err = perf_event_set_output(event, output_event); 5673 err = perf_event_set_output(event, output_event);
5189 if (err) 5674 if (err)
5190 goto err_free_put_context; 5675 goto err_context;
5191 } 5676 }
5192 5677
5193 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5678 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5194 if (IS_ERR(event_file)) { 5679 if (IS_ERR(event_file)) {
5195 err = PTR_ERR(event_file); 5680 err = PTR_ERR(event_file);
5196 goto err_free_put_context; 5681 goto err_context;
5682 }
5683
5684 if (move_group) {
5685 struct perf_event_context *gctx = group_leader->ctx;
5686
5687 mutex_lock(&gctx->mutex);
5688 perf_event_remove_from_context(group_leader);
5689 list_for_each_entry(sibling, &group_leader->sibling_list,
5690 group_entry) {
5691 perf_event_remove_from_context(sibling);
5692 put_ctx(gctx);
5693 }
5694 mutex_unlock(&gctx->mutex);
5695 put_ctx(gctx);
5197 } 5696 }
5198 5697
5199 event->filp = event_file; 5698 event->filp = event_file;
5200 WARN_ON_ONCE(ctx->parent_ctx); 5699 WARN_ON_ONCE(ctx->parent_ctx);
5201 mutex_lock(&ctx->mutex); 5700 mutex_lock(&ctx->mutex);
5701
5702 if (move_group) {
5703 perf_install_in_context(ctx, group_leader, cpu);
5704 get_ctx(ctx);
5705 list_for_each_entry(sibling, &group_leader->sibling_list,
5706 group_entry) {
5707 perf_install_in_context(ctx, sibling, cpu);
5708 get_ctx(ctx);
5709 }
5710 }
5711
5202 perf_install_in_context(ctx, event, cpu); 5712 perf_install_in_context(ctx, event, cpu);
5203 ++ctx->generation; 5713 ++ctx->generation;
5204 mutex_unlock(&ctx->mutex); 5714 mutex_unlock(&ctx->mutex);
@@ -5219,11 +5729,11 @@ SYSCALL_DEFINE5(perf_event_open,
5219 fd_install(event_fd, event_file); 5729 fd_install(event_fd, event_file);
5220 return event_fd; 5730 return event_fd;
5221 5731
5222err_free_put_context: 5732err_context:
5223 free_event(event);
5224err_put_context:
5225 fput_light(group_file, fput_needed);
5226 put_ctx(ctx); 5733 put_ctx(ctx);
5734err_group_fd:
5735 fput_light(group_file, fput_needed);
5736 free_event(event);
5227err_fd: 5737err_fd:
5228 put_unused_fd(event_fd); 5738 put_unused_fd(event_fd);
5229 return err; 5739 return err;
@@ -5234,32 +5744,31 @@ err_fd:
5234 * 5744 *
5235 * @attr: attributes of the counter to create 5745 * @attr: attributes of the counter to create
5236 * @cpu: cpu in which the counter is bound 5746 * @cpu: cpu in which the counter is bound
5237 * @pid: task to profile 5747 * @task: task to profile (NULL for percpu)
5238 */ 5748 */
5239struct perf_event * 5749struct perf_event *
5240perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5750perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5241 pid_t pid, 5751 struct task_struct *task,
5242 perf_overflow_handler_t overflow_handler) 5752 perf_overflow_handler_t overflow_handler)
5243{ 5753{
5244 struct perf_event *event;
5245 struct perf_event_context *ctx; 5754 struct perf_event_context *ctx;
5755 struct perf_event *event;
5246 int err; 5756 int err;
5247 5757
5248 /* 5758 /*
5249 * Get the target context (task or percpu): 5759 * Get the target context (task or percpu):
5250 */ 5760 */
5251 5761
5252 ctx = find_get_context(pid, cpu); 5762 event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
5253 if (IS_ERR(ctx)) {
5254 err = PTR_ERR(ctx);
5255 goto err_exit;
5256 }
5257
5258 event = perf_event_alloc(attr, cpu, ctx, NULL,
5259 NULL, overflow_handler, GFP_KERNEL);
5260 if (IS_ERR(event)) { 5763 if (IS_ERR(event)) {
5261 err = PTR_ERR(event); 5764 err = PTR_ERR(event);
5262 goto err_put_context; 5765 goto err;
5766 }
5767
5768 ctx = find_get_context(event->pmu, task, cpu);
5769 if (IS_ERR(ctx)) {
5770 err = PTR_ERR(ctx);
5771 goto err_free;
5263 } 5772 }
5264 5773
5265 event->filp = NULL; 5774 event->filp = NULL;
@@ -5277,112 +5786,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5277 5786
5278 return event; 5787 return event;
5279 5788
5280 err_put_context: 5789err_free:
5281 put_ctx(ctx); 5790 free_event(event);
5282 err_exit: 5791err:
5283 return ERR_PTR(err); 5792 return ERR_PTR(err);
5284} 5793}
5285EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5794EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5286 5795
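
perf_event_create_kernel_counter() keeps its per-CPU semantics but now identifies a target task by task_struct rather than by pid, and the context lookup happens only after the event (and therefore its pmu) is known. A hedged usage sketch for an in-kernel caller creating a pinned per-CPU cycle counter; the function and variable names are invented for illustration:

#include <linux/perf_event.h>
#include <linux/err.h>

static struct perf_event *cycle_event;	/* hypothetical per-CPU counter */

static int create_cycle_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(struct perf_event_attr),
		.pinned	= 1,
	};

	/* task == NULL selects a per-CPU counter on @cpu; pass a
	 * task_struct instead of the old pid_t to profile a task. */
	cycle_event = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL);
	if (IS_ERR(cycle_event))
		return PTR_ERR(cycle_event);

	return 0;
}
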
5287/*
5288 * inherit an event from parent task to child task:
5289 */
5290static struct perf_event *
5291inherit_event(struct perf_event *parent_event,
5292 struct task_struct *parent,
5293 struct perf_event_context *parent_ctx,
5294 struct task_struct *child,
5295 struct perf_event *group_leader,
5296 struct perf_event_context *child_ctx)
5297{
5298 struct perf_event *child_event;
5299
5300 /*
5301 * Instead of creating recursive hierarchies of events,
5302 * we link inherited events back to the original parent,
5303 * which has a filp for sure, which we use as the reference
5304 * count:
5305 */
5306 if (parent_event->parent)
5307 parent_event = parent_event->parent;
5308
5309 child_event = perf_event_alloc(&parent_event->attr,
5310 parent_event->cpu, child_ctx,
5311 group_leader, parent_event,
5312 NULL, GFP_KERNEL);
5313 if (IS_ERR(child_event))
5314 return child_event;
5315 get_ctx(child_ctx);
5316
5317 /*
5318 * Make the child state follow the state of the parent event,
5319 * not its attr.disabled bit. We hold the parent's mutex,
5320 * so we won't race with perf_event_{en, dis}able_family.
5321 */
5322 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5323 child_event->state = PERF_EVENT_STATE_INACTIVE;
5324 else
5325 child_event->state = PERF_EVENT_STATE_OFF;
5326
5327 if (parent_event->attr.freq) {
5328 u64 sample_period = parent_event->hw.sample_period;
5329 struct hw_perf_event *hwc = &child_event->hw;
5330
5331 hwc->sample_period = sample_period;
5332 hwc->last_period = sample_period;
5333
5334 local64_set(&hwc->period_left, sample_period);
5335 }
5336
5337 child_event->overflow_handler = parent_event->overflow_handler;
5338
5339 /*
5340 * Link it up in the child's context:
5341 */
5342 add_event_to_ctx(child_event, child_ctx);
5343
5344 /*
5345 * Get a reference to the parent filp - we will fput it
5346 * when the child event exits. This is safe to do because
5347 * we are in the parent and we know that the filp still
5348 * exists and has a nonzero count:
5349 */
5350 atomic_long_inc(&parent_event->filp->f_count);
5351
5352 /*
5353 * Link this into the parent event's child list
5354 */
5355 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5356 mutex_lock(&parent_event->child_mutex);
5357 list_add_tail(&child_event->child_list, &parent_event->child_list);
5358 mutex_unlock(&parent_event->child_mutex);
5359
5360 return child_event;
5361}
5362
5363static int inherit_group(struct perf_event *parent_event,
5364 struct task_struct *parent,
5365 struct perf_event_context *parent_ctx,
5366 struct task_struct *child,
5367 struct perf_event_context *child_ctx)
5368{
5369 struct perf_event *leader;
5370 struct perf_event *sub;
5371 struct perf_event *child_ctr;
5372
5373 leader = inherit_event(parent_event, parent, parent_ctx,
5374 child, NULL, child_ctx);
5375 if (IS_ERR(leader))
5376 return PTR_ERR(leader);
5377 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5378 child_ctr = inherit_event(sub, parent, parent_ctx,
5379 child, leader, child_ctx);
5380 if (IS_ERR(child_ctr))
5381 return PTR_ERR(child_ctr);
5382 }
5383 return 0;
5384}
5385
5386static void sync_child_event(struct perf_event *child_event, 5796static void sync_child_event(struct perf_event *child_event,
5387 struct task_struct *child) 5797 struct task_struct *child)
5388{ 5798{
@@ -5439,16 +5849,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5439 } 5849 }
5440} 5850}
5441 5851
5442/* 5852static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5443 * When a child task exits, feed back event values to parent events.
5444 */
5445void perf_event_exit_task(struct task_struct *child)
5446{ 5853{
5447 struct perf_event *child_event, *tmp; 5854 struct perf_event *child_event, *tmp;
5448 struct perf_event_context *child_ctx; 5855 struct perf_event_context *child_ctx;
5449 unsigned long flags; 5856 unsigned long flags;
5450 5857
5451 if (likely(!child->perf_event_ctxp)) { 5858 if (likely(!child->perf_event_ctxp[ctxn])) {
5452 perf_event_task(child, NULL, 0); 5859 perf_event_task(child, NULL, 0);
5453 return; 5860 return;
5454 } 5861 }
@@ -5460,7 +5867,7 @@ void perf_event_exit_task(struct task_struct *child)
5460 * scheduled, so we are now safe from rescheduling changing 5867 * scheduled, so we are now safe from rescheduling changing
5461 * our context. 5868 * our context.
5462 */ 5869 */
5463 child_ctx = child->perf_event_ctxp; 5870 child_ctx = child->perf_event_ctxp[ctxn];
5464 __perf_event_task_sched_out(child_ctx); 5871 __perf_event_task_sched_out(child_ctx);
5465 5872
5466 /* 5873 /*
@@ -5469,7 +5876,7 @@ void perf_event_exit_task(struct task_struct *child)
5469 * incremented the context's refcount before we do put_ctx below. 5876 * incremented the context's refcount before we do put_ctx below.
5470 */ 5877 */
5471 raw_spin_lock(&child_ctx->lock); 5878 raw_spin_lock(&child_ctx->lock);
5472 child->perf_event_ctxp = NULL; 5879 child->perf_event_ctxp[ctxn] = NULL;
5473 /* 5880 /*
5474 * If this context is a clone; unclone it so it can't get 5881 * If this context is a clone; unclone it so it can't get
5475 * swapped to another process while we're removing all 5882 * swapped to another process while we're removing all
@@ -5522,6 +5929,17 @@ again:
5522 put_ctx(child_ctx); 5929 put_ctx(child_ctx);
5523} 5930}
5524 5931
5932/*
5933 * When a child task exits, feed back event values to parent events.
5934 */
5935void perf_event_exit_task(struct task_struct *child)
5936{
5937 int ctxn;
5938
5939 for_each_task_context_nr(ctxn)
5940 perf_event_exit_task_context(child, ctxn);
5941}
5942
5525static void perf_free_event(struct perf_event *event, 5943static void perf_free_event(struct perf_event *event,
5526 struct perf_event_context *ctx) 5944 struct perf_event_context *ctx)
5527{ 5945{
@@ -5543,48 +5961,165 @@ static void perf_free_event(struct perf_event *event,
5543 5961
5544/* 5962/*
5545 * free an unexposed, unused context as created by inheritance by 5963 * free an unexposed, unused context as created by inheritance by
5546 * init_task below, used by fork() in case of fail. 5964 * perf_event_init_task below, used by fork() in case of fail.
5547 */ 5965 */
5548void perf_event_free_task(struct task_struct *task) 5966void perf_event_free_task(struct task_struct *task)
5549{ 5967{
5550 struct perf_event_context *ctx = task->perf_event_ctxp; 5968 struct perf_event_context *ctx;
5551 struct perf_event *event, *tmp; 5969 struct perf_event *event, *tmp;
5970 int ctxn;
5552 5971
5553 if (!ctx) 5972 for_each_task_context_nr(ctxn) {
5554 return; 5973 ctx = task->perf_event_ctxp[ctxn];
5974 if (!ctx)
5975 continue;
5555 5976
5556 mutex_lock(&ctx->mutex); 5977 mutex_lock(&ctx->mutex);
5557again: 5978again:
5558 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5979 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5559 perf_free_event(event, ctx); 5980 group_entry)
5981 perf_free_event(event, ctx);
5560 5982
5561 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5983 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5562 group_entry) 5984 group_entry)
5563 perf_free_event(event, ctx); 5985 perf_free_event(event, ctx);
5564 5986
5565 if (!list_empty(&ctx->pinned_groups) || 5987 if (!list_empty(&ctx->pinned_groups) ||
5566 !list_empty(&ctx->flexible_groups)) 5988 !list_empty(&ctx->flexible_groups))
5567 goto again; 5989 goto again;
5568 5990
5569 mutex_unlock(&ctx->mutex); 5991 mutex_unlock(&ctx->mutex);
5570 5992
5571 put_ctx(ctx); 5993 put_ctx(ctx);
5994 }
5995}
5996
5997void perf_event_delayed_put(struct task_struct *task)
5998{
5999 int ctxn;
6000
6001 for_each_task_context_nr(ctxn)
6002 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6003}
6004
6005/*
6006 * inherit an event from parent task to child task:
6007 */
6008static struct perf_event *
6009inherit_event(struct perf_event *parent_event,
6010 struct task_struct *parent,
6011 struct perf_event_context *parent_ctx,
6012 struct task_struct *child,
6013 struct perf_event *group_leader,
6014 struct perf_event_context *child_ctx)
6015{
6016 struct perf_event *child_event;
6017 unsigned long flags;
6018
6019 /*
6020 * Instead of creating recursive hierarchies of events,
6021 * we link inherited events back to the original parent,
6022 * which has a filp for sure, which we use as the reference
6023 * count:
6024 */
6025 if (parent_event->parent)
6026 parent_event = parent_event->parent;
6027
6028 child_event = perf_event_alloc(&parent_event->attr,
6029 parent_event->cpu,
6030 group_leader, parent_event,
6031 NULL);
6032 if (IS_ERR(child_event))
6033 return child_event;
6034 get_ctx(child_ctx);
6035
6036 /*
6037 * Make the child state follow the state of the parent event,
6038 * not its attr.disabled bit. We hold the parent's mutex,
6039 * so we won't race with perf_event_{en, dis}able_family.
6040 */
6041 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6042 child_event->state = PERF_EVENT_STATE_INACTIVE;
6043 else
6044 child_event->state = PERF_EVENT_STATE_OFF;
6045
6046 if (parent_event->attr.freq) {
6047 u64 sample_period = parent_event->hw.sample_period;
6048 struct hw_perf_event *hwc = &child_event->hw;
6049
6050 hwc->sample_period = sample_period;
6051 hwc->last_period = sample_period;
6052
6053 local64_set(&hwc->period_left, sample_period);
6054 }
6055
6056 child_event->ctx = child_ctx;
6057 child_event->overflow_handler = parent_event->overflow_handler;
6058
6059 /*
6060 * Link it up in the child's context:
6061 */
6062 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6063 add_event_to_ctx(child_event, child_ctx);
6064 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6065
6066 /*
6067 * Get a reference to the parent filp - we will fput it
6068 * when the child event exits. This is safe to do because
6069 * we are in the parent and we know that the filp still
6070 * exists and has a nonzero count:
6071 */
6072 atomic_long_inc(&parent_event->filp->f_count);
6073
6074 /*
6075 * Link this into the parent event's child list
6076 */
6077 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6078 mutex_lock(&parent_event->child_mutex);
6079 list_add_tail(&child_event->child_list, &parent_event->child_list);
6080 mutex_unlock(&parent_event->child_mutex);
6081
6082 return child_event;
6083}
6084
6085static int inherit_group(struct perf_event *parent_event,
6086 struct task_struct *parent,
6087 struct perf_event_context *parent_ctx,
6088 struct task_struct *child,
6089 struct perf_event_context *child_ctx)
6090{
6091 struct perf_event *leader;
6092 struct perf_event *sub;
6093 struct perf_event *child_ctr;
6094
6095 leader = inherit_event(parent_event, parent, parent_ctx,
6096 child, NULL, child_ctx);
6097 if (IS_ERR(leader))
6098 return PTR_ERR(leader);
6099 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6100 child_ctr = inherit_event(sub, parent, parent_ctx,
6101 child, leader, child_ctx);
6102 if (IS_ERR(child_ctr))
6103 return PTR_ERR(child_ctr);
6104 }
6105 return 0;
5572} 6106}
5573 6107
5574static int 6108static int
5575inherit_task_group(struct perf_event *event, struct task_struct *parent, 6109inherit_task_group(struct perf_event *event, struct task_struct *parent,
5576 struct perf_event_context *parent_ctx, 6110 struct perf_event_context *parent_ctx,
5577 struct task_struct *child, 6111 struct task_struct *child, int ctxn,
5578 int *inherited_all) 6112 int *inherited_all)
5579{ 6113{
5580 int ret; 6114 int ret;
5581 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6115 struct perf_event_context *child_ctx;
5582 6116
5583 if (!event->attr.inherit) { 6117 if (!event->attr.inherit) {
5584 *inherited_all = 0; 6118 *inherited_all = 0;
5585 return 0; 6119 return 0;
5586 } 6120 }
5587 6121
6122 child_ctx = child->perf_event_ctxp[ctxn];
5588 if (!child_ctx) { 6123 if (!child_ctx) {
5589 /* 6124 /*
5590 * This is executed from the parent task context, so 6125 * This is executed from the parent task context, so
@@ -5593,14 +6128,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5593 * child. 6128 * child.
5594 */ 6129 */
5595 6130
5596 child_ctx = kzalloc(sizeof(struct perf_event_context), 6131 child_ctx = alloc_perf_context(event->pmu, child);
5597 GFP_KERNEL);
5598 if (!child_ctx) 6132 if (!child_ctx)
5599 return -ENOMEM; 6133 return -ENOMEM;
5600 6134
5601 __perf_event_init_context(child_ctx, child); 6135 child->perf_event_ctxp[ctxn] = child_ctx;
5602 child->perf_event_ctxp = child_ctx;
5603 get_task_struct(child);
5604 } 6136 }
5605 6137
5606 ret = inherit_group(event, parent, parent_ctx, 6138 ret = inherit_group(event, parent, parent_ctx,
@@ -5612,11 +6144,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5612 return ret; 6144 return ret;
5613} 6145}
5614 6146
5615
5616/* 6147/*
5617 * Initialize the perf_event context in task_struct 6148 * Initialize the perf_event context in task_struct
5618 */ 6149 */
5619int perf_event_init_task(struct task_struct *child) 6150int perf_event_init_context(struct task_struct *child, int ctxn)
5620{ 6151{
5621 struct perf_event_context *child_ctx, *parent_ctx; 6152 struct perf_event_context *child_ctx, *parent_ctx;
5622 struct perf_event_context *cloned_ctx; 6153 struct perf_event_context *cloned_ctx;
@@ -5625,19 +6156,19 @@ int perf_event_init_task(struct task_struct *child)
5625 int inherited_all = 1; 6156 int inherited_all = 1;
5626 int ret = 0; 6157 int ret = 0;
5627 6158
5628 child->perf_event_ctxp = NULL; 6159 child->perf_event_ctxp[ctxn] = NULL;
5629 6160
5630 mutex_init(&child->perf_event_mutex); 6161 mutex_init(&child->perf_event_mutex);
5631 INIT_LIST_HEAD(&child->perf_event_list); 6162 INIT_LIST_HEAD(&child->perf_event_list);
5632 6163
5633 if (likely(!parent->perf_event_ctxp)) 6164 if (likely(!parent->perf_event_ctxp[ctxn]))
5634 return 0; 6165 return 0;
5635 6166
5636 /* 6167 /*
5637 * If the parent's context is a clone, pin it so it won't get 6168 * If the parent's context is a clone, pin it so it won't get
5638 * swapped under us. 6169 * swapped under us.
5639 */ 6170 */
5640 parent_ctx = perf_pin_task_context(parent); 6171 parent_ctx = perf_pin_task_context(parent, ctxn);
5641 6172
5642 /* 6173 /*
5643 * No need to check if parent_ctx != NULL here; since we saw 6174 * No need to check if parent_ctx != NULL here; since we saw
@@ -5657,20 +6188,20 @@ int perf_event_init_task(struct task_struct *child)
5657 * the list, not manipulating it: 6188 * the list, not manipulating it:
5658 */ 6189 */
5659 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6190 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5660 ret = inherit_task_group(event, parent, parent_ctx, child, 6191 ret = inherit_task_group(event, parent, parent_ctx,
5661 &inherited_all); 6192 child, ctxn, &inherited_all);
5662 if (ret) 6193 if (ret)
5663 break; 6194 break;
5664 } 6195 }
5665 6196
5666 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6197 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5667 ret = inherit_task_group(event, parent, parent_ctx, child, 6198 ret = inherit_task_group(event, parent, parent_ctx,
5668 &inherited_all); 6199 child, ctxn, &inherited_all);
5669 if (ret) 6200 if (ret)
5670 break; 6201 break;
5671 } 6202 }
5672 6203
5673 child_ctx = child->perf_event_ctxp; 6204 child_ctx = child->perf_event_ctxp[ctxn];
5674 6205
5675 if (child_ctx && inherited_all) { 6206 if (child_ctx && inherited_all) {
5676 /* 6207 /*
@@ -5699,63 +6230,98 @@ int perf_event_init_task(struct task_struct *child)
5699 return ret; 6230 return ret;
5700} 6231}
5701 6232
6233/*
6234 * Initialize the perf_event context in task_struct
6235 */
6236int perf_event_init_task(struct task_struct *child)
6237{
6238 int ctxn, ret;
6239
6240 for_each_task_context_nr(ctxn) {
6241 ret = perf_event_init_context(child, ctxn);
6242 if (ret)
6243 return ret;
6244 }
6245
6246 return 0;
6247}
6248
5702static void __init perf_event_init_all_cpus(void) 6249static void __init perf_event_init_all_cpus(void)
5703{ 6250{
6251 struct swevent_htable *swhash;
5704 int cpu; 6252 int cpu;
5705 struct perf_cpu_context *cpuctx;
5706 6253
5707 for_each_possible_cpu(cpu) { 6254 for_each_possible_cpu(cpu) {
5708 cpuctx = &per_cpu(perf_cpu_context, cpu); 6255 swhash = &per_cpu(swevent_htable, cpu);
5709 mutex_init(&cpuctx->hlist_mutex); 6256 mutex_init(&swhash->hlist_mutex);
5710 __perf_event_init_context(&cpuctx->ctx, NULL); 6257 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5711 } 6258 }
5712} 6259}
5713 6260
5714static void __cpuinit perf_event_init_cpu(int cpu) 6261static void __cpuinit perf_event_init_cpu(int cpu)
5715{ 6262{
5716 struct perf_cpu_context *cpuctx; 6263 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5717
5718 cpuctx = &per_cpu(perf_cpu_context, cpu);
5719
5720 spin_lock(&perf_resource_lock);
5721 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5722 spin_unlock(&perf_resource_lock);
5723 6264
5724 mutex_lock(&cpuctx->hlist_mutex); 6265 mutex_lock(&swhash->hlist_mutex);
5725 if (cpuctx->hlist_refcount > 0) { 6266 if (swhash->hlist_refcount > 0) {
5726 struct swevent_hlist *hlist; 6267 struct swevent_hlist *hlist;
5727 6268
5728 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6269 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5729 WARN_ON_ONCE(!hlist); 6270 WARN_ON(!hlist);
5730 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6271 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5731 } 6272 }
5732 mutex_unlock(&cpuctx->hlist_mutex); 6273 mutex_unlock(&swhash->hlist_mutex);
5733} 6274}
5734 6275
5735#ifdef CONFIG_HOTPLUG_CPU 6276#ifdef CONFIG_HOTPLUG_CPU
5736static void __perf_event_exit_cpu(void *info) 6277static void perf_pmu_rotate_stop(struct pmu *pmu)
5737{ 6278{
5738 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6279 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5739 struct perf_event_context *ctx = &cpuctx->ctx; 6280
6281 WARN_ON(!irqs_disabled());
6282
6283 list_del_init(&cpuctx->rotation_list);
6284}
6285
6286static void __perf_event_exit_context(void *__info)
6287{
6288 struct perf_event_context *ctx = __info;
5740 struct perf_event *event, *tmp; 6289 struct perf_event *event, *tmp;
5741 6290
6291 perf_pmu_rotate_stop(ctx->pmu);
6292
5742 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6293 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5743 __perf_event_remove_from_context(event); 6294 __perf_event_remove_from_context(event);
5744 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6295 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5745 __perf_event_remove_from_context(event); 6296 __perf_event_remove_from_context(event);
5746} 6297}
6298
6299static void perf_event_exit_cpu_context(int cpu)
6300{
6301 struct perf_event_context *ctx;
6302 struct pmu *pmu;
6303 int idx;
6304
6305 idx = srcu_read_lock(&pmus_srcu);
6306 list_for_each_entry_rcu(pmu, &pmus, entry) {
6307 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6308
6309 mutex_lock(&ctx->mutex);
6310 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6311 mutex_unlock(&ctx->mutex);
6312 }
6313 srcu_read_unlock(&pmus_srcu, idx);
6314}
6315
5747static void perf_event_exit_cpu(int cpu) 6316static void perf_event_exit_cpu(int cpu)
5748{ 6317{
5749 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6318 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5750 struct perf_event_context *ctx = &cpuctx->ctx;
5751 6319
5752 mutex_lock(&cpuctx->hlist_mutex); 6320 mutex_lock(&swhash->hlist_mutex);
5753 swevent_hlist_release(cpuctx); 6321 swevent_hlist_release(swhash);
5754 mutex_unlock(&cpuctx->hlist_mutex); 6322 mutex_unlock(&swhash->hlist_mutex);
5755 6323
5756 mutex_lock(&ctx->mutex); 6324 perf_event_exit_cpu_context(cpu);
5757 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5758 mutex_unlock(&ctx->mutex);
5759} 6325}
5760#else 6326#else
5761static inline void perf_event_exit_cpu(int cpu) { } 6327static inline void perf_event_exit_cpu(int cpu) { }
@@ -5785,118 +6351,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5785 return NOTIFY_OK; 6351 return NOTIFY_OK;
5786} 6352}
5787 6353
5788/*
5789 * This has to have a higher priority than migration_notifier in sched.c.
5790 */
5791static struct notifier_block __cpuinitdata perf_cpu_nb = {
5792 .notifier_call = perf_cpu_notify,
5793 .priority = 20,
5794};
5795
5796void __init perf_event_init(void) 6354void __init perf_event_init(void)
5797{ 6355{
5798 perf_event_init_all_cpus(); 6356 perf_event_init_all_cpus();
5799 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6357 init_srcu_struct(&pmus_srcu);
5800 (void *)(long)smp_processor_id()); 6358 perf_pmu_register(&perf_swevent);
5801 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6359 perf_pmu_register(&perf_cpu_clock);
5802 (void *)(long)smp_processor_id()); 6360 perf_pmu_register(&perf_task_clock);
5803 register_cpu_notifier(&perf_cpu_nb); 6361 perf_tp_register();
5804} 6362 perf_cpu_notifier(perf_cpu_notify);
5805
5806static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5807 struct sysdev_class_attribute *attr,
5808 char *buf)
5809{
5810 return sprintf(buf, "%d\n", perf_reserved_percpu);
5811}
5812
5813static ssize_t
5814perf_set_reserve_percpu(struct sysdev_class *class,
5815 struct sysdev_class_attribute *attr,
5816 const char *buf,
5817 size_t count)
5818{
5819 struct perf_cpu_context *cpuctx;
5820 unsigned long val;
5821 int err, cpu, mpt;
5822
5823 err = strict_strtoul(buf, 10, &val);
5824 if (err)
5825 return err;
5826 if (val > perf_max_events)
5827 return -EINVAL;
5828
5829 spin_lock(&perf_resource_lock);
5830 perf_reserved_percpu = val;
5831 for_each_online_cpu(cpu) {
5832 cpuctx = &per_cpu(perf_cpu_context, cpu);
5833 raw_spin_lock_irq(&cpuctx->ctx.lock);
5834 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5835 perf_max_events - perf_reserved_percpu);
5836 cpuctx->max_pertask = mpt;
5837 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5838 }
5839 spin_unlock(&perf_resource_lock);
5840
5841 return count;
5842}
5843
5844static ssize_t perf_show_overcommit(struct sysdev_class *class,
5845 struct sysdev_class_attribute *attr,
5846 char *buf)
5847{
5848 return sprintf(buf, "%d\n", perf_overcommit);
5849}
5850
5851static ssize_t
5852perf_set_overcommit(struct sysdev_class *class,
5853 struct sysdev_class_attribute *attr,
5854 const char *buf, size_t count)
5855{
5856 unsigned long val;
5857 int err;
5858
5859 err = strict_strtoul(buf, 10, &val);
5860 if (err)
5861 return err;
5862 if (val > 1)
5863 return -EINVAL;
5864
5865 spin_lock(&perf_resource_lock);
5866 perf_overcommit = val;
5867 spin_unlock(&perf_resource_lock);
5868
5869 return count;
5870}
5871
5872static SYSDEV_CLASS_ATTR(
5873 reserve_percpu,
5874 0644,
5875 perf_show_reserve_percpu,
5876 perf_set_reserve_percpu
5877 );
5878
5879static SYSDEV_CLASS_ATTR(
5880 overcommit,
5881 0644,
5882 perf_show_overcommit,
5883 perf_set_overcommit
5884 );
5885
5886static struct attribute *perfclass_attrs[] = {
5887 &attr_reserve_percpu.attr,
5888 &attr_overcommit.attr,
5889 NULL
5890};
5891
5892static struct attribute_group perfclass_attr_group = {
5893 .attrs = perfclass_attrs,
5894 .name = "perf_events",
5895};
5896
5897static int __init perf_event_sysfs_init(void)
5898{
5899 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5900 &perfclass_attr_group);
5901} 6363}
5902device_initcall(perf_event_sysfs_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb90832..c0d2067f3e0d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3584,7 +3584,7 @@ void scheduler_tick(void)
3584 curr->sched_class->task_tick(rq, curr, 0); 3584 curr->sched_class->task_tick(rq, curr, 0);
3585 raw_spin_unlock(&rq->lock); 3585 raw_spin_unlock(&rq->lock);
3586 3586
3587 perf_event_task_tick(curr); 3587 perf_event_task_tick();
3588 3588
3589#ifdef CONFIG_SMP 3589#ifdef CONFIG_SMP
3590 rq->idle_at_tick = idle_cpu(cpu); 3590 rq->idle_at_tick = idle_cpu(cpu);
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c970c715d3..ed6aacfcb7ef 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -365,9 +365,10 @@ call:
365EXPORT_SYMBOL_GPL(smp_call_function_any); 365EXPORT_SYMBOL_GPL(smp_call_function_any);
366 366
367/** 367/**
368 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on a specific CPU
369 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
370 * @data: Pre-allocated and setup data structure 370 * @data: Pre-allocated and setup data structure
371 * @wait: If true, wait until function has completed on specified CPU.
371 * 372 *
372 * Like smp_call_function_single(), but allow caller to pass in a 373 * Like smp_call_function_single(), but allow caller to pass in a
373 * pre-allocated data structure. Useful for embedding @data inside 374 * pre-allocated data structure. Useful for embedding @data inside
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
376void __smp_call_function_single(int cpu, struct call_single_data *data, 377void __smp_call_function_single(int cpu, struct call_single_data *data,
377 int wait) 378 int wait)
378{ 379{
379 csd_lock(data); 380 unsigned int this_cpu;
381 unsigned long flags;
380 382
383 this_cpu = get_cpu();
381 /* 384 /*
382 * Can deadlock when called with interrupts disabled. 385 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can 386 * We allow cpu's that are not yet online though, as no one else can
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 390 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress); 391 && !oops_in_progress);
389 392
390 generic_exec_single(cpu, data, wait); 393 if (cpu == this_cpu) {
394 local_irq_save(flags);
395 data->func(data->info);
396 local_irq_restore(flags);
397 } else {
398 csd_lock(data);
399 generic_exec_single(cpu, data, wait);
400 }
401 put_cpu();
391} 402}
392 403
393/** 404/**
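
For callers, the behavioural change in __smp_call_function_single() is that targeting the local CPU no longer goes through csd locking and the IPI path at all; the callback simply runs inline with interrupts disabled. A hedged usage sketch (struct my_work, my_work_fn and kick_cpu are invented for illustration):

#include <linux/smp.h>
#include <linux/kernel.h>

struct my_work {
	struct call_single_data csd;	/* must stay valid until the call runs */
	int payload;
};

static void my_work_fn(void *info)
{
	struct my_work *w = info;

	/* Runs on the target CPU with interrupts disabled, inline when
	 * the target happens to be the current CPU. */
	pr_info("payload %d handled on cpu %d\n", w->payload, smp_processor_id());
}

static void kick_cpu(struct my_work *w, int cpu)
{
	w->csd.func = my_work_fn;
	w->csd.info = w;
	__smp_call_function_single(cpu, &w->csd, 0);	/* wait == 0 */
}
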
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
118 kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 118 /* addr and flags should be cleared for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
213 jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 215 /* addr and flags should be cleared for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
326 rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 330 /* addr and flags should be cleared for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
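
The rule these hunks document applies to any code that reuses a kprobe object: both the resolved address and the internal flags must be reset before the next register_kprobes(), otherwise stale state (for example a leftover KPROBE_FLAG_GONE bit) is carried into the new registration. A sketch of the reuse sequence; the probe target and handler here are placeholders, not taken from the test:

#include <linux/kprobes.h>

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* placeholder handler */
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* example target only */
	.pre_handler	= my_pre_handler,
};

static int reuse_kprobe(void)
{
	unregister_kprobe(&kp);

	/* Reset the resolved address and the flags before registering again. */
	kp.addr = NULL;
	kp.flags = 0;

	return register_kprobe(&kp);
}
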
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..e550d2eda1df 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..65fb077ea79c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
884 FTRACE_ENABLE_CALLS = (1 << 0), 884 FTRACE_ENABLE_CALLS = (1 << 0),
885 FTRACE_DISABLE_CALLS = (1 << 1), 885 FTRACE_DISABLE_CALLS = (1 << 1),
886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
887 FTRACE_ENABLE_MCOUNT = (1 << 3), 887 FTRACE_START_FUNC_RET = (1 << 3),
888 FTRACE_DISABLE_MCOUNT = (1 << 4), 888 FTRACE_STOP_FUNC_RET = (1 << 4),
889 FTRACE_START_FUNC_RET = (1 << 5),
890 FTRACE_STOP_FUNC_RET = (1 << 6),
891}; 889};
892 890
893static int ftrace_filtered; 891static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
1226 1224
1227static void ftrace_startup_sysctl(void) 1225static void ftrace_startup_sysctl(void)
1228{ 1226{
1229 int command = FTRACE_ENABLE_MCOUNT;
1230
1231 if (unlikely(ftrace_disabled)) 1227 if (unlikely(ftrace_disabled))
1232 return; 1228 return;
1233 1229
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
1235 saved_ftrace_func = NULL; 1231 saved_ftrace_func = NULL;
1236 /* ftrace_start_up is true if we want ftrace running */ 1232 /* ftrace_start_up is true if we want ftrace running */
1237 if (ftrace_start_up) 1233 if (ftrace_start_up)
1238 command |= FTRACE_ENABLE_CALLS; 1234 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1239
1240 ftrace_run_update_code(command);
1241} 1235}
1242 1236
1243static void ftrace_shutdown_sysctl(void) 1237static void ftrace_shutdown_sysctl(void)
1244{ 1238{
1245 int command = FTRACE_DISABLE_MCOUNT;
1246
1247 if (unlikely(ftrace_disabled)) 1239 if (unlikely(ftrace_disabled))
1248 return; 1240 return;
1249 1241
1250 /* ftrace_start_up is true if ftrace is running */ 1242 /* ftrace_start_up is true if ftrace is running */
1251 if (ftrace_start_up) 1243 if (ftrace_start_up)
1252 command |= FTRACE_DISABLE_CALLS; 1244 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1253
1254 ftrace_run_update_code(command);
1255} 1245}
1256 1246
1257static cycle_t ftrace_update_time; 1247static cycle_t ftrace_update_time;
@@ -1368,24 +1358,29 @@ enum {
1368#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1358#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1369 1359
1370struct ftrace_iterator { 1360struct ftrace_iterator {
1371 struct ftrace_page *pg; 1361 loff_t pos;
1372 int hidx; 1362 loff_t func_pos;
1373 int idx; 1363 struct ftrace_page *pg;
1374 unsigned flags; 1364 struct dyn_ftrace *func;
1375 struct trace_parser parser; 1365 struct ftrace_func_probe *probe;
1366 struct trace_parser parser;
1367 int hidx;
1368 int idx;
1369 unsigned flags;
1376}; 1370};
1377 1371
1378static void * 1372static void *
1379t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1373t_hash_next(struct seq_file *m, loff_t *pos)
1380{ 1374{
1381 struct ftrace_iterator *iter = m->private; 1375 struct ftrace_iterator *iter = m->private;
1382 struct hlist_node *hnd = v; 1376 struct hlist_node *hnd = NULL;
1383 struct hlist_head *hhd; 1377 struct hlist_head *hhd;
1384 1378
1385 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1386
1387 (*pos)++; 1379 (*pos)++;
1380 iter->pos = *pos;
1388 1381
1382 if (iter->probe)
1383 hnd = &iter->probe->node;
1389 retry: 1384 retry:
1390 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1385 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1391 return NULL; 1386 return NULL;
@@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1408 } 1403 }
1409 } 1404 }
1410 1405
1411 return hnd; 1406 if (WARN_ON_ONCE(!hnd))
1407 return NULL;
1408
1409 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1410
1411 return iter;
1412} 1412}
1413 1413
1414static void *t_hash_start(struct seq_file *m, loff_t *pos) 1414static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417 void *p = NULL; 1417 void *p = NULL;
1418 loff_t l; 1418 loff_t l;
1419 1419
1420 if (!(iter->flags & FTRACE_ITER_HASH)) 1420 if (iter->func_pos > *pos)
1421 *pos = 0; 1421 return NULL;
1422
1423 iter->flags |= FTRACE_ITER_HASH;
1424 1422
1425 iter->hidx = 0; 1423 iter->hidx = 0;
1426 for (l = 0; l <= *pos; ) { 1424 for (l = 0; l <= (*pos - iter->func_pos); ) {
1427 p = t_hash_next(m, p, &l); 1425 p = t_hash_next(m, &l);
1428 if (!p) 1426 if (!p)
1429 break; 1427 break;
1430 } 1428 }
1431 return p; 1429 if (!p)
1430 return NULL;
1431
1432 /* Only set this if we have an item */
1433 iter->flags |= FTRACE_ITER_HASH;
1434
1435 return iter;
1432} 1436}
1433 1437
1434static int t_hash_show(struct seq_file *m, void *v) 1438static int
1439t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1435{ 1440{
1436 struct ftrace_func_probe *rec; 1441 struct ftrace_func_probe *rec;
1437 struct hlist_node *hnd = v;
1438 1442
1439 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1443 rec = iter->probe;
1444 if (WARN_ON_ONCE(!rec))
1445 return -EIO;
1440 1446
1441 if (rec->ops->print) 1447 if (rec->ops->print)
1442 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1448 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1457 struct dyn_ftrace *rec = NULL; 1463 struct dyn_ftrace *rec = NULL;
1458 1464
1459 if (iter->flags & FTRACE_ITER_HASH) 1465 if (iter->flags & FTRACE_ITER_HASH)
1460 return t_hash_next(m, v, pos); 1466 return t_hash_next(m, pos);
1461 1467
1462 (*pos)++; 1468 (*pos)++;
1469 iter->pos = *pos;
1463 1470
1464 if (iter->flags & FTRACE_ITER_PRINTALL) 1471 if (iter->flags & FTRACE_ITER_PRINTALL)
1465 return NULL; 1472 return t_hash_start(m, pos);
1466 1473
1467 retry: 1474 retry:
1468 if (iter->idx >= iter->pg->index) { 1475 if (iter->idx >= iter->pg->index) {
@@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1491 } 1498 }
1492 } 1499 }
1493 1500
1494 return rec; 1501 if (!rec)
1502 return t_hash_start(m, pos);
1503
1504 iter->func_pos = *pos;
1505 iter->func = rec;
1506
1507 return iter;
1508}
1509
1510static void reset_iter_read(struct ftrace_iterator *iter)
1511{
1512 iter->pos = 0;
1513 iter->func_pos = 0;
1514 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
1495} 1515}
1496 1516
1497static void *t_start(struct seq_file *m, loff_t *pos) 1517static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1502,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1502 1522
1503 mutex_lock(&ftrace_lock); 1523 mutex_lock(&ftrace_lock);
1504 /* 1524 /*
1525 * If an lseek was done, then reset and start from beginning.
1526 */
1527 if (*pos < iter->pos)
1528 reset_iter_read(iter);
1529
1530 /*
1505 * For set_ftrace_filter reading, if we have the filter 1531 * For set_ftrace_filter reading, if we have the filter
1506 * off, we can short cut and just print out that all 1532 * off, we can short cut and just print out that all
1507 * functions are enabled. 1533 * functions are enabled.
@@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1518 if (iter->flags & FTRACE_ITER_HASH) 1544 if (iter->flags & FTRACE_ITER_HASH)
1519 return t_hash_start(m, pos); 1545 return t_hash_start(m, pos);
1520 1546
1547 /*
1548 * Unfortunately, we need to restart at ftrace_pages_start
1549 * every time we let go of the ftrace_mutex. This is because
1550 * those pointers can change without the lock.
1551 */
1521 iter->pg = ftrace_pages_start; 1552 iter->pg = ftrace_pages_start;
1522 iter->idx = 0; 1553 iter->idx = 0;
1523 for (l = 0; l <= *pos; ) { 1554 for (l = 0; l <= *pos; ) {
@@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1526 break; 1557 break;
1527 } 1558 }
1528 1559
1529 if (!p && iter->flags & FTRACE_ITER_FILTER) 1560 if (!p) {
1530 return t_hash_start(m, pos); 1561 if (iter->flags & FTRACE_ITER_FILTER)
1562 return t_hash_start(m, pos);
1531 1563
1532 return p; 1564 return NULL;
1565 }
1566
1567 return iter;
1533} 1568}
1534 1569
1535static void t_stop(struct seq_file *m, void *p) 1570static void t_stop(struct seq_file *m, void *p)
@@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
1540static int t_show(struct seq_file *m, void *v) 1575static int t_show(struct seq_file *m, void *v)
1541{ 1576{
1542 struct ftrace_iterator *iter = m->private; 1577 struct ftrace_iterator *iter = m->private;
1543 struct dyn_ftrace *rec = v; 1578 struct dyn_ftrace *rec;
1544 1579
1545 if (iter->flags & FTRACE_ITER_HASH) 1580 if (iter->flags & FTRACE_ITER_HASH)
1546 return t_hash_show(m, v); 1581 return t_hash_show(m, iter);
1547 1582
1548 if (iter->flags & FTRACE_ITER_PRINTALL) { 1583 if (iter->flags & FTRACE_ITER_PRINTALL) {
1549 seq_printf(m, "#### all functions enabled ####\n"); 1584 seq_printf(m, "#### all functions enabled ####\n");
1550 return 0; 1585 return 0;
1551 } 1586 }
1552 1587
1588 rec = iter->func;
1589
1553 if (!rec) 1590 if (!rec)
1554 return 0; 1591 return 0;
1555 1592
@@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = {
2418 .open = ftrace_filter_open, 2455 .open = ftrace_filter_open,
2419 .read = seq_read, 2456 .read = seq_read,
2420 .write = ftrace_filter_write, 2457 .write = ftrace_filter_write,
2421 .llseek = no_llseek, 2458 .llseek = ftrace_regex_lseek,
2422 .release = ftrace_filter_release, 2459 .release = ftrace_filter_release,
2423}; 2460};
2424 2461
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 492197e2f86c..4e2f03410377 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2606} 2606}
2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2608 2608
2609/*
2610 * The total entries in the ring buffer is the running counter
2611 * of entries entered into the ring buffer, minus the sum of
2612 * the entries read from the ring buffer and the number of
2613 * entries that were overwritten.
2614 */
2615static inline unsigned long
2616rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2617{
2618 return local_read(&cpu_buffer->entries) -
2619 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2620}
2621
2609/** 2622/**
2610 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2623 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2611 * @buffer: The ring buffer 2624 * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2614unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2627unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2615{ 2628{
2616 struct ring_buffer_per_cpu *cpu_buffer; 2629 struct ring_buffer_per_cpu *cpu_buffer;
2617 unsigned long ret;
2618 2630
2619 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2631 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2620 return 0; 2632 return 0;
2621 2633
2622 cpu_buffer = buffer->buffers[cpu]; 2634 cpu_buffer = buffer->buffers[cpu];
2623 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2624 - cpu_buffer->read;
2625 2635
2626 return ret; 2636 return rb_num_of_entries(cpu_buffer);
2627} 2637}
2628EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2629 2639
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684 /* if you care about this being correct, lock the buffer */ 2694 /* if you care about this being correct, lock the buffer */
2685 for_each_buffer_cpu(buffer, cpu) { 2695 for_each_buffer_cpu(buffer, cpu) {
2686 cpu_buffer = buffer->buffers[cpu]; 2696 cpu_buffer = buffer->buffers[cpu];
2687 entries += (local_read(&cpu_buffer->entries) - 2697 entries += rb_num_of_entries(cpu_buffer);
2688 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2689 } 2698 }
2690 2699
2691 return entries; 2700 return entries;
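
As a concrete check of rb_num_of_entries(): if a per-CPU buffer has accepted 1000 events since creation, 150 of them have been overwritten and the reader has already consumed 300, the helper reports 1000 - (150 + 300) = 550 entries still sitting in the buffer. Both ring_buffer_entries_cpu() and ring_buffer_entries() above now share that single formula instead of open-coding it.
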
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 31cc4cb0dbf2..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12static char *perf_trace_buf[4]; 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 13
14/* 14/*
15 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int total_ref_count;
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 struct hlist_head *list; 27 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 28 int ret = -ENOMEM;
29 int cpu; 29 int cpu;
30 30
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
42 tp_event->perf_events = list; 42 tp_event->perf_events = list;
43 43
44 if (!total_ref_count) { 44 if (!total_ref_count) {
45 char *buf; 45 char __percpu *buf;
46 int i; 46 int i;
47 47
48 for (i = 0; i < 4; i++) { 48 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
49 buf = (char *)alloc_percpu(perf_trace_t); 49 buf = (char __percpu *)alloc_percpu(perf_trace_t);
50 if (!buf) 50 if (!buf)
51 goto fail; 51 goto fail;
52 52
@@ -65,7 +65,7 @@ fail:
65 if (!total_ref_count) { 65 if (!total_ref_count) {
66 int i; 66 int i;
67 67
68 for (i = 0; i < 4; i++) { 68 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
69 free_percpu(perf_trace_buf[i]); 69 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL; 70 perf_trace_buf[i] = NULL;
71 } 71 }
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
101 return ret; 101 return ret;
102} 102}
103 103
104int perf_trace_enable(struct perf_event *p_event) 104int perf_trace_add(struct perf_event *p_event, int flags)
105{ 105{
106 struct ftrace_event_call *tp_event = p_event->tp_event; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
107 struct hlist_head __percpu *pcpu_list;
107 struct hlist_head *list; 108 struct hlist_head *list;
108 109
109 list = tp_event->perf_events; 110 pcpu_list = tp_event->perf_events;
110 if (WARN_ON_ONCE(!list)) 111 if (WARN_ON_ONCE(!pcpu_list))
111 return -EINVAL; 112 return -EINVAL;
112 113
113 list = this_cpu_ptr(list); 114 if (!(flags & PERF_EF_START))
115 p_event->hw.state = PERF_HES_STOPPED;
116
117 list = this_cpu_ptr(pcpu_list);
114 hlist_add_head_rcu(&p_event->hlist_entry, list); 118 hlist_add_head_rcu(&p_event->hlist_entry, list);
115 119
116 return 0; 120 return 0;
117} 121}
118 122
119void perf_trace_disable(struct perf_event *p_event) 123void perf_trace_del(struct perf_event *p_event, int flags)
120{ 124{
121 hlist_del_rcu(&p_event->hlist_entry); 125 hlist_del_rcu(&p_event->hlist_entry);
122} 126}
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event)
142 tp_event->perf_events = NULL; 146 tp_event->perf_events = NULL;
143 147
144 if (!--total_ref_count) { 148 if (!--total_ref_count) {
145 for (i = 0; i < 4; i++) { 149 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
146 free_percpu(perf_trace_buf[i]); 150 free_percpu(perf_trace_buf[i]);
147 perf_trace_buf[i] = NULL; 151 perf_trace_buf[i] = NULL;
148 } 152 }
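
perf_trace_add() and perf_trace_del() are the tracepoint PMU's new ->add()/->del() callbacks, so they follow the flag protocol introduced by this series: ->add() must leave the event in the PERF_HES_STOPPED state unless PERF_EF_START is passed, and counting then begins only on a later ->start(). A generic sketch of that contract for any pmu; demo_pmu_add and demo_hw_start are stand-ins, not functions from this patch:

#include <linux/perf_event.h>

static void demo_hw_start(struct perf_event *event)
{
	/* program the counter hardware here (omitted in this sketch) */
}

static int demo_pmu_add(struct perf_event *event, int flags)
{
	event->hw.state = 0;

	if (flags & PERF_EF_START)
		demo_hw_start(event);			/* count right away */
	else
		event->hw.state = PERF_HES_STOPPED;	/* wait for ->start() */

	return 0;
}
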
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..398c0e8b332c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
600 600
601enum { 601enum {
602 FORMAT_HEADER = 1, 602 FORMAT_HEADER = 1,
603 FORMAT_PRINTFMT = 2, 603 FORMAT_FIELD_SEPERATOR = 2,
604 FORMAT_PRINTFMT = 3,
604}; 605};
605 606
606static void *f_next(struct seq_file *m, void *v, loff_t *pos) 607static void *f_next(struct seq_file *m, void *v, loff_t *pos)
607{ 608{
608 struct ftrace_event_call *call = m->private; 609 struct ftrace_event_call *call = m->private;
609 struct ftrace_event_field *field; 610 struct ftrace_event_field *field;
610 struct list_head *head; 611 struct list_head *common_head = &ftrace_common_fields;
612 struct list_head *head = trace_get_fields(call);
611 613
612 (*pos)++; 614 (*pos)++;
613 615
614 switch ((unsigned long)v) { 616 switch ((unsigned long)v) {
615 case FORMAT_HEADER: 617 case FORMAT_HEADER:
616 head = &ftrace_common_fields; 618 if (unlikely(list_empty(common_head)))
619 return NULL;
620
621 field = list_entry(common_head->prev,
622 struct ftrace_event_field, link);
623 return field;
617 624
625 case FORMAT_FIELD_SEPERATOR:
618 if (unlikely(list_empty(head))) 626 if (unlikely(list_empty(head)))
619 return NULL; 627 return NULL;
620 628
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
626 return NULL; 634 return NULL;
627 } 635 }
628 636
629 head = trace_get_fields(call);
630
631 /*
632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
636
637 field = v; 637 field = v;
638 /* 638 if (field->link.prev == common_head)
639 * If this is a common field, and at the end of the list, then 639 return (void *)FORMAT_FIELD_SEPERATOR;
640 * continue with main list. 640 else if (field->link.prev == head)
641 */
642 if (field->link.prev == &ftrace_common_fields) {
643 if (unlikely(list_empty(head)))
644 return NULL;
645 field = list_entry(head->prev, struct ftrace_event_field, link);
646 /* Set the LSB to notify f_show to print an extra newline */
647 field = (struct ftrace_event_field *)
648 ((unsigned long)field | 1);
649 return field;
650 }
651
652 /* If we are done tell f_show to print the format */
653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT; 641 return (void *)FORMAT_PRINTFMT;
655 642
656 field = list_entry(field->link.prev, struct ftrace_event_field, link); 643 field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
688 seq_printf(m, "format:\n"); 675 seq_printf(m, "format:\n");
689 return 0; 676 return 0;
690 677
678 case FORMAT_FIELD_SEPERATOR:
679 seq_putc(m, '\n');
680 return 0;
681
691 case FORMAT_PRINTFMT: 682 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n", 683 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt); 684 call->print_fmt);
694 return 0; 685 return 0;
695 } 686 }
696 687
697 /*
698 * To separate common fields from event fields, the
699 * LSB is set on the first event field. Clear it and
700 * print a newline if it is set.
701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v; 688 field = v;
708 689
709 /* 690 /*
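
The trace_events.c hunks above drop the LSB pointer-tagging trick and instead have f_next() hand out an explicit FORMAT_FIELD_SEPERATOR token between the common fields and the event fields, which f_show() turns into a blank line. A rough userspace sketch of the same token-driven walk, with made-up event and field names, could look like this:

/*
 * Userspace sketch of the separator-token approach: header, common
 * fields, an explicit separator, event fields, then the print format.
 * Names and data are invented for illustration.
 */
#include <stdio.h>
#include <stddef.h>

enum { TOK_HEADER, TOK_COMMON, TOK_SEPARATOR, TOK_EVENT, TOK_PRINTFMT, TOK_DONE };

static const char *common_fields[] = { "common_type", "common_pid" };
static const char *event_fields[]  = { "prev_comm", "next_comm" };
static const char *print_fmt = "\"prev=%s next=%s\"";

int main(void)
{
	int tok = TOK_HEADER;
	size_t i = 0;

	while (tok != TOK_DONE) {
		switch (tok) {
		case TOK_HEADER:
			printf("name: fake_event\nformat:\n");
			tok = TOK_COMMON;
			break;
		case TOK_COMMON:
			printf("\tfield:%s;\n", common_fields[i]);
			if (++i == sizeof(common_fields) / sizeof(common_fields[0])) {
				i = 0;
				tok = TOK_SEPARATOR;
			}
			break;
		case TOK_SEPARATOR:	/* the explicit blank-line token */
			putchar('\n');
			tok = TOK_EVENT;
			break;
		case TOK_EVENT:
			printf("\tfield:%s;\n", event_fields[i]);
			if (++i == sizeof(event_fields) / sizeof(event_fields[0]))
				tok = TOK_PRINTFMT;
			break;
		case TOK_PRINTFMT:
			printf("\nprint fmt: %s\n", print_fmt);
			tok = TOK_DONE;
			break;
		}
	}
	return 0;
}

The explicit token keeps the separator logic in one place instead of smuggling it through the low bit of a field pointer.
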
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518e..ef49e9370b25 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -649,8 +665,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
649 665
650 /* Print nsecs (we don't want to exceed 7 numbers) */ 666 /* Print nsecs (we don't want to exceed 7 numbers) */
651 if (len < 7) { 667 if (len < 7) {
652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", 668 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
653 nsecs_rem); 669
670 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
654 ret = trace_seq_printf(s, ".%s", nsecs_str); 671 ret = trace_seq_printf(s, ".%s", nsecs_str);
655 if (!ret) 672 if (!ret)
656 return TRACE_TYPE_PARTIAL_LINE; 673 return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +872,92 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
855 return 0; 872 return 0;
856} 873}
857 874
875/*
876 * Entry check for irq code
877 *
878 * returns 1 if
879 * - we are inside irq code
 880 * - we just entered irq code
881 *
 882 * returns 0 if
883 * - funcgraph-interrupts option is set
884 * - we are not inside irq code
885 */
886static int
887check_irq_entry(struct trace_iterator *iter, u32 flags,
888 unsigned long addr, int depth)
889{
890 int cpu = iter->cpu;
891 struct fgraph_data *data = iter->private;
892 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
893
894 if (flags & TRACE_GRAPH_PRINT_IRQS)
895 return 0;
896
897 /*
898 * We are inside the irq code
899 */
900 if (*depth_irq >= 0)
901 return 1;
902
903 if ((addr < (unsigned long)__irqentry_text_start) ||
904 (addr >= (unsigned long)__irqentry_text_end))
905 return 0;
906
907 /*
908 * We are entering irq code.
909 */
910 *depth_irq = depth;
911 return 1;
912}
913
914/*
915 * Return check for irq code
916 *
917 * returns 1 if
918 * - we are inside irq code
919 * - we just left irq code
920 *
921 * returns 0 if
922 * - funcgraph-interrupts option is set
923 * - we are not inside irq code
924 */
925static int
926check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
927{
928 int cpu = iter->cpu;
929 struct fgraph_data *data = iter->private;
930 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
931
932 if (flags & TRACE_GRAPH_PRINT_IRQS)
933 return 0;
934
935 /*
936 * We are not inside the irq code.
937 */
938 if (*depth_irq == -1)
939 return 0;
940
941 /*
942 * We are inside the irq code, and this is returning entry.
943 * Let's not trace it and clear the entry depth, since
944 * we are out of irq code.
945 *
946 * This condition ensures that we 'leave the irq code' once
947 * we are out of the entry depth. Thus protecting us from
948 * the RETURN entry loss.
949 */
950 if (*depth_irq >= depth) {
951 *depth_irq = -1;
952 return 1;
953 }
954
955 /*
956 * We are inside the irq code, and this is not the entry.
957 */
958 return 1;
959}
960
858static enum print_line_t 961static enum print_line_t
859print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 962print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
860 struct trace_iterator *iter, u32 flags) 963 struct trace_iterator *iter, u32 flags)
@@ -865,6 +968,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
865 static enum print_line_t ret; 968 static enum print_line_t ret;
866 int cpu = iter->cpu; 969 int cpu = iter->cpu;
867 970
971 if (check_irq_entry(iter, flags, call->func, call->depth))
972 return TRACE_TYPE_HANDLED;
973
868 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 974 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
869 return TRACE_TYPE_PARTIAL_LINE; 975 return TRACE_TYPE_PARTIAL_LINE;
870 976
@@ -902,6 +1008,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
902 int ret; 1008 int ret;
903 int i; 1009 int i;
904 1010
1011 if (check_irq_return(iter, flags, trace->depth))
1012 return TRACE_TYPE_HANDLED;
1013
905 if (data) { 1014 if (data) {
906 struct fgraph_cpu_data *cpu_data; 1015 struct fgraph_cpu_data *cpu_data;
907 int cpu = iter->cpu; 1016 int cpu = iter->cpu;
@@ -1210,9 +1319,12 @@ void graph_trace_open(struct trace_iterator *iter)
1210 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1319 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1211 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1320 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1212 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1321 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1322 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1323
1213 *pid = -1; 1324 *pid = -1;
1214 *depth = 0; 1325 *depth = 0;
1215 *ignore = 0; 1326 *ignore = 0;
1327 *depth_irq = -1;
1216 } 1328 }
1217 1329
1218 iter->private = data; 1330 iter->private = data;
@@ -1235,6 +1347,14 @@ void graph_trace_close(struct trace_iterator *iter)
1235 } 1347 }
1236} 1348}
1237 1349
1350static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1351{
1352 if (bit == TRACE_GRAPH_PRINT_IRQS)
1353 ftrace_graph_skip_irqs = !set;
1354
1355 return 0;
1356}
1357
1238static struct trace_event_functions graph_functions = { 1358static struct trace_event_functions graph_functions = {
1239 .trace = print_graph_function_event, 1359 .trace = print_graph_function_event,
1240}; 1360};
@@ -1261,6 +1381,7 @@ static struct tracer graph_trace __read_mostly = {
1261 .print_line = print_graph_function, 1381 .print_line = print_graph_function,
1262 .print_header = print_graph_headers, 1382 .print_header = print_graph_headers,
1263 .flags = &tracer_flags, 1383 .flags = &tracer_flags,
1384 .set_flag = func_graph_set_flag,
1264#ifdef CONFIG_FTRACE_SELFTEST 1385#ifdef CONFIG_FTRACE_SELFTEST
1265 .selftest = trace_selftest_startup_function_graph, 1386 .selftest = trace_selftest_startup_function_graph,
1266#endif 1387#endif
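
The depth_irq bookkeeping added above hides an entire irq subtree from the function-graph output: once an entry lands in the __irqentry_text range, everything is suppressed until the return that comes back to the recorded depth. A small userspace simulation of that filter, driven by a made-up trace and stand-in address range, behaves the same way:

/*
 * Userspace sketch of check_irq_entry()/check_irq_return(): once a
 * function entry lands inside the (simulated) irq text range, every
 * nested entry/return is filtered until the matching-depth return.
 */
#include <stdio.h>

#define IRQ_START 0x1000UL	/* stand-ins for __irqentry_text_start/end */
#define IRQ_END   0x2000UL

static int depth_irq = -1;	/* -1: not inside irq code */

static int irq_entry_filtered(unsigned long addr, int depth)
{
	if (depth_irq >= 0)
		return 1;			/* already inside irq code */
	if (addr < IRQ_START || addr >= IRQ_END)
		return 0;			/* normal function entry   */
	depth_irq = depth;			/* entering irq code       */
	return 1;
}

static int irq_return_filtered(int depth)
{
	if (depth_irq == -1)
		return 0;			/* not inside irq code */
	if (depth_irq >= depth) {
		depth_irq = -1;			/* leaving irq code    */
		return 1;
	}
	return 1;				/* nested irq return   */
}

int main(void)
{
	struct { char type; unsigned long addr; int depth; } trace[] = {
		{ 'E', 0x0100, 0 },	/* normal function        */
		{ 'E', 0x1234, 1 },	/* irq handler entry      */
		{ 'E', 0x0200, 2 },	/* nested call inside irq */
		{ 'R', 0,      2 },
		{ 'R', 0,      1 },	/* irq handler return     */
		{ 'R', 0,      0 },	/* back in normal code    */
	};

	for (size_t i = 0; i < sizeof(trace) / sizeof(trace[0]); i++) {
		int hide = trace[i].type == 'E'
			? irq_entry_filtered(trace[i].addr, trace[i].depth)
			: irq_return_filtered(trace[i].depth);
		printf("%c depth=%d %s\n", trace[i].type, trace[i].depth,
		       hide ? "(filtered)" : "(printed)");
	}
	return 0;
}

Only the first entry and the final return are printed; the irq handler and its nested call are filtered, which is what clearing the new funcgraph-irqs option is meant to achieve.
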
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
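
The trace_workqueue.c hunk above moves the per-CPU lock and list initialization ahead of the tracepoint registration, so a probe can never fire against an uninitialized spinlock. A loose userspace analogy using pthreads (names invented): initialize the shared state first, then start whatever may call into it:

/*
 * Userspace analogy only, not kernel code: the "probe" thread uses a
 * mutex that must be initialized before the thread is started.
 */
#include <pthread.h>
#include <stdio.h>

struct cpu_stat {
	pthread_mutex_t lock;
	int events;
};

static struct cpu_stat stat0;

/* Stand-in for a probe that may fire as soon as it is "registered". */
static void *probe_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&stat0.lock);	/* safe only if already initialized */
	stat0.events++;
	pthread_mutex_unlock(&stat0.lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	/* Initialize shared state first... */
	pthread_mutex_init(&stat0.lock, NULL);
	stat0.events = 0;

	/* ...then "register" the probe (start the thread that uses it). */
	pthread_create(&tid, NULL, probe_thread, NULL);
	pthread_join(tid, NULL);

	printf("events recorded: %d\n", stat0.events);
	return 0;
}

Build with -lpthread; swapping the two steps in main() would be the analogue of the ordering bug the hunk fixes.
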
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..d6073a50a6ca 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint __start___tracepoints[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (!elem->state && active) {
268 enable_jump_label(&elem->state);
269 elem->state = active;
270 } else if (elem->state && !active) {
271 disable_jump_label(&elem->state);
272 elem->state = active;
273 }
267} 274}
268 275
269/* 276/*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
277 if (elem->unregfunc && elem->state) 284 if (elem->unregfunc && elem->state)
278 elem->unregfunc(); 285 elem->unregfunc();
279 286
280 elem->state = 0; 287 if (elem->state) {
288 disable_jump_label(&elem->state);
289 elem->state = 0;
290 }
281 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
282} 292}
283 293
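
The tracepoint.c hunk above only touches the jump label on actual state transitions (0 -> active and active -> 0), so repeated activations or deactivations never re-patch the call site. A userspace sketch of that edge-triggered toggling, with a counter standing in for enable_jump_label()/disable_jump_label(), is:

/*
 * Userspace sketch: stub "patch" calls happen only on state edges,
 * mirroring the new if/else if structure in set_tracepoint().
 */
#include <stdio.h>

static int patch_count;	/* how many times we "patched" the call site */

static void stub_enable_jump_label(int *state)  { (void)state; patch_count++; }
static void stub_disable_jump_label(int *state) { (void)state; patch_count++; }

struct fake_tracepoint { int state; };

static void set_state(struct fake_tracepoint *tp, int active)
{
	if (!tp->state && active) {
		stub_enable_jump_label(&tp->state);	/* 0 -> active edge */
		tp->state = active;
	} else if (tp->state && !active) {
		stub_disable_jump_label(&tp->state);	/* active -> 0 edge */
		tp->state = active;
	}
	/* No patching when the state does not actually change. */
}

int main(void)
{
	struct fake_tracepoint tp = { 0 };

	set_state(&tp, 1);	/* patches once      */
	set_state(&tp, 1);	/* no-op, still on   */
	set_state(&tp, 0);	/* patches once more */
	set_state(&tp, 0);	/* no-op, still off  */

	printf("state=%d, patch operations=%d\n", tp.state, patch_count);
	return 0;
}

It reports two patch operations for four set_state() calls, which is the invariant the new code preserves.
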
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9c3c52ecc1..dc8e16824b51 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog; 46static int __initdata no_watchdog;
48 47
49 48
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
187 return 0; 186 return 0;
188} 187}
189 188
190static int
191watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
192{
193 did_panic = 1;
194
195 return NOTIFY_DONE;
196}
197
198static struct notifier_block panic_block = {
199 .notifier_call = watchdog_panic,
200};
201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 189#ifdef CONFIG_HARDLOCKUP_DETECTOR
203static struct perf_event_attr wd_hw_attr = { 190static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 191 .type = PERF_TYPE_HARDWARE,
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
371 /* Try to register using hardware perf events */ 358 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 359 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(); 360 wd_attr->sample_period = hw_nmi_get_sample_period();
374 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); 361 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
375 if (!IS_ERR(event)) { 362 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 363 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 364 goto out_save;
378 } 365 }
379 366
380 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
381 return -1; 368 return PTR_ERR(event);
382 369
383 /* success path */ 370 /* success path */
384out_save: 371out_save:
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
422static int watchdog_enable(int cpu) 409static int watchdog_enable(int cpu)
423{ 410{
424 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err;
425 413
426 /* enable the perf event */ 414 /* enable the perf event */
427 if (watchdog_nmi_enable(cpu) != 0) 415 err = watchdog_nmi_enable(cpu);
428 return -1; 416 if (err)
417 return err;
429 418
430 /* create the watchdog thread */ 419 /* create the watchdog thread */
431 if (!p) { 420 if (!p) {
432 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
433 if (IS_ERR(p)) { 422 if (IS_ERR(p)) {
434 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
435 return -1; 424 return PTR_ERR(p);
436 } 425 }
437 kthread_bind(p, cpu); 426 kthread_bind(p, cpu);
438 per_cpu(watchdog_touch_ts, cpu) = 0; 427 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
484{ 473{
485 int cpu; 474 int cpu;
486 475
476 if (no_watchdog)
477 return;
478
487 for_each_online_cpu(cpu) 479 for_each_online_cpu(cpu)
488 watchdog_disable(cpu); 480 watchdog_disable(cpu);
489 481
@@ -526,17 +518,16 @@ static int __cpuinit
526cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 518cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
527{ 519{
528 int hotcpu = (unsigned long)hcpu; 520 int hotcpu = (unsigned long)hcpu;
521 int err = 0;
529 522
530 switch (action) { 523 switch (action) {
531 case CPU_UP_PREPARE: 524 case CPU_UP_PREPARE:
532 case CPU_UP_PREPARE_FROZEN: 525 case CPU_UP_PREPARE_FROZEN:
533 if (watchdog_prepare_cpu(hotcpu)) 526 err = watchdog_prepare_cpu(hotcpu);
534 return NOTIFY_BAD;
535 break; 527 break;
536 case CPU_ONLINE: 528 case CPU_ONLINE:
537 case CPU_ONLINE_FROZEN: 529 case CPU_ONLINE_FROZEN:
538 if (watchdog_enable(hotcpu)) 530 err = watchdog_enable(hotcpu);
539 return NOTIFY_BAD;
540 break; 531 break;
541#ifdef CONFIG_HOTPLUG_CPU 532#ifdef CONFIG_HOTPLUG_CPU
542 case CPU_UP_CANCELED: 533 case CPU_UP_CANCELED:
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
549 break; 540 break;
550#endif /* CONFIG_HOTPLUG_CPU */ 541#endif /* CONFIG_HOTPLUG_CPU */
551 } 542 }
552 return NOTIFY_OK; 543 return notifier_from_errno(err);
553} 544}
554 545
555static struct notifier_block __cpuinitdata cpu_nfb = { 546static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void)
565 return 0; 556 return 0;
566 557
567 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
568 WARN_ON(err == NOTIFY_BAD); 559 WARN_ON(notifier_to_errno(err));
569 560
570 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
571 register_cpu_notifier(&cpu_nfb); 562 register_cpu_notifier(&cpu_nfb);
572 563
573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
574
575 return 0; 564 return 0;
576} 565}
577early_initcall(spawn_watchdog_task); 566early_initcall(spawn_watchdog_task);
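
The watchdog.c changes above stop collapsing failures into -1/NOTIFY_BAD and instead propagate real error codes, letting cpu_callback() finish with notifier_from_errno(err). A userspace illustration of that encoding and its inverse follows; the constants and helper bodies mirror my reading of include/linux/notifier.h from this era and are re-typed here purely for illustration, so treat them as a sketch rather than a reference:

/*
 * Userspace sketch of the errno <-> notifier return value round trip.
 */
#include <stdio.h>
#include <errno.h>

#define NOTIFY_DONE		0x0000
#define NOTIFY_OK		0x0001
#define NOTIFY_STOP_MASK	0x8000
#define NOTIFY_BAD		(NOTIFY_STOP_MASK | 0x0002)

static int demo_notifier_from_errno(int err)
{
	return err ? (NOTIFY_STOP_MASK | (NOTIFY_OK - err)) : NOTIFY_OK;
}

static int demo_notifier_to_errno(int ret)
{
	ret &= ~NOTIFY_STOP_MASK;
	return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
}

int main(void)
{
	int ok   = demo_notifier_from_errno(0);
	int fail = demo_notifier_from_errno(-ENOMEM);

	printf("success: ret=0x%04x -> errno %d\n", ok, demo_notifier_to_errno(ok));
	printf("failure: ret=0x%04x -> errno %d\n", fail, demo_notifier_to_errno(fail));
	return 0;
}

On a typical Linux system this prints 0x0001 -> 0 for success and 0x800d -> -12 for -ENOMEM, so the WARN_ON(notifier_to_errno(err)) in spawn_watchdog_task() sees the original error code rather than an opaque NOTIFY_BAD.
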