author     Linus Torvalds <torvalds@linux-foundation.org>   2009-12-05 18:30:21 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-12-05 18:30:21 -0500
commit     c3fa27d1367fac63ac8533d6f20ea851d0d70a10 (patch)
tree       e7731554085e22b6b63411b1ebb401079f3e0bbb /kernel
parent     96fa2b508d2d3fe040cf4ef2fffb955f0a537ea1 (diff)
parent     d103d01e4b19f185d3c85f77402b605534c32e89 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (470 commits)
  x86: Fix comments of register/stack access functions
  perf tools: Replace %m with %a in sscanf
  hw-breakpoints: Keep track of user disabled breakpoints
  tracing/syscalls: Make syscall events print callbacks static
  tracing: Add DEFINE_EVENT(), DEFINE_SINGLE_EVENT() support to docbook
  perf: Don't free perf_mmap_data until work has been done
  perf_event: Fix compile error
  perf tools: Fix _GNU_SOURCE macro related strndup() build error
  trace_syscalls: Remove unused syscall_name_to_nr()
  trace_syscalls: Simplify syscall profile
  trace_syscalls: Remove duplicate init_enter_##sname()
  trace_syscalls: Add syscall_nr field to struct syscall_metadata
  trace_syscalls: Remove enter_id exit_id
  trace_syscalls: Set event_enter_##sname->data to its metadata
  trace_syscalls: Remove unused event_syscall_enter and event_syscall_exit
  perf_event: Initialize data.period in perf_swevent_hrtimer()
  perf probe: Simplify event naming
  perf probe: Add --list option for listing current probe events
  perf probe: Add argv_split() from lib/argv_split.c
  perf probe: Move probe event utility functions to probe-event.c
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                     |    2
-rw-r--r--  kernel/exit.c                       |    5
-rw-r--r--  kernel/hw_breakpoint.c              |  423
-rw-r--r--  kernel/kallsyms.c                   |    1
-rw-r--r--  kernel/kprobes.c                    |   68
-rw-r--r--  kernel/lockdep.c                    |    2
-rw-r--r--  kernel/notifier.c                   |    2
-rw-r--r--  kernel/perf_event.c                 |  627
-rw-r--r--  kernel/signal.c                     |   27
-rw-r--r--  kernel/trace/Kconfig                |   38
-rw-r--r--  kernel/trace/Makefile               |    2
-rw-r--r--  kernel/trace/ring_buffer.c          |   15
-rw-r--r--  kernel/trace/trace.h                |   38
-rw-r--r--  kernel/trace/trace_entries.h        |   16
-rw-r--r--  kernel/trace/trace_event_profile.c  |   43
-rw-r--r--  kernel/trace/trace_events.c         |  168
-rw-r--r--  kernel/trace/trace_events_filter.c  |  310
-rw-r--r--  kernel/trace/trace_export.c         |   39
-rw-r--r--  kernel/trace/trace_kprobe.c         | 1523
-rw-r--r--  kernel/trace/trace_ksym.c           |  550
-rw-r--r--  kernel/trace/trace_selftest.c       |   55
-rw-r--r--  kernel/trace/trace_syscalls.c       |  195
22 files changed, 3574 insertions, 575 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index dcf6789bf547..982c50e2ce53 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -97,6 +98,7 @@ obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..3f45e3cf931d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -978,6 +979,10 @@ NORET_TYPE void do_exit(long code)
         proc_exit_connector(tsk);
 
         /*
+         * FIXME: do that only when needed, using sched_exit tracepoint
+         */
+        flush_ptrace_hw_breakpoint(tsk);
+        /*
          * Flush inherited counters to the parent - before the parent
          * gets woken up by child-exit notifications.
          */
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..cf5ee1628411
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,423 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <linux/hw_breakpoint.h>
+
+/*
+ * Constraints data
+ */
+
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+        unsigned int pinned;
+        unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * have in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu)
+{
+        int i;
+        unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
+
+        for (i = HBP_NUM -1; i >= 0; i--) {
+                if (tsk_pinned[i] > 0)
+                        return i + 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+{
+        if (cpu >= 0) {
+                slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+                slots->pinned += max_task_bp_pinned(cpu);
+                slots->flexible = per_cpu(nr_bp_flexible, cpu);
+
+                return;
+        }
+
+        for_each_online_cpu(cpu) {
+                unsigned int nr;
+
+                nr = per_cpu(nr_cpu_bp_pinned, cpu);
+                nr += max_task_bp_pinned(cpu);
+
+                if (nr > slots->pinned)
+                        slots->pinned = nr;
+
+                nr = per_cpu(nr_bp_flexible, cpu);
+
+                if (nr > slots->flexible)
+                        slots->flexible = nr;
+        }
+}
+
+/*
+ * Add a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+        int count = 0;
+        struct perf_event *bp;
+        struct perf_event_context *ctx = tsk->perf_event_ctxp;
+        unsigned int *tsk_pinned;
+        struct list_head *list;
+        unsigned long flags;
+
+        if (WARN_ONCE(!ctx, "No perf context for this task"))
+                return;
+
+        list = &ctx->event_list;
+
+        spin_lock_irqsave(&ctx->lock, flags);
+
+        /*
+         * The current breakpoint counter is not included in the list
+         * at the open() callback time
+         */
+        list_for_each_entry(bp, list, event_entry) {
+                if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+                        count++;
+        }
+
+        spin_unlock_irqrestore(&ctx->lock, flags);
+
+        if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+                return;
+
+        tsk_pinned = per_cpu(task_bp_pinned, cpu);
+        if (enable) {
+                tsk_pinned[count]++;
+                if (count > 0)
+                        tsk_pinned[count-1]--;
+        } else {
+                tsk_pinned[count]--;
+                if (count > 0)
+                        tsk_pinned[count-1]++;
+        }
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
+{
+        int cpu = bp->cpu;
+        struct task_struct *tsk = bp->ctx->task;
+
+        /* Pinned counter task profiling */
+        if (tsk) {
+                if (cpu >= 0) {
+                        toggle_bp_task_slot(tsk, cpu, enable);
+                        return;
+                }
+
+                for_each_online_cpu(cpu)
+                        toggle_bp_task_slot(tsk, cpu, enable);
+                return;
+        }
+
+        /* Pinned counter cpu profiling */
+        if (enable)
+                per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+        else
+                per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+}
+
+/*
+ * Contraints to check before allowing this new breakpoint counter:
+ *
+ *  == Non-pinned counter == (Considered as pinned for now)
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ *           + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ *       -> If there are already non-pinned counters in this cpu, it means
+ *          there is already a free slot for them.
+ *          Otherwise, we check that the maximum number of per task
+ *          breakpoints (for this cpu) plus the number of per cpu breakpoint
+ *          (for this cpu) doesn't cover every registers.
+ *
+ *   - If attached to every cpus, check:
+ *
+ *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ *           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *
+ *       -> This is roughly the same, except we check the number of per cpu
+ *          bp for every cpu and we keep the max one. Same for the per tasks
+ *          breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
+ *            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *
+ *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ *          one register at least (or they will never be fed).
+ *
+ *   - If attached to every cpus, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
+ *            + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ */
+int reserve_bp_slot(struct perf_event *bp)
+{
+        struct bp_busy_slots slots = {0};
+        int ret = 0;
+
+        mutex_lock(&nr_bp_mutex);
+
+        fetch_bp_busy_slots(&slots, bp->cpu);
+
+        /* Flexible counters need to keep at least one slot */
+        if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
+                ret = -ENOSPC;
+                goto end;
+        }
+
+        toggle_bp_slot(bp, true);
+
+end:
+        mutex_unlock(&nr_bp_mutex);
+
+        return ret;
+}
+
+void release_bp_slot(struct perf_event *bp)
+{
+        mutex_lock(&nr_bp_mutex);
+
+        toggle_bp_slot(bp, false);
+
+        mutex_unlock(&nr_bp_mutex);
+}
+
+
+int __register_perf_hw_breakpoint(struct perf_event *bp)
+{
+        int ret;
+
+        ret = reserve_bp_slot(bp);
+        if (ret)
+                return ret;
+
+        /*
+         * Ptrace breakpoints can be temporary perf events only
+         * meant to reserve a slot. In this case, it is created disabled and
+         * we don't want to check the params right now (as we put a null addr)
+         * But perf tools create events as disabled and we want to check
+         * the params for them.
+         * This is a quick hack that will be removed soon, once we remove
+         * the tmp breakpoints from ptrace
+         */
+        if (!bp->attr.disabled || bp->callback == perf_bp_event)
+                ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+
+        return ret;
+}
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+        bp->callback = perf_bp_event;
+
+        return __register_perf_hw_breakpoint(bp);
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+                            perf_callback_t triggered,
+                            struct task_struct *tsk)
+{
+        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @attr: new breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
+                          perf_callback_t triggered,
+                          struct task_struct *tsk)
+{
+        /*
+         * FIXME: do it without unregistering
+         * - We don't want to lose our slot
+         * - If the new bp is incorrect, don't lose the older one
+         */
+        unregister_hw_breakpoint(bp);
+
+        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+        if (!bp)
+                return;
+        perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ *
+ * @return a set of per_cpu pointers to perf events
+ */
+struct perf_event **
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+                            perf_callback_t triggered)
+{
+        struct perf_event **cpu_events, **pevent, *bp;
+        long err;
+        int cpu;
+
+        cpu_events = alloc_percpu(typeof(*cpu_events));
+        if (!cpu_events)
+                return ERR_PTR(-ENOMEM);
+
+        for_each_possible_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+
+                *pevent = bp;
+
+                if (IS_ERR(bp)) {
+                        err = PTR_ERR(bp);
+                        goto fail;
+                }
+        }
+
+        return cpu_events;
+
+fail:
+        for_each_possible_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                if (IS_ERR(*pevent))
+                        break;
+                unregister_hw_breakpoint(*pevent);
+        }
+        free_percpu(cpu_events);
+        /* return the error if any */
+        return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+{
+        int cpu;
+        struct perf_event **pevent;
+
+        for_each_possible_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                unregister_hw_breakpoint(*pevent);
+        }
+        free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+        .notifier_call = hw_breakpoint_exceptions_notify,
+        /* we need to be notified first */
+        .priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+        return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+core_initcall(init_hw_breakpoint);
+
+
+struct pmu perf_ops_bp = {
+        .enable         = arch_install_hw_breakpoint,
+        .disable        = arch_uninstall_hw_breakpoint,
+        .read           = hw_breakpoint_pmu_read,
+        .unthrottle     = hw_breakpoint_pmu_unthrottle
+};
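The new file above is the arch-independent half of the unified breakpoint layer. As a rough, hypothetical illustration only (not part of this commit), a kernel module might consume the wide-breakpoint API roughly as follows; the perf_event_attr breakpoint fields (bp_addr, bp_len, bp_type), the HW_BREAKPOINT_* constants and the exact perf_callback_t signature are assumptions taken from elsewhere in this series, not from the hunk above:

/* Hypothetical sketch: a write breakpoint on a kernel symbol, on every CPU. */
#include <linux/module.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/kallsyms.h>

static struct perf_event **wp;

/* assumed callback signature, matching perf_bp_event() later in this merge */
static void wp_triggered(struct perf_event *bp, void *data)
{
        pr_info("hw_breakpoint: watched symbol was written\n");
}

static int __init wp_init(void)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_BREAKPOINT,
                .size           = sizeof(attr),
                .bp_addr        = kallsyms_lookup_name("jiffies"), /* example target */
                .bp_len         = HW_BREAKPOINT_LEN_4,
                .bp_type        = HW_BREAKPOINT_W,
                .sample_period  = 1,
        };

        wp = register_wide_hw_breakpoint(&attr, wp_triggered);
        return IS_ERR(wp) ? PTR_ERR(wp) : 0;
}

static void __exit wp_exit(void)
{
        unregister_wide_hw_breakpoint(wp);
}

module_init(wp_init);
module_exit(wp_exit);
MODULE_LICENSE("GPL");

This per-CPU register/unregister pattern is roughly what the ksym tracer added elsewhere in this merge does internally.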
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
         }
         return module_kallsyms_lookup_name(name);
 }
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
                                       unsigned long),
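This one-line export lets modular code elsewhere in this merge resolve kernel symbol addresses at runtime (for example to aim a hardware breakpoint at a named variable). A minimal, hypothetical use from module code:

#include <linux/kallsyms.h>

/* Illustration only: returns the address of a kernel symbol, or 0 if unknown. */
static unsigned long resolve_symbol(const char *name)
{
        return kallsyms_lookup_name(name);
}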
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1494e85b35f2..e5342a344c43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
  */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
         {"preempt_schedule",},
+        {"native_get_debugreg",},
+        {"irq_entries_start",},
+        {"common_interrupt",},
         {NULL}    /* Terminator */
 };
 
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
         return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+        struct kprobe *old_p, *list_p;
+
+        old_p = get_kprobe(p->addr);
+        if (unlikely(!old_p))
+                return NULL;
+
+        if (p != old_p) {
+                list_for_each_entry_rcu(list_p, &old_p->list, list)
+                        if (list_p == p)
+                        /* kprobe p is a valid probe */
+                                goto valid;
+                return NULL;
+        }
+valid:
+        return old_p;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+        int ret = 0;
+        struct kprobe *old_p;
+
+        mutex_lock(&kprobe_mutex);
+        old_p = __get_valid_kprobe(p);
+        if (old_p)
+                ret = -EINVAL;
+        mutex_unlock(&kprobe_mutex);
+        return ret;
+}
+
 int __kprobes register_kprobe(struct kprobe *p)
 {
         int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
                 return -EINVAL;
         p->addr = addr;
 
+        ret = check_kprobe_rereg(p);
+        if (ret)
+                return ret;
+
         preempt_disable();
         if (!kernel_text_address((unsigned long) p->addr) ||
             in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-        struct kprobe *old_p, *list_p;
-
-        old_p = get_kprobe(p->addr);
-        if (unlikely(!old_p))
-                return NULL;
-
-        if (p != old_p) {
-                list_for_each_entry_rcu(list_p, &old_p->list, list)
-                        if (list_p == p)
-                        /* kprobe p is a valid probe */
-                                goto valid;
-                return NULL;
-        }
-valid:
-        return old_p;
-}
-
 /*
  * Unregister a kprobe without a scheduler synchronization.
  */
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
         arch_remove_kprobe(p);
 }
 
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+        printk(KERN_WARNING "Dumping kprobe:\n");
+        printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+               kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
                                              unsigned long val, void *data)
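With check_kprobe_rereg() wired into register_kprobe() above, registering the same kprobe object twice now fails cleanly with -EINVAL instead of silently re-adding the probe. A hypothetical sketch (not from the commit) of what a module would now observe:

#include <linux/module.h>
#include <linux/kprobes.h>

static int __kprobes demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        return 0;       /* just observe, let execution continue */
}

static struct kprobe kp = {
        .symbol_name    = "do_fork",    /* example probe point */
        .pre_handler    = demo_pre_handler,
};

static int __init kp_demo_init(void)
{
        int ret = register_kprobe(&kp);         /* first registration succeeds */

        if (!ret && register_kprobe(&kp) == -EINVAL)
                pr_info("kprobe re-registration correctly refused\n");

        return ret;
}

module_init(kp_demo_init);
MODULE_LICENSE("GPL");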
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..f5dcd36d3151 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
                struct pt_regs *regs, long err, int trap, int sig)
 {
         struct die_args args = {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7f29643c8985..6b7ddba1dd64 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -28,6 +28,8 @@
28#include <linux/anon_inodes.h> 28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
31 33
32#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
33 35
@@ -244,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
244 put_ctx(ctx); 246 put_ctx(ctx);
245} 247}
246 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for a event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
247/* 292/*
248 * Add a event from the lists for its context. 293 * Add a event from the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
@@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
292 if (event->group_leader != event) 337 if (event->group_leader != event)
293 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
294 339
340 update_event_times(event);
341
342 /*
343 * If event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
295 /* 352 /*
296 * If this was a group event with sibling events then 353 * If this was a group event with sibling events then
297 * upgrade the siblings to singleton events by adding them 354 * upgrade the siblings to singleton events by adding them
@@ -445,50 +502,11 @@ retry:
445 * can remove the event safely, if the call above did not 502 * can remove the event safely, if the call above did not
446 * succeed. 503 * succeed.
447 */ 504 */
448 if (!list_empty(&event->group_entry)) { 505 if (!list_empty(&event->group_entry))
449 list_del_event(event, ctx); 506 list_del_event(event, ctx);
450 }
451 spin_unlock_irq(&ctx->lock); 507 spin_unlock_irq(&ctx->lock);
452} 508}
453 509
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475 struct perf_event_context *ctx = event->ctx;
476 u64 run_end;
477
478 if (event->state < PERF_EVENT_STATE_INACTIVE ||
479 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480 return;
481
482 event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484 if (event->state == PERF_EVENT_STATE_INACTIVE)
485 run_end = event->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 event->total_time_running = run_end - event->tstamp_running;
490}
491
492/* 510/*
493 * Update total_time_enabled and total_time_running for all events in a group. 511 * Update total_time_enabled and total_time_running for all events in a group.
494 */ 512 */
@@ -1031,10 +1049,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1031 update_context_time(ctx); 1049 update_context_time(ctx);
1032 1050
1033 perf_disable(); 1051 perf_disable();
1034 if (ctx->nr_active) 1052 if (ctx->nr_active) {
1035 list_for_each_entry(event, &ctx->group_list, group_entry) 1053 list_for_each_entry(event, &ctx->group_list, group_entry)
1036 group_sched_out(event, cpuctx, ctx); 1054 group_sched_out(event, cpuctx, ctx);
1037 1055 }
1038 perf_enable(); 1056 perf_enable();
1039 out: 1057 out:
1040 spin_unlock(&ctx->lock); 1058 spin_unlock(&ctx->lock);
@@ -1059,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1059 && !ctx1->pin_count && !ctx2->pin_count; 1077 && !ctx1->pin_count && !ctx2->pin_count;
1060} 1078}
1061 1079
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event, 1080static void __perf_event_sync_stat(struct perf_event *event,
1065 struct perf_event *next_event) 1081 struct perf_event *next_event)
1066{ 1082{
@@ -1078,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1078 */ 1094 */
1079 switch (event->state) { 1095 switch (event->state) {
1080 case PERF_EVENT_STATE_ACTIVE: 1096 case PERF_EVENT_STATE_ACTIVE:
1081 __perf_event_read(event); 1097 event->pmu->read(event);
1082 break; 1098 /* fall-through */
1083 1099
1084 case PERF_EVENT_STATE_INACTIVE: 1100 case PERF_EVENT_STATE_INACTIVE:
1085 update_event_times(event); 1101 update_event_times(event);
@@ -1118,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1118 if (!ctx->nr_stat) 1134 if (!ctx->nr_stat)
1119 return; 1135 return;
1120 1136
1137 update_context_time(ctx);
1138
1121 event = list_first_entry(&ctx->event_list, 1139 event = list_first_entry(&ctx->event_list,
1122 struct perf_event, event_entry); 1140 struct perf_event, event_entry);
1123 1141
@@ -1161,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1161 if (likely(!ctx || !cpuctx->task_ctx)) 1179 if (likely(!ctx || !cpuctx->task_ctx))
1162 return; 1180 return;
1163 1181
1164 update_context_time(ctx);
1165
1166 rcu_read_lock(); 1182 rcu_read_lock();
1167 parent = rcu_dereference(ctx->parent_ctx); 1183 parent = rcu_dereference(ctx->parent_ctx);
1168 next_ctx = next->perf_event_ctxp; 1184 next_ctx = next->perf_event_ctxp;
@@ -1515,7 +1531,6 @@ static void __perf_event_read(void *info)
1515 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1531 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516 struct perf_event *event = info; 1532 struct perf_event *event = info;
1517 struct perf_event_context *ctx = event->ctx; 1533 struct perf_event_context *ctx = event->ctx;
1518 unsigned long flags;
1519 1534
1520 /* 1535 /*
1521 * If this is a task context, we need to check whether it is 1536 * If this is a task context, we need to check whether it is
@@ -1527,12 +1542,12 @@ static void __perf_event_read(void *info)
1527 if (ctx->task && cpuctx->task_ctx != ctx) 1542 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return; 1543 return;
1529 1544
1530 local_irq_save(flags); 1545 spin_lock(&ctx->lock);
1531 if (ctx->is_active) 1546 update_context_time(ctx);
1532 update_context_time(ctx);
1533 event->pmu->read(event);
1534 update_event_times(event); 1547 update_event_times(event);
1535 local_irq_restore(flags); 1548 spin_unlock(&ctx->lock);
1549
1550 event->pmu->read(event);
1536} 1551}
1537 1552
1538static u64 perf_event_read(struct perf_event *event) 1553static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
1545 smp_call_function_single(event->oncpu, 1560 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1); 1561 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1562 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1563 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx);
1548 update_event_times(event); 1568 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags);
1549 } 1570 }
1550 1571
1551 return atomic64_read(&event->count); 1572 return atomic64_read(&event->count);
@@ -1658,6 +1679,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1658 return ERR_PTR(err); 1679 return ERR_PTR(err);
1659} 1680}
1660 1681
1682static void perf_event_free_filter(struct perf_event *event);
1683
1661static void free_event_rcu(struct rcu_head *head) 1684static void free_event_rcu(struct rcu_head *head)
1662{ 1685{
1663 struct perf_event *event; 1686 struct perf_event *event;
@@ -1665,6 +1688,7 @@ static void free_event_rcu(struct rcu_head *head)
1665 event = container_of(head, struct perf_event, rcu_head); 1688 event = container_of(head, struct perf_event, rcu_head);
1666 if (event->ns) 1689 if (event->ns)
1667 put_pid_ns(event->ns); 1690 put_pid_ns(event->ns);
1691 perf_event_free_filter(event);
1668 kfree(event); 1692 kfree(event);
1669} 1693}
1670 1694
@@ -1696,16 +1720,10 @@ static void free_event(struct perf_event *event)
1696 call_rcu(&event->rcu_head, free_event_rcu); 1720 call_rcu(&event->rcu_head, free_event_rcu);
1697} 1721}
1698 1722
1699/* 1723int perf_event_release_kernel(struct perf_event *event)
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{ 1724{
1704 struct perf_event *event = file->private_data;
1705 struct perf_event_context *ctx = event->ctx; 1725 struct perf_event_context *ctx = event->ctx;
1706 1726
1707 file->private_data = NULL;
1708
1709 WARN_ON_ONCE(ctx->parent_ctx); 1727 WARN_ON_ONCE(ctx->parent_ctx);
1710 mutex_lock(&ctx->mutex); 1728 mutex_lock(&ctx->mutex);
1711 perf_event_remove_from_context(event); 1729 perf_event_remove_from_context(event);
@@ -1720,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file)
1720 1738
1721 return 0; 1739 return 0;
1722} 1740}
1741EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1742
1743/*
1744 * Called when the last reference to the file is gone.
1745 */
1746static int perf_release(struct inode *inode, struct file *file)
1747{
1748 struct perf_event *event = file->private_data;
1749
1750 file->private_data = NULL;
1751
1752 return perf_event_release_kernel(event);
1753}
1723 1754
1724static int perf_event_read_size(struct perf_event *event) 1755static int perf_event_read_size(struct perf_event *event)
1725{ 1756{
@@ -1746,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event)
1746 return size; 1777 return size;
1747} 1778}
1748 1779
1749static u64 perf_event_read_value(struct perf_event *event) 1780u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1750{ 1781{
1751 struct perf_event *child; 1782 struct perf_event *child;
1752 u64 total = 0; 1783 u64 total = 0;
1753 1784
1785 *enabled = 0;
1786 *running = 0;
1787
1788 mutex_lock(&event->child_mutex);
1754 total += perf_event_read(event); 1789 total += perf_event_read(event);
1755 list_for_each_entry(child, &event->child_list, child_list) 1790 *enabled += event->total_time_enabled +
1791 atomic64_read(&event->child_total_time_enabled);
1792 *running += event->total_time_running +
1793 atomic64_read(&event->child_total_time_running);
1794
1795 list_for_each_entry(child, &event->child_list, child_list) {
1756 total += perf_event_read(child); 1796 total += perf_event_read(child);
1797 *enabled += child->total_time_enabled;
1798 *running += child->total_time_running;
1799 }
1800 mutex_unlock(&event->child_mutex);
1757 1801
1758 return total; 1802 return total;
1759} 1803}
1760 1804EXPORT_SYMBOL_GPL(perf_event_read_value);
1761static int perf_event_read_entry(struct perf_event *event,
1762 u64 read_format, char __user *buf)
1763{
1764 int n = 0, count = 0;
1765 u64 values[2];
1766
1767 values[n++] = perf_event_read_value(event);
1768 if (read_format & PERF_FORMAT_ID)
1769 values[n++] = primary_event_id(event);
1770
1771 count = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, count))
1774 return -EFAULT;
1775
1776 return count;
1777}
1778 1805
1779static int perf_event_read_group(struct perf_event *event, 1806static int perf_event_read_group(struct perf_event *event,
1780 u64 read_format, char __user *buf) 1807 u64 read_format, char __user *buf)
1781{ 1808{
1782 struct perf_event *leader = event->group_leader, *sub; 1809 struct perf_event *leader = event->group_leader, *sub;
1783 int n = 0, size = 0, err = -EFAULT; 1810 int n = 0, size = 0, ret = -EFAULT;
1784 u64 values[3]; 1811 struct perf_event_context *ctx = leader->ctx;
1812 u64 values[5];
1813 u64 count, enabled, running;
1814
1815 mutex_lock(&ctx->mutex);
1816 count = perf_event_read_value(leader, &enabled, &running);
1785 1817
1786 values[n++] = 1 + leader->nr_siblings; 1818 values[n++] = 1 + leader->nr_siblings;
1787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1788 values[n++] = leader->total_time_enabled + 1820 values[n++] = enabled;
1789 atomic64_read(&leader->child_total_time_enabled); 1821 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1790 } 1822 values[n++] = running;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1823 values[n++] = count;
1792 values[n++] = leader->total_time_running + 1824 if (read_format & PERF_FORMAT_ID)
1793 atomic64_read(&leader->child_total_time_running); 1825 values[n++] = primary_event_id(leader);
1794 }
1795 1826
1796 size = n * sizeof(u64); 1827 size = n * sizeof(u64);
1797 1828
1798 if (copy_to_user(buf, values, size)) 1829 if (copy_to_user(buf, values, size))
1799 return -EFAULT; 1830 goto unlock;
1800
1801 err = perf_event_read_entry(leader, read_format, buf + size);
1802 if (err < 0)
1803 return err;
1804 1831
1805 size += err; 1832 ret = size;
1806 1833
1807 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808 err = perf_event_read_entry(sub, read_format, 1835 n = 0;
1809 buf + size); 1836
1810 if (err < 0) 1837 values[n++] = perf_event_read_value(sub, &enabled, &running);
1811 return err; 1838 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(sub);
1840
1841 size = n * sizeof(u64);
1812 1842
1813 size += err; 1843 if (copy_to_user(buf + ret, values, size)) {
1844 ret = -EFAULT;
1845 goto unlock;
1846 }
1847
1848 ret += size;
1814 } 1849 }
1850unlock:
1851 mutex_unlock(&ctx->mutex);
1815 1852
1816 return size; 1853 return ret;
1817} 1854}
1818 1855
1819static int perf_event_read_one(struct perf_event *event, 1856static int perf_event_read_one(struct perf_event *event,
1820 u64 read_format, char __user *buf) 1857 u64 read_format, char __user *buf)
1821{ 1858{
1859 u64 enabled, running;
1822 u64 values[4]; 1860 u64 values[4];
1823 int n = 0; 1861 int n = 0;
1824 1862
1825 values[n++] = perf_event_read_value(event); 1863 values[n++] = perf_event_read_value(event, &enabled, &running);
1826 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1864 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1827 values[n++] = event->total_time_enabled + 1865 values[n++] = enabled;
1828 atomic64_read(&event->child_total_time_enabled); 1866 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1829 } 1867 values[n++] = running;
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831 values[n++] = event->total_time_running +
1832 atomic64_read(&event->child_total_time_running);
1833 }
1834 if (read_format & PERF_FORMAT_ID) 1868 if (read_format & PERF_FORMAT_ID)
1835 values[n++] = primary_event_id(event); 1869 values[n++] = primary_event_id(event);
1836 1870
@@ -1861,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861 return -ENOSPC; 1895 return -ENOSPC;
1862 1896
1863 WARN_ON_ONCE(event->ctx->parent_ctx); 1897 WARN_ON_ONCE(event->ctx->parent_ctx);
1864 mutex_lock(&event->child_mutex);
1865 if (read_format & PERF_FORMAT_GROUP) 1898 if (read_format & PERF_FORMAT_GROUP)
1866 ret = perf_event_read_group(event, read_format, buf); 1899 ret = perf_event_read_group(event, read_format, buf);
1867 else 1900 else
1868 ret = perf_event_read_one(event, read_format, buf); 1901 ret = perf_event_read_one(event, read_format, buf);
1869 mutex_unlock(&event->child_mutex);
1870 1902
1871 return ret; 1903 return ret;
1872} 1904}
@@ -1974,7 +2006,8 @@ unlock:
1974 return ret; 2006 return ret;
1975} 2007}
1976 2008
1977int perf_event_set_output(struct perf_event *event, int output_fd); 2009static int perf_event_set_output(struct perf_event *event, int output_fd);
2010static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1978 2011
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2012static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{ 2013{
@@ -2002,6 +2035,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2002 case PERF_EVENT_IOC_SET_OUTPUT: 2035 case PERF_EVENT_IOC_SET_OUTPUT:
2003 return perf_event_set_output(event, arg); 2036 return perf_event_set_output(event, arg);
2004 2037
2038 case PERF_EVENT_IOC_SET_FILTER:
2039 return perf_event_set_filter(event, (void __user *)arg);
2040
2005 default: 2041 default:
2006 return -ENOTTY; 2042 return -ENOTTY;
2007 } 2043 }
@@ -2174,6 +2210,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2174 perf_mmap_free_page((unsigned long)data->user_page); 2210 perf_mmap_free_page((unsigned long)data->user_page);
2175 for (i = 0; i < data->nr_pages; i++) 2211 for (i = 0; i < data->nr_pages; i++)
2176 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2212 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2213 kfree(data);
2177} 2214}
2178 2215
2179#else 2216#else
@@ -2214,6 +2251,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2251 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215 2252
2216 vfree(base); 2253 vfree(base);
2254 kfree(data);
2217} 2255}
2218 2256
2219static void perf_mmap_data_free(struct perf_mmap_data *data) 2257static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2345,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307 } 2345 }
2308 2346
2309 if (!data->watermark) 2347 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2348 data->watermark = max_size / 2;
2311 2349
2312 2350
2313 rcu_assign_pointer(event->data, data); 2351 rcu_assign_pointer(event->data, data);
@@ -2319,7 +2357,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2319 2357
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2358 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data); 2359 perf_mmap_data_free(data);
2322 kfree(data);
2323} 2360}
2324 2361
2325static void perf_mmap_data_release(struct perf_event *event) 2362static void perf_mmap_data_release(struct perf_event *event)
@@ -2666,20 +2703,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2666static void perf_output_lock(struct perf_output_handle *handle) 2703static void perf_output_lock(struct perf_output_handle *handle)
2667{ 2704{
2668 struct perf_mmap_data *data = handle->data; 2705 struct perf_mmap_data *data = handle->data;
2669 int cpu; 2706 int cur, cpu = get_cpu();
2670 2707
2671 handle->locked = 0; 2708 handle->locked = 0;
2672 2709
2673 local_irq_save(handle->flags); 2710 for (;;) {
2674 cpu = smp_processor_id(); 2711 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2675 2712 if (cur == -1) {
2676 if (in_nmi() && atomic_read(&data->lock) == cpu) 2713 handle->locked = 1;
2677 return; 2714 break;
2715 }
2716 if (cur == cpu)
2717 break;
2678 2718
2679 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680 cpu_relax(); 2719 cpu_relax();
2681 2720 }
2682 handle->locked = 1;
2683} 2721}
2684 2722
2685static void perf_output_unlock(struct perf_output_handle *handle) 2723static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2763,7 @@ again:
2725 if (atomic_xchg(&data->wakeup, 0)) 2763 if (atomic_xchg(&data->wakeup, 0))
2726 perf_output_wakeup(handle); 2764 perf_output_wakeup(handle);
2727out: 2765out:
2728 local_irq_restore(handle->flags); 2766 put_cpu();
2729} 2767}
2730 2768
2731void perf_output_copy(struct perf_output_handle *handle, 2769void perf_output_copy(struct perf_output_handle *handle,
@@ -3236,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3236{ 3274{
3237 struct perf_event *event; 3275 struct perf_event *event;
3238 3276
3239 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240 return;
3241
3242 rcu_read_lock();
3243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3277 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244 if (perf_event_task_match(event)) 3278 if (perf_event_task_match(event))
3245 perf_event_task_output(event, task_event); 3279 perf_event_task_output(event, task_event);
3246 } 3280 }
3247 rcu_read_unlock();
3248} 3281}
3249 3282
3250static void perf_event_task_event(struct perf_task_event *task_event) 3283static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3252 struct perf_cpu_context *cpuctx; 3285 struct perf_cpu_context *cpuctx;
3253 struct perf_event_context *ctx = task_event->task_ctx; 3286 struct perf_event_context *ctx = task_event->task_ctx;
3254 3287
3288 rcu_read_lock();
3255 cpuctx = &get_cpu_var(perf_cpu_context); 3289 cpuctx = &get_cpu_var(perf_cpu_context);
3256 perf_event_task_ctx(&cpuctx->ctx, task_event); 3290 perf_event_task_ctx(&cpuctx->ctx, task_event);
3257 put_cpu_var(perf_cpu_context); 3291 put_cpu_var(perf_cpu_context);
3258 3292
3259 rcu_read_lock();
3260 if (!ctx) 3293 if (!ctx)
3261 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3262 if (ctx) 3295 if (ctx)
@@ -3348,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3348{ 3381{
3349 struct perf_event *event; 3382 struct perf_event *event;
3350 3383
3351 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352 return;
3353
3354 rcu_read_lock();
3355 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3384 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356 if (perf_event_comm_match(event)) 3385 if (perf_event_comm_match(event))
3357 perf_event_comm_output(event, comm_event); 3386 perf_event_comm_output(event, comm_event);
3358 } 3387 }
3359 rcu_read_unlock();
3360} 3388}
3361 3389
3362static void perf_event_comm_event(struct perf_comm_event *comm_event) 3390static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3367 char comm[TASK_COMM_LEN]; 3395 char comm[TASK_COMM_LEN];
3368 3396
3369 memset(comm, 0, sizeof(comm)); 3397 memset(comm, 0, sizeof(comm));
3370 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3398 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3371 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3399 size = ALIGN(strlen(comm)+1, sizeof(u64));
3372 3400
3373 comm_event->comm = comm; 3401 comm_event->comm = comm;
@@ -3375,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3375 3403
3376 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3404 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377 3405
3406 rcu_read_lock();
3378 cpuctx = &get_cpu_var(perf_cpu_context); 3407 cpuctx = &get_cpu_var(perf_cpu_context);
3379 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380 put_cpu_var(perf_cpu_context); 3409 put_cpu_var(perf_cpu_context);
3381 3410
3382 rcu_read_lock();
3383 /* 3411 /*
3384 * doesn't really matter which of the child contexts the 3412 * doesn't really matter which of the child contexts the
3385 * events ends up in. 3413 * events ends up in.
@@ -3472,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3472{ 3500{
3473 struct perf_event *event; 3501 struct perf_event *event;
3474 3502
3475 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476 return;
3477
3478 rcu_read_lock();
3479 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3503 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480 if (perf_event_mmap_match(event, mmap_event)) 3504 if (perf_event_mmap_match(event, mmap_event))
3481 perf_event_mmap_output(event, mmap_event); 3505 perf_event_mmap_output(event, mmap_event);
3482 } 3506 }
3483 rcu_read_unlock();
3484} 3507}
3485 3508
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3509static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,11 +3559,11 @@ got_name:
3536 3559
3537 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3560 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538 3561
3562 rcu_read_lock();
3539 cpuctx = &get_cpu_var(perf_cpu_context); 3563 cpuctx = &get_cpu_var(perf_cpu_context);
3540 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541 put_cpu_var(perf_cpu_context); 3565 put_cpu_var(perf_cpu_context);
3542 3566
3543 rcu_read_lock();
3544 /* 3567 /*
3545 * doesn't really matter which of the child contexts the 3568 * doesn't really matter which of the child contexts the
3546 * events ends up in. 3569 * events ends up in.
@@ -3679,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3679 perf_event_disable(event); 3702 perf_event_disable(event);
3680 } 3703 }
3681 3704
3682 perf_event_output(event, nmi, data, regs); 3705 if (event->overflow_handler)
3706 event->overflow_handler(event, nmi, data, regs);
3707 else
3708 perf_event_output(event, nmi, data, regs);
3709
3683 return ret; 3710 return ret;
3684} 3711}
3685 3712
@@ -3724,16 +3751,16 @@ again:
3724 return nr; 3751 return nr;
3725} 3752}
3726 3753
3727static void perf_swevent_overflow(struct perf_event *event, 3754static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3728 int nmi, struct perf_sample_data *data, 3755 int nmi, struct perf_sample_data *data,
3729 struct pt_regs *regs) 3756 struct pt_regs *regs)
3730{ 3757{
3731 struct hw_perf_event *hwc = &event->hw; 3758 struct hw_perf_event *hwc = &event->hw;
3732 int throttle = 0; 3759 int throttle = 0;
3733 u64 overflow;
3734 3760
3735 data->period = event->hw.last_period; 3761 data->period = event->hw.last_period;
3736 overflow = perf_swevent_set_period(event); 3762 if (!overflow)
3763 overflow = perf_swevent_set_period(event);
3737 3764
3738 if (hwc->interrupts == MAX_INTERRUPTS) 3765 if (hwc->interrupts == MAX_INTERRUPTS)
3739 return; 3766 return;
@@ -3766,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3766 3793
3767 atomic64_add(nr, &event->count); 3794 atomic64_add(nr, &event->count);
3768 3795
3796 if (!regs)
3797 return;
3798
3769 if (!hwc->sample_period) 3799 if (!hwc->sample_period)
3770 return; 3800 return;
3771 3801
3772 if (!regs) 3802 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3803 return perf_swevent_overflow(event, 1, nmi, data, regs);
3804
3805 if (atomic64_add_negative(nr, &hwc->period_left))
3773 return; 3806 return;
3774 3807
3775 if (!atomic64_add_negative(nr, &hwc->period_left)) 3808 perf_swevent_overflow(event, 0, nmi, data, regs);
3776 perf_swevent_overflow(event, nmi, data, regs);
3777} 3809}
3778 3810
3779static int perf_swevent_is_counting(struct perf_event *event) 3811static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3838,44 @@ static int perf_swevent_is_counting(struct perf_event *event)
3806 return 1; 3838 return 1;
3807} 3839}
3808 3840
3841static int perf_tp_event_match(struct perf_event *event,
3842 struct perf_sample_data *data);
3843
3844static int perf_exclude_event(struct perf_event *event,
3845 struct pt_regs *regs)
3846{
3847 if (regs) {
3848 if (event->attr.exclude_user && user_mode(regs))
3849 return 1;
3850
3851 if (event->attr.exclude_kernel && !user_mode(regs))
3852 return 1;
3853 }
3854
3855 return 0;
3856}
3857
3809static int perf_swevent_match(struct perf_event *event, 3858static int perf_swevent_match(struct perf_event *event,
3810 enum perf_type_id type, 3859 enum perf_type_id type,
3811 u32 event_id, struct pt_regs *regs) 3860 u32 event_id,
3861 struct perf_sample_data *data,
3862 struct pt_regs *regs)
3812{ 3863{
3813 if (!perf_swevent_is_counting(event)) 3864 if (!perf_swevent_is_counting(event))
3814 return 0; 3865 return 0;
3815 3866
3816 if (event->attr.type != type) 3867 if (event->attr.type != type)
3817 return 0; 3868 return 0;
3869
3818 if (event->attr.config != event_id) 3870 if (event->attr.config != event_id)
3819 return 0; 3871 return 0;
3820 3872
3821 if (regs) { 3873 if (perf_exclude_event(event, regs))
3822 if (event->attr.exclude_user && user_mode(regs)) 3874 return 0;
3823 return 0;
3824 3875
3825 if (event->attr.exclude_kernel && !user_mode(regs)) 3876 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3826 return 0; 3877 !perf_tp_event_match(event, data))
3827 } 3878 return 0;
3828 3879
3829 return 1; 3880 return 1;
3830} 3881}
@@ -3837,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3837{ 3888{
3838 struct perf_event *event; 3889 struct perf_event *event;
3839 3890
3840 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841 return;
3842
3843 rcu_read_lock();
3844 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3891 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845 if (perf_swevent_match(event, type, event_id, regs)) 3892 if (perf_swevent_match(event, type, event_id, data, regs))
3846 perf_swevent_add(event, nr, nmi, data, regs); 3893 perf_swevent_add(event, nr, nmi, data, regs);
3847 } 3894 }
3848 rcu_read_unlock();
3849} 3895}
3850 3896
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3897int perf_swevent_get_recursion_context(void)
3852{ 3898{
3899 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3900 int rctx;
3901
3853 if (in_nmi()) 3902 if (in_nmi())
3854 return &cpuctx->recursion[3]; 3903 rctx = 3;
3904 else if (in_irq())
3905 rctx = 2;
3906 else if (in_softirq())
3907 rctx = 1;
3908 else
3909 rctx = 0;
3910
3911 if (cpuctx->recursion[rctx]) {
3912 put_cpu_var(perf_cpu_context);
3913 return -1;
3914 }
3855 3915
3856 if (in_irq()) 3916 cpuctx->recursion[rctx]++;
3857 return &cpuctx->recursion[2]; 3917 barrier();
3858 3918
3859 if (in_softirq()) 3919 return rctx;
3860 return &cpuctx->recursion[1]; 3920}
3921EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3861 3922
3862 return &cpuctx->recursion[0]; 3923void perf_swevent_put_recursion_context(int rctx)
3924{
3925 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3926 barrier();
3927 cpuctx->recursion[rctx]--;
3928 put_cpu_var(perf_cpu_context);
3863} 3929}
3930EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3864 3931
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3932static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866 u64 nr, int nmi, 3933 u64 nr, int nmi,
3867 struct perf_sample_data *data, 3934 struct perf_sample_data *data,
3868 struct pt_regs *regs) 3935 struct pt_regs *regs)
3869{ 3936{
3870 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3937 struct perf_cpu_context *cpuctx;
3871 int *recursion = perf_swevent_recursion_context(cpuctx);
3872 struct perf_event_context *ctx; 3938 struct perf_event_context *ctx;
3873 3939
3874 if (*recursion) 3940 cpuctx = &__get_cpu_var(perf_cpu_context);
3875 goto out; 3941 rcu_read_lock();
3876
3877 (*recursion)++;
3878 barrier();
3879
3880 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3942 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881 nr, nmi, data, regs); 3943 nr, nmi, data, regs);
3882 rcu_read_lock();
3883 /* 3944 /*
3884 * doesn't really matter which of the child contexts the 3945 * doesn't really matter which of the child contexts the
3885 * events ends up in. 3946 * events ends up in.
@@ -3888,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3888 if (ctx) 3949 if (ctx)
3889 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3950 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890 rcu_read_unlock(); 3951 rcu_read_unlock();
3891
3892 barrier();
3893 (*recursion)--;
3894
3895out:
3896 put_cpu_var(perf_cpu_context);
3897} 3952}
3898 3953
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3954void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900 struct pt_regs *regs, u64 addr) 3955 struct pt_regs *regs, u64 addr)
3901{ 3956{
3902 struct perf_sample_data data = { 3957 struct perf_sample_data data;
3903 .addr = addr, 3958 int rctx;
3904 };
3905 3959
3906 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3960 rctx = perf_swevent_get_recursion_context();
3907 &data, regs); 3961 if (rctx < 0)
3962 return;
3963
3964 data.addr = addr;
3965 data.raw = NULL;
3966
3967 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3968
3969 perf_swevent_put_recursion_context(rctx);
3908} 3970}
3909 3971
3910static void perf_swevent_read(struct perf_event *event) 3972static void perf_swevent_read(struct perf_event *event)
@@ -3949,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3949 event->pmu->read(event); 4011 event->pmu->read(event);
3950 4012
3951 data.addr = 0; 4013 data.addr = 0;
4014 data.period = event->hw.last_period;
3952 regs = get_irq_regs(); 4015 regs = get_irq_regs();
3953 /* 4016 /*
3954 * In case we exclude kernel IPs or are somehow not in interrupt 4017 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4108,6 +4171,7 @@ static const struct pmu perf_ops_task_clock = {
4108}; 4171};
4109 4172
4110#ifdef CONFIG_EVENT_PROFILE 4173#ifdef CONFIG_EVENT_PROFILE
4174
4111void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4175void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4112 int entry_size) 4176 int entry_size)
4113{ 4177{
@@ -4126,13 +4190,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4126 if (!regs) 4190 if (!regs)
4127 regs = task_pt_regs(current); 4191 regs = task_pt_regs(current);
4128 4192
4193 /* Trace events already protected against recursion */
4129 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4194 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4130 &data, regs); 4195 &data, regs);
4131} 4196}
4132EXPORT_SYMBOL_GPL(perf_tp_event); 4197EXPORT_SYMBOL_GPL(perf_tp_event);
4133 4198
4134extern int ftrace_profile_enable(int); 4199static int perf_tp_event_match(struct perf_event *event,
4135extern void ftrace_profile_disable(int); 4200 struct perf_sample_data *data)
4201{
4202 void *record = data->raw->data;
4203
4204 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4205 return 1;
4206 return 0;
4207}
4136 4208
4137static void tp_perf_event_destroy(struct perf_event *event) 4209static void tp_perf_event_destroy(struct perf_event *event)
4138{ 4210{
@@ -4157,11 +4229,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4157 4229
4158 return &perf_ops_generic; 4230 return &perf_ops_generic;
4159} 4231}
4232
4233static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4234{
4235 char *filter_str;
4236 int ret;
4237
4238 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4239 return -EINVAL;
4240
4241 filter_str = strndup_user(arg, PAGE_SIZE);
4242 if (IS_ERR(filter_str))
4243 return PTR_ERR(filter_str);
4244
4245 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4246
4247 kfree(filter_str);
4248 return ret;
4249}
4250
4251static void perf_event_free_filter(struct perf_event *event)
4252{
4253 ftrace_profile_free_filter(event);
4254}
4255
4160#else 4256#else
4257
4258static int perf_tp_event_match(struct perf_event *event,
4259 struct perf_sample_data *data)
4260{
4261 return 1;
4262}
4263
4161static const struct pmu *tp_perf_event_init(struct perf_event *event) 4264static const struct pmu *tp_perf_event_init(struct perf_event *event)
4162{ 4265{
4163 return NULL; 4266 return NULL;
4164} 4267}
4268
4269static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4270{
4271 return -ENOENT;
4272}
4273
4274static void perf_event_free_filter(struct perf_event *event)
4275{
4276}
4277
4278#endif /* CONFIG_EVENT_PROFILE */
4279
4280#ifdef CONFIG_HAVE_HW_BREAKPOINT
4281static void bp_perf_event_destroy(struct perf_event *event)
4282{
4283 release_bp_slot(event);
4284}
4285
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{
4288 int err;
4289 /*
 4290 * The breakpoint is already filled in if we haven't created the counter
 4291 * through the perf syscall.
 4292 * FIXME: manage to get the callback set to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err)
4299 return ERR_PTR(err);
4300
4301 bp->destroy = bp_perf_event_destroy;
4302
4303 return &perf_ops_bp;
4304}
4305
4306void perf_bp_event(struct perf_event *bp, void *data)
4307{
4308 struct perf_sample_data sample;
4309 struct pt_regs *regs = data;
4310
4311 sample.addr = bp->attr.bp_addr;
4312
4313 if (!perf_exclude_event(bp, regs))
4314 perf_swevent_add(bp, 1, 1, &sample, regs);
4315}
4316#else
4317static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4318{
4319 return NULL;
4320}
4321
4322void perf_bp_event(struct perf_event *bp, void *regs)
4323{
4324}
4165#endif 4325#endif
4166 4326
4167atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4327atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4208,6 +4368,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4208 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4368 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4209 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4369 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4210 case PERF_COUNT_SW_CPU_MIGRATIONS: 4370 case PERF_COUNT_SW_CPU_MIGRATIONS:
4371 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4372 case PERF_COUNT_SW_EMULATION_FAULTS:
4211 if (!event->parent) { 4373 if (!event->parent) {
4212 atomic_inc(&perf_swevent_enabled[event_id]); 4374 atomic_inc(&perf_swevent_enabled[event_id]);
4213 event->destroy = sw_perf_event_destroy; 4375 event->destroy = sw_perf_event_destroy;
@@ -4228,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4228 struct perf_event_context *ctx, 4390 struct perf_event_context *ctx,
4229 struct perf_event *group_leader, 4391 struct perf_event *group_leader,
4230 struct perf_event *parent_event, 4392 struct perf_event *parent_event,
4393 perf_callback_t callback,
4231 gfp_t gfpflags) 4394 gfp_t gfpflags)
4232{ 4395{
4233 const struct pmu *pmu; 4396 const struct pmu *pmu;
@@ -4270,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4270 4433
4271 event->state = PERF_EVENT_STATE_INACTIVE; 4434 event->state = PERF_EVENT_STATE_INACTIVE;
4272 4435
4436 if (!callback && parent_event)
4437 callback = parent_event->callback;
4438
4439 event->callback = callback;
4440
4273 if (attr->disabled) 4441 if (attr->disabled)
4274 event->state = PERF_EVENT_STATE_OFF; 4442 event->state = PERF_EVENT_STATE_OFF;
4275 4443
@@ -4304,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4304 pmu = tp_perf_event_init(event); 4472 pmu = tp_perf_event_init(event);
4305 break; 4473 break;
4306 4474
4475 case PERF_TYPE_BREAKPOINT:
4476 pmu = bp_perf_event_init(event);
4477 break;
4478
4479
4307 default: 4480 default:
4308 break; 4481 break;
4309 } 4482 }
@@ -4416,7 +4589,7 @@ err_size:
4416 goto out; 4589 goto out;
4417} 4590}
4418 4591
4419int perf_event_set_output(struct perf_event *event, int output_fd) 4592static int perf_event_set_output(struct perf_event *event, int output_fd)
4420{ 4593{
4421 struct perf_event *output_event = NULL; 4594 struct perf_event *output_event = NULL;
4422 struct file *output_file = NULL; 4595 struct file *output_file = NULL;
@@ -4546,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open,
4546 } 4719 }
4547 4720
4548 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4721 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4549 NULL, GFP_KERNEL); 4722 NULL, NULL, GFP_KERNEL);
4550 err = PTR_ERR(event); 4723 err = PTR_ERR(event);
4551 if (IS_ERR(event)) 4724 if (IS_ERR(event))
4552 goto err_put_context; 4725 goto err_put_context;
@@ -4594,6 +4767,60 @@ err_put_context:
4594 return err; 4767 return err;
4595} 4768}
4596 4769
4770/**
4771 * perf_event_create_kernel_counter
4772 *
4773 * @attr: attributes of the counter to create
 4774 * @cpu: cpu to which the counter is bound
4775 * @pid: task to profile
4776 */
4777struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback)
4780{
4781 struct perf_event *event;
4782 struct perf_event_context *ctx;
4783 int err;
4784
4785 /*
4786 * Get the target context (task or percpu):
4787 */
4788
4789 ctx = find_get_context(pid, cpu);
4790 if (IS_ERR(ctx)) {
4791 err = PTR_ERR(ctx);
4792 goto err_exit;
4793 }
4794
4795 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL);
4797 if (IS_ERR(event)) {
4798 err = PTR_ERR(event);
4799 goto err_put_context;
4800 }
4801
4802 event->filp = NULL;
4803 WARN_ON_ONCE(ctx->parent_ctx);
4804 mutex_lock(&ctx->mutex);
4805 perf_install_in_context(ctx, event, cpu);
4806 ++ctx->generation;
4807 mutex_unlock(&ctx->mutex);
4808
4809 event->owner = current;
4810 get_task_struct(current);
4811 mutex_lock(&current->perf_event_mutex);
4812 list_add_tail(&event->owner_entry, &current->perf_event_list);
4813 mutex_unlock(&current->perf_event_mutex);
4814
4815 return event;
4816
4817 err_put_context:
4818 put_ctx(ctx);
4819 err_exit:
4820 return ERR_PTR(err);
4821}
4822EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4823
4597/* 4824/*
4598 * inherit an event from parent task to child task: 4825 * inherit an event from parent task to child task:
4599 */ 4826 */
@@ -4619,7 +4846,7 @@ inherit_event(struct perf_event *parent_event,
4619 child_event = perf_event_alloc(&parent_event->attr, 4846 child_event = perf_event_alloc(&parent_event->attr,
4620 parent_event->cpu, child_ctx, 4847 parent_event->cpu, child_ctx,
4621 group_leader, parent_event, 4848 group_leader, parent_event,
4622 GFP_KERNEL); 4849 NULL, GFP_KERNEL);
4623 if (IS_ERR(child_event)) 4850 if (IS_ERR(child_event))
4624 return child_event; 4851 return child_event;
4625 get_ctx(child_ctx); 4852 get_ctx(child_ctx);
@@ -4637,6 +4864,8 @@ inherit_event(struct perf_event *parent_event,
4637 if (parent_event->attr.freq) 4864 if (parent_event->attr.freq)
4638 child_event->hw.sample_period = parent_event->hw.sample_period; 4865 child_event->hw.sample_period = parent_event->hw.sample_period;
4639 4866
4867 child_event->overflow_handler = parent_event->overflow_handler;
4868
4640 /* 4869 /*
4641 * Link it up in the child's context: 4870 * Link it up in the child's context:
4642 */ 4871 */
@@ -4726,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4726{ 4955{
4727 struct perf_event *parent_event; 4956 struct perf_event *parent_event;
4728 4957
4729 update_event_times(child_event);
4730 perf_event_remove_from_context(child_event); 4958 perf_event_remove_from_context(child_event);
4731 4959
4732 parent_event = child_event->parent; 4960 parent_event = child_event->parent;
@@ -4778,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child)
4778 * the events from it. 5006 * the events from it.
4779 */ 5007 */
4780 unclone_ctx(child_ctx); 5008 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx);
4781 spin_unlock_irqrestore(&child_ctx->lock, flags); 5010 spin_unlock_irqrestore(&child_ctx->lock, flags);
4782 5011
4783 /* 5012 /*
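
A note on the kernel/perf_event.c changes above: perf_event_create_kernel_counter() is now exported so kernel code (the hw-breakpoint layer, for one) can set up counters without going through the syscall. The sketch below is not part of the patch; it only shows roughly how a module might call the four-argument signature visible in this diff. The names my_event and my_counter_* are invented for illustration, a NULL callback keeps the default output path, and teardown is omitted because the matching release path is outside the hunks shown here.

/*
 * Illustrative module sketch: create one cpu-bound software counter
 * through perf_event_create_kernel_counter() as added above.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *my_event;	/* hypothetical name */

static int __init my_counter_init(void)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CPU_CLOCK,
		.size	= sizeof(attr),
	};

	/* cpu 0, pid -1: bind the counter to a cpu rather than a task */
	my_event = perf_event_create_kernel_counter(&attr, 0, -1, NULL);
	if (IS_ERR(my_event))
		return PTR_ERR(my_event);

	pr_info("kernel counter installed on cpu 0\n");
	return 0;
}

static void __exit my_counter_exit(void)
{
	/* Teardown omitted: the release path is not in the hunks shown. */
}

module_init(my_counter_init);
module_exit(my_counter_exit);
MODULE_LICENSE("GPL");
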
diff --git a/kernel/signal.c b/kernel/signal.c
index fe08008133da..6b982f2cf524 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,7 +28,8 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
32 33
33#include <asm/param.h> 34#include <asm/param.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -856,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
856 struct sigqueue *q; 857 struct sigqueue *q;
857 int override_rlimit; 858 int override_rlimit;
858 859
859 trace_sched_signal_send(sig, t); 860 trace_signal_generate(sig, info, t);
860 861
861 assert_spin_locked(&t->sighand->siglock); 862 assert_spin_locked(&t->sighand->siglock);
862 863
@@ -918,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
918 break; 919 break;
919 } 920 }
920 } else if (!is_si_special(info)) { 921 } else if (!is_si_special(info)) {
921 if (sig >= SIGRTMIN && info->si_code != SI_USER) 922 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
922 /* 923 /*
923 * Queue overflow, abort. We may abort if the signal was rt 924 * Queue overflow, abort. We may abort if the
924 * and sent by user using something other than kill(). 925 * signal was rt and sent by user using something
925 */ 926 * other than kill().
927 */
928 trace_signal_overflow_fail(sig, group, info);
926 return -EAGAIN; 929 return -EAGAIN;
930 } else {
931 /*
932 * This is a silent loss of information. We still
933 * send the signal, but the *info bits are lost.
934 */
935 trace_signal_lose_info(sig, group, info);
936 }
927 } 937 }
928 938
929out_set: 939out_set:
@@ -1859,6 +1869,9 @@ relock:
1859 ka = &sighand->action[signr-1]; 1869 ka = &sighand->action[signr-1];
1860 } 1870 }
1861 1871
1872 /* Trace actually delivered signals. */
1873 trace_signal_deliver(signr, info, ka);
1874
1862 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1875 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1863 continue; 1876 continue;
1864 if (ka->sa.sa_handler != SIG_DFL) { 1877 if (ka->sa.sa_handler != SIG_DFL) {
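
The signal.c hunks above replace the old sched_signal_send tracepoint with a family of signal events (signal_generate, signal_deliver, plus the two overflow variants). As an illustration only, the small user-space program below enables two of them through the event directories that event tracing creates; the debugfs mount point is assumed to be /sys/kernel/debug, and the event directory names are assumed to follow the tracepoint names.

/* Enable the new signal trace events and stream the trace output. */
#include <stdio.h>

#define TRACE_DIR "/sys/kernel/debug/tracing"

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char buf[4096];
	FILE *pipe;

	write_str(TRACE_DIR "/events/signal/signal_generate/enable", "1");
	write_str(TRACE_DIR "/events/signal/signal_deliver/enable", "1");

	pipe = fopen(TRACE_DIR "/trace_pipe", "r");
	if (!pipe) {
		perror("trace_pipe");
		return 1;
	}
	while (fgets(buf, sizeof(buf), pipe))	/* stream until interrupted */
		fputs(buf, stdout);
	fclose(pipe);
	return 0;
}
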
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17f..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
348 symbol, i.e. one listed in /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses to variables watched through the
355 ksym tracer ftrace plugin. Depending upon the hardware, both read
356 and write accesses to the watched kernel variables can be
357 profiled.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
342 363
343config STACK_TRACER 364config STACK_TRACER
344 bool "Trace max stack" 365 bool "Trace max stack"
@@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE
428 449
429 If unsure, say N. 450 If unsure, say N.
430 451
452config KPROBE_EVENT
453 depends on KPROBES
454 depends on X86
455 bool "Enable kprobes-based dynamic events"
456 select TRACING
457 default y
458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt
461 for more details.
462
463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values.
465
466 This option is also required by the perf-probe subcommand of perf tools.
467 If you want to use perf tools, this option is strongly recommended.
468
431config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
432 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
433 depends on FUNCTION_TRACER 471 depends on FUNCTION_TRACER
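
To make the KPROBE_EVENT help above concrete: once the option is enabled, probes are defined by writing to the kprobe_events control file and then show up as ordinary trace events under the "kprobes" system. The program below is an illustration only; the probe name "myopen" is invented, the debugfs mount point is assumed, and the definition syntax is the one described in Documentation/trace/kprobetrace.txt.

/* Install and enable a kprobe event via the ftrace interface. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define TRACE_DIR "/sys/kernel/debug/tracing"

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/* Probe the entry of do_sys_open; no fetch arguments for brevity. */
	if (write_str(TRACE_DIR "/kprobe_events", "p:myopen do_sys_open\n"))
		return 1;

	/* New probes appear under events/kprobes/ (KPROBE_EVENT_SYSTEM). */
	write_str(TRACE_DIR "/events/kprobes/myopen/enable", "1\n");

	printf("probe installed; cat %s/trace to see hits\n", TRACE_DIR);
	return 0;
}
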
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2b..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 59
58libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a72c6e03deec..a1ca4956ab5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
397 int ret; 397 int ret;
398 398
399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
401 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
402 403
403 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
405 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
406 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
407 409
408 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
409 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
410 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
411 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
412 415
413 return ret; 416 return ret;
414} 417}
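
The format lines above gain a signed:%u column so user-space parsers can tell signed from unsigned fields; the value comes from the is_signed_type() helper used in the hunk (defined elsewhere in the tree), which tests whether -1 cast to the type is negative. A stand-alone illustration of the same trick; the macro below mirrors the idea rather than quoting the kernel's exact definition.

/* Probe the signedness of a type the way the new signed:%u field does. */
#include <stdio.h>

#define is_signed_type(type)	(((type)(-1)) < (type)0)

int main(void)
{
	/*
	 * Plain char is implementation-defined, which is exactly why the
	 * exported format has to state the signedness explicitly.
	 */
	printf("char               signed:%u\n", (unsigned int)is_signed_type(char));
	printf("long               signed:%u\n", (unsigned int)is_signed_type(long));
	printf("unsigned long      signed:%u\n", (unsigned int)is_signed_type(unsigned long));
	printf("unsigned long long signed:%u\n", (unsigned int)is_signed_type(unsigned long long));
	return 0;
}
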
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index acef8b4636f0..1d7f4830a80d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
104/* 129/*
105 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
106 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
209 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
211 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
212 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
213 } while (0) 239 } while (0)
214 240
@@ -364,6 +390,8 @@ int register_tracer(struct tracer *type);
364void unregister_tracer(struct tracer *type); 390void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void); 391int is_tracing_stopped(void);
366 392
393extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
394
367extern unsigned long nsecs_to_usecs(unsigned long nsecs); 395extern unsigned long nsecs_to_usecs(unsigned long nsecs);
368 396
369#ifdef CONFIG_TRACER_MAX_TRACE 397#ifdef CONFIG_TRACER_MAX_TRACE
@@ -438,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
438 struct trace_array *tr); 466 struct trace_array *tr);
439extern int trace_selftest_startup_hw_branches(struct tracer *trace, 467extern int trace_selftest_startup_hw_branches(struct tracer *trace,
440 struct trace_array *tr); 468 struct trace_array *tr);
469extern int trace_selftest_startup_ksym(struct tracer *trace,
470 struct trace_array *tr);
441#endif /* CONFIG_FTRACE_STARTUP_TEST */ 471#endif /* CONFIG_FTRACE_STARTUP_TEST */
442 472
443extern void *head_page(struct trace_array_cpu *data); 473extern void *head_page(struct trace_array_cpu *data);
@@ -683,7 +713,6 @@ struct event_filter {
683 int n_preds; 713 int n_preds;
684 struct filter_pred **preds; 714 struct filter_pred **preds;
685 char *filter_string; 715 char *filter_string;
686 bool no_reset;
687}; 716};
688 717
689struct event_subsystem { 718struct event_subsystem {
@@ -703,7 +732,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
703typedef int (*regex_match_func)(char *str, struct regex *r, int len); 732typedef int (*regex_match_func)(char *str, struct regex *r, int len);
704 733
705enum regex_type { 734enum regex_type {
706 MATCH_FULL, 735 MATCH_FULL = 0,
707 MATCH_FRONT_ONLY, 736 MATCH_FRONT_ONLY,
708 MATCH_MIDDLE_ONLY, 737 MATCH_MIDDLE_ONLY,
709 MATCH_END_ONLY, 738 MATCH_END_ONLY,
@@ -744,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
744 struct ring_buffer *buffer, 773 struct ring_buffer *buffer,
745 struct ring_buffer_event *event) 774 struct ring_buffer_event *event)
746{ 775{
747 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 776 if (unlikely(call->filter_active) &&
777 !filter_match_preds(call->filter, rec)) {
748 ring_buffer_discard_commit(buffer, event); 778 ring_buffer_discard_commit(buffer, event);
749 return 1; 779 return 1;
750 } 780 }
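
The new kprobe_trace_entry and kretprobe_trace_entry above end in a C99 flexible array member, so a record's size has to be computed from offsetof() plus the argument count, which is what the SIZEOF_*_TRACE_ENTRY() macros do. Below is a stand-alone sketch of the same sizing scheme, using a mock struct in place of the real trace_entry header.

/* Mock of the flexible-array sizing used by SIZEOF_KPROBE_TRACE_ENTRY(). */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct mock_entry {
	unsigned short type;		/* stands in for struct trace_entry */
	unsigned long ip;
	int nargs;
	unsigned long args[];		/* flexible array member */
};

#define SIZEOF_MOCK_ENTRY(n) \
	(offsetof(struct mock_entry, args) + sizeof(unsigned long) * (n))

int main(void)
{
	int i, nargs = 3;
	struct mock_entry *e = malloc(SIZEOF_MOCK_ENTRY(nargs));

	if (!e)
		return 1;
	e->nargs = nargs;
	for (i = 0; i < nargs; i++)
		e->args[i] = i;

	/* sizeof() ignores the flexible array; offsetof() sizes it exactly. */
	printf("sizeof=%zu alloc=%zu\n",
	       sizeof(struct mock_entry), SIZEOF_MOCK_ENTRY(nargs));
	free(e);
	return 0;
}
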
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 8d5c171cc998..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,17 +8,14 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16 11
17char *trace_profile_buf; 12char *perf_trace_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
19 17
20char *trace_profile_buf_nmi; 18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22 19
23/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
24static int total_profile_count; 21static int total_profile_count;
@@ -32,20 +29,20 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
32 return 0; 29 return 0;
33 30
34 if (!total_profile_count) { 31 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t); 32 buf = (char *)alloc_percpu(perf_trace_t);
36 if (!buf) 33 if (!buf)
37 goto fail_buf; 34 goto fail_buf;
38 35
39 rcu_assign_pointer(trace_profile_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
40 37
41 buf = (char *)alloc_percpu(profile_buf_t); 38 buf = (char *)alloc_percpu(perf_trace_t);
42 if (!buf) 39 if (!buf)
43 goto fail_buf_nmi; 40 goto fail_buf_nmi;
44 41
45 rcu_assign_pointer(trace_profile_buf_nmi, buf); 42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
46 } 43 }
47 44
48 ret = event->profile_enable(); 45 ret = event->profile_enable(event);
49 if (!ret) { 46 if (!ret) {
50 total_profile_count++; 47 total_profile_count++;
51 return 0; 48 return 0;
@@ -53,10 +50,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
53 50
54fail_buf_nmi: 51fail_buf_nmi:
55 if (!total_profile_count) { 52 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi); 53 free_percpu(perf_trace_buf_nmi);
57 free_percpu(trace_profile_buf); 54 free_percpu(perf_trace_buf);
58 trace_profile_buf_nmi = NULL; 55 perf_trace_buf_nmi = NULL;
59 trace_profile_buf = NULL; 56 perf_trace_buf = NULL;
60 } 57 }
61fail_buf: 58fail_buf:
62 atomic_dec(&event->profile_count); 59 atomic_dec(&event->profile_count);
@@ -89,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
89 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (!atomic_add_negative(-1, &event->profile_count))
90 return; 87 return;
91 88
92 event->profile_disable(); 89 event->profile_disable(event);
93 90
94 if (!--total_profile_count) { 91 if (!--total_profile_count) {
95 buf = trace_profile_buf; 92 buf = perf_trace_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL); 93 rcu_assign_pointer(perf_trace_buf, NULL);
97 94
98 nmi_buf = trace_profile_buf_nmi; 95 nmi_buf = perf_trace_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL); 96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
100 97
101 /* 98 /*
102 * Ensure all events in profiling have finished before 99 * Ensure all events in profiling have finished before
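
The rename above also replaces the old dummy struct with a plain array typedef because alloc_percpu() takes a type, not a size. Below is a minimal sketch of that pattern, with invented names and an arbitrary buffer size; it is not kernel code from the patch.

/* Sketch of the alloc_percpu()-takes-a-type pattern used for perf_trace_buf. */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/errno.h>

#define MY_BUF_SIZE 8192			/* illustrative size */

typedef char my_buf_t[MY_BUF_SIZE];		/* named array type for alloc_percpu() */

static char *my_buf;

static int __init my_buf_init(void)
{
	my_buf = (char *)alloc_percpu(my_buf_t);
	return my_buf ? 0 : -ENOMEM;
}

static void __exit my_buf_exit(void)
{
	free_percpu(my_buf);
}

module_init(my_buf_init);
module_exit(my_buf_exit);
MODULE_LICENSE("GPL");
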
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5e9ffc33f6db..1d18315dc836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields); 94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 95
96#ifdef CONFIG_MODULES 96void trace_destroy_fields(struct ftrace_event_call *call)
97
98static void trace_destroy_fields(struct ftrace_event_call *call)
99{ 97{
100 struct ftrace_event_field *field, *next; 98 struct ftrace_event_field *field, *next;
101 99
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 105 }
108} 106}
109 107
110#endif /* CONFIG_MODULES */
111
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 108static void ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 109 int enable)
114{ 110{
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
117 if (call->enabled) { 113 if (call->enabled) {
118 call->enabled = 0; 114 call->enabled = 0;
119 tracing_stop_cmdline_record(); 115 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 116 call->unregfunc(call);
121 } 117 }
122 break; 118 break;
123 case 1: 119 case 1:
124 if (!call->enabled) { 120 if (!call->enabled) {
125 call->enabled = 1; 121 call->enabled = 1;
126 tracing_start_cmdline_record(); 122 tracing_start_cmdline_record();
127 call->regfunc(call->data); 123 call->regfunc(call);
128 } 124 }
129 break; 125 break;
130 } 126 }
@@ -507,7 +503,7 @@ extern char *__bad_type_size(void);
507#define FIELD(type, name) \ 503#define FIELD(type, name) \
508 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 504 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
509 #type, "common_" #name, offsetof(typeof(field), name), \ 505 #type, "common_" #name, offsetof(typeof(field), name), \
510 sizeof(field.name) 506 sizeof(field.name), is_signed_type(type)
511 507
512static int trace_write_header(struct trace_seq *s) 508static int trace_write_header(struct trace_seq *s)
513{ 509{
@@ -515,17 +511,17 @@ static int trace_write_header(struct trace_seq *s)
515 511
516 /* struct trace_entry */ 512 /* struct trace_entry */
517 return trace_seq_printf(s, 513 return trace_seq_printf(s,
518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 514 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 515 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 516 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 517 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
523 "\n", 519 "\n",
524 FIELD(unsigned short, type), 520 FIELD(unsigned short, type),
525 FIELD(unsigned char, flags), 521 FIELD(unsigned char, flags),
526 FIELD(unsigned char, preempt_count), 522 FIELD(unsigned char, preempt_count),
527 FIELD(int, pid), 523 FIELD(int, pid),
528 FIELD(int, lock_depth)); 524 FIELD(int, lock_depth));
529} 525}
530 526
531static ssize_t 527static ssize_t
@@ -937,27 +933,46 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
937 return 0; 933 return 0;
938} 934}
939 935
940#define for_each_event(event, start, end) \ 936static int __trace_add_event_call(struct ftrace_event_call *call)
941 for (event = start; \ 937{
942 (unsigned long)event < (unsigned long)end; \ 938 struct dentry *d_events;
943 event++) 939 int ret;
944 940
945#ifdef CONFIG_MODULES 941 if (!call->name)
942 return -EINVAL;
946 943
947static LIST_HEAD(ftrace_module_file_list); 944 if (call->raw_init) {
945 ret = call->raw_init(call);
946 if (ret < 0) {
947 if (ret != -ENOSYS)
948 pr_warning("Could not initialize trace "
949 "events/%s\n", call->name);
950 return ret;
951 }
952 }
948 953
949/* 954 d_events = event_trace_events_dir();
950 * Modules must own their file_operations to keep up with 955 if (!d_events)
951 * reference counting. 956 return -ENOENT;
952 */ 957
953struct ftrace_module_file_ops { 958 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
954 struct list_head list; 959 &ftrace_enable_fops, &ftrace_event_filter_fops,
955 struct module *mod; 960 &ftrace_event_format_fops);
956 struct file_operations id; 961 if (!ret)
957 struct file_operations enable; 962 list_add(&call->list, &ftrace_events);
958 struct file_operations format; 963
959 struct file_operations filter; 964 return ret;
960}; 965}
966
967/* Add an additional event_call dynamically */
968int trace_add_event_call(struct ftrace_event_call *call)
969{
970 int ret;
971 mutex_lock(&event_mutex);
972 ret = __trace_add_event_call(call);
973 mutex_unlock(&event_mutex);
974 return ret;
975}
961 976
962static void remove_subsystem_dir(const char *name) 977static void remove_subsystem_dir(const char *name)
963{ 978{
@@ -985,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
985 } 1000 }
986} 1001}
987 1002
1003/*
 1004 * Must be called with both event_mutex and trace_event_mutex held.
1005 */
1006static void __trace_remove_event_call(struct ftrace_event_call *call)
1007{
1008 ftrace_event_enable_disable(call, 0);
1009 if (call->event)
1010 __unregister_ftrace_event(call->event);
1011 debugfs_remove_recursive(call->dir);
1012 list_del(&call->list);
1013 trace_destroy_fields(call);
1014 destroy_preds(call);
1015 remove_subsystem_dir(call->system);
1016}
1017
1018/* Remove an event_call */
1019void trace_remove_event_call(struct ftrace_event_call *call)
1020{
1021 mutex_lock(&event_mutex);
1022 down_write(&trace_event_mutex);
1023 __trace_remove_event_call(call);
1024 up_write(&trace_event_mutex);
1025 mutex_unlock(&event_mutex);
1026}
1027
1028#define for_each_event(event, start, end) \
1029 for (event = start; \
1030 (unsigned long)event < (unsigned long)end; \
1031 event++)
1032
1033#ifdef CONFIG_MODULES
1034
1035static LIST_HEAD(ftrace_module_file_list);
1036
1037/*
1038 * Modules must own their file_operations to keep up with
1039 * reference counting.
1040 */
1041struct ftrace_module_file_ops {
1042 struct list_head list;
1043 struct module *mod;
1044 struct file_operations id;
1045 struct file_operations enable;
1046 struct file_operations format;
1047 struct file_operations filter;
1048};
1049
988static struct ftrace_module_file_ops * 1050static struct ftrace_module_file_ops *
989trace_create_file_ops(struct module *mod) 1051trace_create_file_ops(struct module *mod)
990{ 1052{
@@ -1042,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
1042 if (!call->name) 1104 if (!call->name)
1043 continue; 1105 continue;
1044 if (call->raw_init) { 1106 if (call->raw_init) {
1045 ret = call->raw_init(); 1107 ret = call->raw_init(call);
1046 if (ret < 0) { 1108 if (ret < 0) {
1047 if (ret != -ENOSYS) 1109 if (ret != -ENOSYS)
1048 pr_warning("Could not initialize trace " 1110 pr_warning("Could not initialize trace "
@@ -1060,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
1060 return; 1122 return;
1061 } 1123 }
1062 call->mod = mod; 1124 call->mod = mod;
1063 list_add(&call->list, &ftrace_events); 1125 ret = event_create_dir(call, d_events,
1064 event_create_dir(call, d_events, 1126 &file_ops->id, &file_ops->enable,
1065 &file_ops->id, &file_ops->enable, 1127 &file_ops->filter, &file_ops->format);
1066 &file_ops->filter, &file_ops->format); 1128 if (!ret)
1129 list_add(&call->list, &ftrace_events);
1067 } 1130 }
1068} 1131}
1069 1132
@@ -1077,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1140 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1078 if (call->mod == mod) { 1141 if (call->mod == mod) {
1079 found = true; 1142 found = true;
1080 ftrace_event_enable_disable(call, 0); 1143 __trace_remove_event_call(call);
1081 if (call->event)
1082 __unregister_ftrace_event(call->event);
1083 debugfs_remove_recursive(call->dir);
1084 list_del(&call->list);
1085 trace_destroy_fields(call);
1086 destroy_preds(call);
1087 remove_subsystem_dir(call->system);
1088 } 1144 }
1089 } 1145 }
1090 1146
@@ -1202,7 +1258,7 @@ static __init int event_trace_init(void)
1202 if (!call->name) 1258 if (!call->name)
1203 continue; 1259 continue;
1204 if (call->raw_init) { 1260 if (call->raw_init) {
1205 ret = call->raw_init(); 1261 ret = call->raw_init(call);
1206 if (ret < 0) { 1262 if (ret < 0) {
1207 if (ret != -ENOSYS) 1263 if (ret != -ENOSYS)
1208 pr_warning("Could not initialize trace " 1264 pr_warning("Could not initialize trace "
@@ -1210,10 +1266,12 @@ static __init int event_trace_init(void)
1210 continue; 1266 continue;
1211 } 1267 }
1212 } 1268 }
1213 list_add(&call->list, &ftrace_events); 1269 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1214 event_create_dir(call, d_events, &ftrace_event_id_fops, 1270 &ftrace_enable_fops,
1215 &ftrace_enable_fops, &ftrace_event_filter_fops, 1271 &ftrace_event_filter_fops,
1216 &ftrace_event_format_fops); 1272 &ftrace_event_format_fops);
1273 if (!ret)
1274 list_add(&call->list, &ftrace_events);
1217 } 1275 }
1218 1276
1219 while (true) { 1277 while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 92672016da28..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
24 25
25#include "trace.h" 26#include "trace.h"
26#include "trace_output.h" 27#include "trace_output.h"
@@ -29,6 +30,7 @@ enum filter_op_ids
29{ 30{
30 OP_OR, 31 OP_OR,
31 OP_AND, 32 OP_AND,
33 OP_GLOB,
32 OP_NE, 34 OP_NE,
33 OP_EQ, 35 OP_EQ,
34 OP_LT, 36 OP_LT,
@@ -46,16 +48,17 @@ struct filter_op {
46}; 48};
47 49
48static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
49 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
50 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
51 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
52 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
53 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
54 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
55 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
56 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
57 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
58 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
59}; 62};
60 63
61enum { 64enum {
@@ -329,22 +332,18 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
329 return type; 332 return type;
330} 333}
331 334
332static int filter_build_regex(struct filter_pred *pred) 335static void filter_build_regex(struct filter_pred *pred)
333{ 336{
334 struct regex *r = &pred->regex; 337 struct regex *r = &pred->regex;
335 char *search, *dup; 338 char *search;
336 enum regex_type type; 339 enum regex_type type = MATCH_FULL;
337 int not; 340 int not = 0;
338 341
339 type = filter_parse_regex(r->pattern, r->len, &search, &not); 342 if (pred->op == OP_GLOB) {
340 dup = kstrdup(search, GFP_KERNEL); 343 type = filter_parse_regex(r->pattern, r->len, &search, &not);
341 if (!dup) 344 r->len = strlen(search);
342 return -ENOMEM; 345 memmove(r->pattern, search, r->len+1);
343 346 }
344 strcpy(r->pattern, dup);
345 kfree(dup);
346
347 r->len = strlen(r->pattern);
348 347
349 switch (type) { 348 switch (type) {
350 case MATCH_FULL: 349 case MATCH_FULL:
@@ -362,14 +361,11 @@ static int filter_build_regex(struct filter_pred *pred)
362 } 361 }
363 362
364 pred->not ^= not; 363 pred->not ^= not;
365
366 return 0;
367} 364}
368 365
369/* return 1 if event matches, 0 otherwise (discard) */ 366/* return 1 if event matches, 0 otherwise (discard) */
370int filter_match_preds(struct ftrace_event_call *call, void *rec) 367int filter_match_preds(struct event_filter *filter, void *rec)
371{ 368{
372 struct event_filter *filter = call->filter;
373 int match, top = 0, val1 = 0, val2 = 0; 369 int match, top = 0, val1 = 0, val2 = 0;
374 int stack[MAX_FILTER_PRED]; 370 int stack[MAX_FILTER_PRED];
375 struct filter_pred *pred; 371 struct filter_pred *pred;
@@ -542,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
542 filter->preds[i]->fn = filter_pred_none; 538 filter->preds[i]->fn = filter_pred_none;
543} 539}
544 540
545void destroy_preds(struct ftrace_event_call *call) 541static void __free_preds(struct event_filter *filter)
546{ 542{
547 struct event_filter *filter = call->filter;
548 int i; 543 int i;
549 544
550 if (!filter) 545 if (!filter)
@@ -557,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
557 kfree(filter->preds); 552 kfree(filter->preds);
558 kfree(filter->filter_string); 553 kfree(filter->filter_string);
559 kfree(filter); 554 kfree(filter);
555}
556
557void destroy_preds(struct ftrace_event_call *call)
558{
559 __free_preds(call->filter);
560 call->filter = NULL; 560 call->filter = NULL;
561 call->filter_active = 0;
561} 562}
562 563
563static int init_preds(struct ftrace_event_call *call) 564static struct event_filter *__alloc_preds(void)
564{ 565{
565 struct event_filter *filter; 566 struct event_filter *filter;
566 struct filter_pred *pred; 567 struct filter_pred *pred;
567 int i; 568 int i;
568 569
569 if (call->filter) 570 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
570 return 0; 571 if (!filter)
571 572 return ERR_PTR(-ENOMEM);
572 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
573 if (!call->filter)
574 return -ENOMEM;
575 573
576 filter->n_preds = 0; 574 filter->n_preds = 0;
577 575
@@ -587,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
587 filter->preds[i] = pred; 585 filter->preds[i] = pred;
588 } 586 }
589 587
590 return 0; 588 return filter;
591 589
592oom: 590oom:
593 destroy_preds(call); 591 __free_preds(filter);
592 return ERR_PTR(-ENOMEM);
593}
594
595static int init_preds(struct ftrace_event_call *call)
596{
597 if (call->filter)
598 return 0;
594 599
595 return -ENOMEM; 600 call->filter_active = 0;
601 call->filter = __alloc_preds();
602 if (IS_ERR(call->filter))
603 return PTR_ERR(call->filter);
604
605 return 0;
596} 606}
597 607
598static int init_subsystem_preds(struct event_subsystem *system) 608static int init_subsystem_preds(struct event_subsystem *system)
@@ -615,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
615 return 0; 625 return 0;
616} 626}
617 627
618enum { 628static void filter_free_subsystem_preds(struct event_subsystem *system)
619 FILTER_DISABLE_ALL,
620 FILTER_INIT_NO_RESET,
621 FILTER_SKIP_NO_RESET,
622};
623
624static void filter_free_subsystem_preds(struct event_subsystem *system,
625 int flag)
626{ 629{
627 struct ftrace_event_call *call; 630 struct ftrace_event_call *call;
628 631
@@ -633,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
633 if (strcmp(call->system, system->name) != 0) 636 if (strcmp(call->system, system->name) != 0)
634 continue; 637 continue;
635 638
636 if (flag == FILTER_INIT_NO_RESET) {
637 call->filter->no_reset = false;
638 continue;
639 }
640
641 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
642 continue;
643
644 filter_disable_preds(call); 639 filter_disable_preds(call);
645 remove_filter_string(call->filter); 640 remove_filter_string(call->filter);
646 } 641 }
@@ -648,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
648 643
649static int filter_add_pred_fn(struct filter_parse_state *ps, 644static int filter_add_pred_fn(struct filter_parse_state *ps,
650 struct ftrace_event_call *call, 645 struct ftrace_event_call *call,
646 struct event_filter *filter,
651 struct filter_pred *pred, 647 struct filter_pred *pred,
652 filter_pred_fn_t fn) 648 filter_pred_fn_t fn)
653{ 649{
654 struct event_filter *filter = call->filter;
655 int idx, err; 650 int idx, err;
656 651
657 if (filter->n_preds == MAX_FILTER_PRED) { 652 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -666,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
666 return err; 661 return err;
667 662
668 filter->n_preds++; 663 filter->n_preds++;
669 call->filter_active = 1;
670 664
671 return 0; 665 return 0;
672} 666}
@@ -691,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
691 685
692static int is_legal_op(struct ftrace_event_field *field, int op) 686static int is_legal_op(struct ftrace_event_field *field, int op)
693{ 687{
694 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 688 if (is_string_field(field) &&
689 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
690 return 0;
691 if (!is_string_field(field) && op == OP_GLOB)
695 return 0; 692 return 0;
696 693
697 return 1; 694 return 1;
@@ -742,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
742 739
743static int filter_add_pred(struct filter_parse_state *ps, 740static int filter_add_pred(struct filter_parse_state *ps,
744 struct ftrace_event_call *call, 741 struct ftrace_event_call *call,
742 struct event_filter *filter,
745 struct filter_pred *pred, 743 struct filter_pred *pred,
746 bool dry_run) 744 bool dry_run)
747{ 745{
@@ -776,15 +774,13 @@ static int filter_add_pred(struct filter_parse_state *ps,
776 } 774 }
777 775
778 if (is_string_field(field)) { 776 if (is_string_field(field)) {
779 ret = filter_build_regex(pred); 777 filter_build_regex(pred);
780 if (ret)
781 return ret;
782 778
783 if (field->filter_type == FILTER_STATIC_STRING) { 779 if (field->filter_type == FILTER_STATIC_STRING) {
784 fn = filter_pred_string; 780 fn = filter_pred_string;
785 pred->regex.field_len = field->size; 781 pred->regex.field_len = field->size;
786 } else if (field->filter_type == FILTER_DYN_STRING) 782 } else if (field->filter_type == FILTER_DYN_STRING)
787 fn = filter_pred_strloc; 783 fn = filter_pred_strloc;
788 else { 784 else {
789 fn = filter_pred_pchar; 785 fn = filter_pred_pchar;
790 pred->regex.field_len = strlen(pred->regex.pattern); 786 pred->regex.field_len = strlen(pred->regex.pattern);
@@ -813,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
813 809
814add_pred_fn: 810add_pred_fn:
815 if (!dry_run) 811 if (!dry_run)
816 return filter_add_pred_fn(ps, call, pred, fn); 812 return filter_add_pred_fn(ps, call, filter, pred, fn);
817 return 0;
818}
819
820static int filter_add_subsystem_pred(struct filter_parse_state *ps,
821 struct event_subsystem *system,
822 struct filter_pred *pred,
823 char *filter_string,
824 bool dry_run)
825{
826 struct ftrace_event_call *call;
827 int err = 0;
828 bool fail = true;
829
830 list_for_each_entry(call, &ftrace_events, list) {
831
832 if (!call->define_fields)
833 continue;
834
835 if (strcmp(call->system, system->name))
836 continue;
837
838 if (call->filter->no_reset)
839 continue;
840
841 err = filter_add_pred(ps, call, pred, dry_run);
842 if (err)
843 call->filter->no_reset = true;
844 else
845 fail = false;
846
847 if (!dry_run)
848 replace_filter_string(call->filter, filter_string);
849 }
850
851 if (fail) {
852 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
853 return err;
854 }
855 return 0; 813 return 0;
856} 814}
857 815
@@ -1209,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
1209 return 0; 1167 return 0;
1210} 1168}
1211 1169
1212static int replace_preds(struct event_subsystem *system, 1170static int replace_preds(struct ftrace_event_call *call,
1213 struct ftrace_event_call *call, 1171 struct event_filter *filter,
1214 struct filter_parse_state *ps, 1172 struct filter_parse_state *ps,
1215 char *filter_string, 1173 char *filter_string,
1216 bool dry_run) 1174 bool dry_run)
@@ -1257,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
1257add_pred: 1215add_pred:
1258 if (!pred) 1216 if (!pred)
1259 return -ENOMEM; 1217 return -ENOMEM;
1260 if (call) 1218 err = filter_add_pred(ps, call, filter, pred, dry_run);
1261 err = filter_add_pred(ps, call, pred, false);
1262 else
1263 err = filter_add_subsystem_pred(ps, system, pred,
1264 filter_string, dry_run);
1265 filter_free_pred(pred); 1219 filter_free_pred(pred);
1266 if (err) 1220 if (err)
1267 return err; 1221 return err;
@@ -1272,10 +1226,50 @@ add_pred:
1272 return 0; 1226 return 0;
1273} 1227}
1274 1228
1275int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1229static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps,
1231 char *filter_string)
1276{ 1232{
1233 struct ftrace_event_call *call;
1234 bool fail = true;
1277 int err; 1235 int err;
1278 1236
1237 list_for_each_entry(call, &ftrace_events, list) {
1238 struct event_filter *filter = call->filter;
1239
1240 if (!call->define_fields)
1241 continue;
1242
1243 if (strcmp(call->system, system->name) != 0)
1244 continue;
1245
1246 /* try to see if the filter can be applied */
1247 err = replace_preds(call, filter, ps, filter_string, true);
1248 if (err)
1249 continue;
1250
1251 /* really apply the filter */
1252 filter_disable_preds(call);
1253 err = replace_preds(call, filter, ps, filter_string, false);
1254 if (err)
1255 filter_disable_preds(call);
1256 else {
1257 call->filter_active = 1;
1258 replace_filter_string(filter, filter_string);
1259 }
1260 fail = false;
1261 }
1262
1263 if (fail) {
1264 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1265 return -EINVAL;
1266 }
1267 return 0;
1268}
1269
1270int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1271{
1272 int err;
1279 struct filter_parse_state *ps; 1273 struct filter_parse_state *ps;
1280 1274
1281 mutex_lock(&event_mutex); 1275 mutex_lock(&event_mutex);
@@ -1287,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1287 if (!strcmp(strstrip(filter_string), "0")) { 1281 if (!strcmp(strstrip(filter_string), "0")) {
1288 filter_disable_preds(call); 1282 filter_disable_preds(call);
1289 remove_filter_string(call->filter); 1283 remove_filter_string(call->filter);
1290 mutex_unlock(&event_mutex); 1284 goto out_unlock;
1291 return 0;
1292 } 1285 }
1293 1286
1294 err = -ENOMEM; 1287 err = -ENOMEM;
@@ -1306,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 goto out; 1299 goto out;
1307 } 1300 }
1308 1301
1309 err = replace_preds(NULL, call, ps, filter_string, false); 1302 err = replace_preds(call, call->filter, ps, filter_string, false);
1310 if (err) 1303 if (err)
1311 append_filter_err(ps, call->filter); 1304 append_filter_err(ps, call->filter);
1312 1305 else
1306 call->filter_active = 1;
1313out: 1307out:
1314 filter_opstack_clear(ps); 1308 filter_opstack_clear(ps);
1315 postfix_clear(ps); 1309 postfix_clear(ps);
@@ -1324,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1324 char *filter_string) 1318 char *filter_string)
1325{ 1319{
1326 int err; 1320 int err;
1327
1328 struct filter_parse_state *ps; 1321 struct filter_parse_state *ps;
1329 1322
1330 mutex_lock(&event_mutex); 1323 mutex_lock(&event_mutex);
@@ -1334,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1334 goto out_unlock; 1327 goto out_unlock;
1335 1328
1336 if (!strcmp(strstrip(filter_string), "0")) { 1329 if (!strcmp(strstrip(filter_string), "0")) {
1337 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1330 filter_free_subsystem_preds(system);
1338 remove_filter_string(system->filter); 1331 remove_filter_string(system->filter);
1339 mutex_unlock(&event_mutex); 1332 goto out_unlock;
1340 return 0;
1341 } 1333 }
1342 1334
1343 err = -ENOMEM; 1335 err = -ENOMEM;
@@ -1354,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 goto out; 1346 goto out;
1355 } 1347 }
1356 1348
1357 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1349 err = replace_system_preds(system, ps, filter_string);
1358 1350 if (err)
1359 /* try to see the filter can be applied to which events */
1360 err = replace_preds(system, NULL, ps, filter_string, true);
1361 if (err) {
1362 append_filter_err(ps, system->filter); 1351 append_filter_err(ps, system->filter);
1363 goto out; 1352
1353out:
1354 filter_opstack_clear(ps);
1355 postfix_clear(ps);
1356 kfree(ps);
1357out_unlock:
1358 mutex_unlock(&event_mutex);
1359
1360 return err;
1361}
1362
1363#ifdef CONFIG_EVENT_PROFILE
1364
1365void ftrace_profile_free_filter(struct perf_event *event)
1366{
1367 struct event_filter *filter = event->filter;
1368
1369 event->filter = NULL;
1370 __free_preds(filter);
1371}
1372
1373int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1374 char *filter_str)
1375{
1376 int err;
1377 struct event_filter *filter;
1378 struct filter_parse_state *ps;
1379 struct ftrace_event_call *call = NULL;
1380
1381 mutex_lock(&event_mutex);
1382
1383 list_for_each_entry(call, &ftrace_events, list) {
1384 if (call->id == event_id)
1385 break;
1364 } 1386 }
1365 1387
1366 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1388 err = -EINVAL;
1389 if (!call)
1390 goto out_unlock;
1367 1391
1368 /* really apply the filter to the events */ 1392 err = -EEXIST;
1369 err = replace_preds(system, NULL, ps, filter_string, false); 1393 if (event->filter)
1370 if (err) { 1394 goto out_unlock;
1371 append_filter_err(ps, system->filter); 1395
1372 filter_free_subsystem_preds(system, 2); 1396 filter = __alloc_preds();
1397 if (IS_ERR(filter)) {
1398 err = PTR_ERR(filter);
1399 goto out_unlock;
1373 } 1400 }
1374 1401
1375out: 1402 err = -ENOMEM;
1403 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1404 if (!ps)
1405 goto free_preds;
1406
1407 parse_init(ps, filter_ops, filter_str);
1408 err = filter_parse(ps);
1409 if (err)
1410 goto free_ps;
1411
1412 err = replace_preds(call, filter, ps, filter_str, false);
1413 if (!err)
1414 event->filter = filter;
1415
1416free_ps:
1376 filter_opstack_clear(ps); 1417 filter_opstack_clear(ps);
1377 postfix_clear(ps); 1418 postfix_clear(ps);
1378 kfree(ps); 1419 kfree(ps);
1420
1421free_preds:
1422 if (err)
1423 __free_preds(filter);
1424
1379out_unlock: 1425out_unlock:
1380 mutex_unlock(&event_mutex); 1426 mutex_unlock(&event_mutex);
1381 1427
1382 return err; 1428 return err;
1383} 1429}
1384 1430
1431#endif /* CONFIG_EVENT_PROFILE */
1432
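
The filter changes above do two user-visible things: string fields can now be matched with the '~' glob operator, and a perf tracepoint counter can carry its own filter via ftrace_profile_set_filter(), which is reached from user space through the PERF_EVENT_IOC_SET_FILTER ioctl handled by perf_event_set_filter() earlier in this diff. The user-space program below is an illustration only: it assumes an exported linux/perf_event.h and takes the event id file and the filter expression (which may use '~') on the command line rather than hard-coding field names.

/*
 * Count a tracepoint for the current task with a filter attached, e.g.:
 *   ./tp_filter /sys/kernel/debug/tracing/events/sched/sched_switch/id \
 *               'next_comm ~ "kthread*"'
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(int argc, char **argv)
{
	struct perf_event_attr attr;
	unsigned long long id, count;
	FILE *f;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <event-id-file> <filter>\n", argv[0]);
		return 1;
	}

	f = fopen(argv[1], "r");
	if (!f || fscanf(f, "%llu", &id) != 1) {
		perror(argv[1]);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = id;				/* tracepoint id from debugfs */

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);	/* this task, any cpu */
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, argv[2]) < 0) {
		perror("PERF_EVENT_IOC_SET_FILTER");
		return 1;
	}

	sleep(1);					/* let some events accumulate */
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("filtered count: %llu\n", count);
	close(fd);
	return 0;
}
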
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index c74848ddb85a..dff8c84ddf17 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -66,44 +66,47 @@ static void __always_unused ____ftrace_check_##name(void) \
66#undef __field 66#undef __field
67#define __field(type, item) \ 67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
71 sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \ 72 if (!ret) \
73 return 0; 73 return 0;
74 74
75#undef __field_desc 75#undef __field_desc
76#define __field_desc(type, container, item) \ 76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \ 78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \ 79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \ 80 sizeof(field.container.item), \
81 is_signed_type(type)); \
81 if (!ret) \ 82 if (!ret) \
82 return 0; 83 return 0;
83 84
84#undef __array 85#undef __array
85#define __array(type, item, len) \ 86#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
88 offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
89 sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
90 if (!ret) \ 91 if (!ret) \
91 return 0; 92 return 0;
92 93
93#undef __array_desc 94#undef __array_desc
94#define __array_desc(type, container, item, len) \ 95#define __array_desc(type, container, item, len) \
95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
96 "offset:%zu;\tsize:%zu;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
97 offsetof(typeof(field), container.item), \ 98 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \ 99 sizeof(field.container.item), \
100 is_signed_type(type)); \
99 if (!ret) \ 101 if (!ret) \
100 return 0; 102 return 0;
101 103
102#undef __dynamic_array 104#undef __dynamic_array
103#define __dynamic_array(type, item) \ 105#define __dynamic_array(type, item) \
104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \ 107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
106 offsetof(typeof(field), item)); \ 108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
107 if (!ret) \ 110 if (!ret) \
108 return 0; 111 return 0;
109 112
@@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
131 134
132#include "trace_entries.h" 135#include "trace_entries.h"
133 136
134
135#undef __field 137#undef __field
136#define __field(type, item) \ 138#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
@@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
193 195
194#include "trace_entries.h" 196#include "trace_entries.h"
195 197
198static int ftrace_raw_init_event(struct ftrace_event_call *call)
199{
200 INIT_LIST_HEAD(&call->fields);
201 return 0;
202}
196 203
197#undef __field 204#undef __field
198#define __field(type, item) 205#define __field(type, item)
@@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
211 218
212#undef FTRACE_ENTRY 219#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \ 221 \
216struct ftrace_event_call __used \ 222struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \ 223__attribute__((__aligned__(4))) \
@@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \ 225 .name = #call, \
220 .id = type, \ 226 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \ 227 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \ 228 .raw_init = ftrace_raw_init_event, \
223 .show_format = ftrace_format_##call, \ 229 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \ 230 .define_fields = ftrace_define_fields_##call, \
225}; \ 231}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
231 232
232#include "trace_entries.h" 233#include "trace_entries.h"
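The hunks above add a "signed:%u" attribute to every field line emitted by the ftrace_format_##name() helpers, so readers of the event format files can tell whether a field needs sign extension. As an illustration only (the real definition lives in the tracing headers, not in this diff), is_signed_type() can be expected to reduce to a compile-time comparison along these lines, and a format line then carries the extra attribute as sketched; the offsets and sizes shown are made up for the example.

/* Sketch: a plausible is_signed_type() implementation, not taken from this diff. */
#define is_signed_type(type)	(((type)(-1)) < 0)

/*
 * Example of the resulting format output (illustrative offsets/sizes):
 *   field:unsigned long ip;	offset:8;	size:8;	signed:0;
 *   field:int nargs;		offset:16;	size:4;	signed:1;
 */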
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..aff5f80b59b8
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1523 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
60struct fetch_func {
61 unsigned long (*func)(struct pt_regs *, void *);
62 void *data;
63};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
112struct symbol_cache {
113 char *symbol;
114 long offset;
115 unsigned long addr;
116};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp);
277
278static DEFINE_MUTEX(probe_lock);
279static LIST_HEAD(probe_list);
280
281static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs);
284
285/*
286 * Allocate new trace_probe and initialize it (including kprobes).
287 */
288static struct trace_probe *alloc_trace_probe(const char *group,
289 const char *event,
290 void *addr,
291 const char *symbol,
292 unsigned long offs,
293 int nargs, int is_return)
294{
295 struct trace_probe *tp;
296
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp)
299 return ERR_PTR(-ENOMEM);
300
301 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL);
303 if (!tp->symbol)
304 goto error;
305 tp->rp.kp.symbol_name = tp->symbol;
306 tp->rp.kp.offset = offs;
307 } else
308 tp->rp.kp.addr = addr;
309
310 if (is_return)
311 tp->rp.handler = kretprobe_dispatcher;
312 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher;
314
315 if (!event)
316 goto error;
317 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name)
319 goto error;
320
321 if (!group)
322 goto error;
323 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system)
325 goto error;
326
327 INIT_LIST_HEAD(&tp->list);
328 return tp;
329error:
330 kfree(tp->call.name);
331 kfree(tp->symbol);
332 kfree(tp);
333 return ERR_PTR(-ENOMEM);
334}
335
336static void free_probe_arg(struct probe_arg *arg)
337{
338 if (arg->fetch.func == fetch_symbol)
339 free_symbol_cache(arg->fetch.data);
340 else if (arg->fetch.func == fetch_indirect)
341 free_indirect_fetch_data(arg->fetch.data);
342 kfree(arg->name);
343}
344
345static void free_trace_probe(struct trace_probe *tp)
346{
347 int i;
348
349 for (i = 0; i < tp->nr_args; i++)
350 free_probe_arg(&tp->args[i]);
351
352 kfree(tp->call.system);
353 kfree(tp->call.name);
354 kfree(tp->symbol);
355 kfree(tp);
356}
357
358static struct trace_probe *find_probe_event(const char *event,
359 const char *group)
360{
361 struct trace_probe *tp;
362
363 list_for_each_entry(tp, &probe_list, list)
364 if (strcmp(tp->call.name, event) == 0 &&
365 strcmp(tp->call.system, group) == 0)
366 return tp;
367 return NULL;
368}
369
370/* Unregister a trace_probe and probe_event: call with locking probe_lock */
371static void unregister_trace_probe(struct trace_probe *tp)
372{
373 if (probe_is_return(tp))
374 unregister_kretprobe(&tp->rp);
375 else
376 unregister_kprobe(&tp->rp.kp);
377 list_del(&tp->list);
378 unregister_probe_event(tp);
379}
380
381/* Register a trace_probe and probe_event */
382static int register_trace_probe(struct trace_probe *tp)
383{
384 struct trace_probe *old_tp;
385 int ret;
386
387 mutex_lock(&probe_lock);
388
389 /* register as an event */
390 old_tp = find_probe_event(tp->call.name, tp->call.system);
391 if (old_tp) {
392 /* delete old event */
393 unregister_trace_probe(old_tp);
394 free_trace_probe(old_tp);
395 }
396 ret = register_probe_event(tp);
397 if (ret) {
398 pr_warning("Faild to register probe event(%d)\n", ret);
399 goto end;
400 }
401
402 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
403 if (probe_is_return(tp))
404 ret = register_kretprobe(&tp->rp);
405 else
406 ret = register_kprobe(&tp->rp.kp);
407
408 if (ret) {
409 pr_warning("Could not insert probe(%d)\n", ret);
410 if (ret == -EILSEQ) {
411 pr_warning("Probing address(0x%p) is not an "
412 "instruction boundary.\n",
413 tp->rp.kp.addr);
414 ret = -EINVAL;
415 }
416 unregister_probe_event(tp);
417 } else
418 list_add_tail(&tp->list, &probe_list);
419end:
420 mutex_unlock(&probe_lock);
421 return ret;
422}
423
424/* Split symbol and offset. */
425static int split_symbol_offset(char *symbol, unsigned long *offset)
426{
427 char *tmp;
428 int ret;
429
430 if (!offset)
431 return -EINVAL;
432
433 tmp = strchr(symbol, '+');
434 if (tmp) {
435 /* skip sign because strict_strtol doesn't accept '+' */
436 ret = strict_strtoul(tmp + 1, 0, offset);
437 if (ret)
438 return ret;
439 *tmp = '\0';
440 } else
441 *offset = 0;
442 return 0;
443}
444
445#define PARAM_MAX_ARGS 16
446#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
447
448static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
449{
450 int ret = 0;
451 unsigned long param;
452
453 if (strcmp(arg, "retval") == 0) {
454 if (is_return) {
455 ff->func = fetch_retvalue;
456 ff->data = NULL;
457 } else
458 ret = -EINVAL;
459 } else if (strncmp(arg, "stack", 5) == 0) {
460 if (arg[5] == '\0') {
461 ff->func = fetch_stack_address;
462 ff->data = NULL;
463 } else if (isdigit(arg[5])) {
464 ret = strict_strtoul(arg + 5, 10, &param);
465 if (ret || param > PARAM_MAX_STACK)
466 ret = -EINVAL;
467 else {
468 ff->func = fetch_stack;
469 ff->data = (void *)param;
470 }
471 } else
472 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else
482 ret = -EINVAL;
483 return ret;
484}
485
486/* Recursive argument parser */
487static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
488{
489 int ret = 0;
490 unsigned long param;
491 long offset;
492 char *tmp;
493
494 switch (arg[0]) {
495 case '$':
496 ret = parse_probe_vars(arg + 1, ff, is_return);
497 break;
498 case '%': /* named register */
499 ret = regs_query_register_offset(arg + 1);
500 if (ret >= 0) {
501 ff->func = fetch_register;
502 ff->data = (void *)(unsigned long)ret;
503 ret = 0;
504 }
505 break;
506 case '@': /* memory or symbol */
507 if (isdigit(arg[1])) {
508 ret = strict_strtoul(arg + 1, 0, &param);
509 if (ret)
510 break;
511 ff->func = fetch_memory;
512 ff->data = (void *)param;
513 } else {
514 ret = split_symbol_offset(arg + 1, &offset);
515 if (ret)
516 break;
517 ff->data = alloc_symbol_cache(arg + 1, offset);
518 if (ff->data)
519 ff->func = fetch_symbol;
520 else
521 ret = -EINVAL;
522 }
523 break;
524 case '+': /* indirect memory */
525 case '-':
526 tmp = strchr(arg, '(');
527 if (!tmp) {
528 ret = -EINVAL;
529 break;
530 }
531 *tmp = '\0';
532 ret = strict_strtol(arg + 1, 0, &offset);
533 if (ret)
534 break;
535 if (arg[0] == '-')
536 offset = -offset;
537 arg = tmp + 1;
538 tmp = strrchr(arg, ')');
539 if (tmp) {
540 struct indirect_fetch_data *id;
541 *tmp = '\0';
542 id = kzalloc(sizeof(struct indirect_fetch_data),
543 GFP_KERNEL);
544 if (!id)
545 return -ENOMEM;
546 id->offset = offset;
547 ret = __parse_probe_arg(arg, &id->orig, is_return);
548 if (ret)
549 kfree(id);
550 else {
551 ff->func = fetch_indirect;
552 ff->data = (void *)id;
553 }
554 } else
555 ret = -EINVAL;
556 break;
557 default:
558 /* TODO: support custom handler */
559 ret = -EINVAL;
560 }
561 return ret;
562}
563
564/* String length checking wrapper */
565static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
566{
567 if (strlen(arg) > MAX_ARGSTR_LEN) {
568 pr_info("Argument is too long.: %s\n", arg);
569 return -ENOSPC;
570 }
571 return __parse_probe_arg(arg, ff, is_return);
572}
573
574/* Return 1 if name is reserved or already used by another argument */
575static int conflict_field_name(const char *name,
576 struct probe_arg *args, int narg)
577{
578 int i;
579 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
580 if (strcmp(reserved_field_names[i], name) == 0)
581 return 1;
582 for (i = 0; i < narg; i++)
583 if (strcmp(args[i].name, name) == 0)
584 return 1;
585 return 0;
586}
587
588static int create_trace_probe(int argc, char **argv)
589{
590 /*
591 * Argument syntax:
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args:
595 * $argN : fetch Nth of function argument. (N:0-)
596 * $retval : fetch return value
597 * $stack : fetch stack address
598 * $stackN : fetch Nth of stack (N:0-)
599 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
600 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
601 * %REG : fetch register REG
602 * Indirect memory fetch:
603 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
604 * Alias name of args:
605 * NAME=FETCHARG : set NAME as alias of FETCHARG.
606 */
607 struct trace_probe *tp;
608 int i, ret = 0;
609 int is_return = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0;
612 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN];
614
615 if (argc < 2) {
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p')
621 is_return = 0;
622 else if (argv[0][0] == 'r')
623 is_return = 1;
624 else {
625 pr_info("Probe definition must be started with 'p' or 'r'.\n");
626 return -EINVAL;
627 }
628
629 if (argv[0][1] == ':') {
630 event = &argv[0][2];
631 if (strchr(event, '/')) {
632 group = event;
633 event = strchr(group, '/') + 1;
634 event[-1] = '\0';
635 if (strlen(group) == 0) {
636 pr_info("Group name is not specifiled\n");
637 return -EINVAL;
638 }
639 }
640 if (strlen(event) == 0) {
641 pr_info("Event name is not specifiled\n");
642 return -EINVAL;
643 }
644 }
645
646 if (isdigit(argv[1][0])) {
647 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n");
649 return -EINVAL;
650 }
651 /* an address specified */
652 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
653 if (ret) {
654 pr_info("Failed to parse address.\n");
655 return ret;
656 }
657 } else {
658 /* a symbol specified */
659 symbol = argv[1];
660 /* TODO: support .init module functions */
661 ret = split_symbol_offset(symbol, &offset);
662 if (ret) {
663 pr_info("Failed to parse symbol.\n");
664 return ret;
665 }
666 if (offset && is_return) {
667 pr_info("Return probe must be used without offset.\n");
668 return -EINVAL;
669 }
670 }
671 argc -= 2; argv += 2;
672
673 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) {
677 /* Make a new event name */
678 if (symbol)
679 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
680 is_return ? 'r' : 'p', symbol, offset);
681 else
682 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
683 is_return ? 'r' : 'p', addr);
684 event = buf;
685 }
686 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
687 is_return);
688 if (IS_ERR(tp)) {
689 pr_info("Failed to allocate trace_probe.(%d)\n",
690 (int)PTR_ERR(tp));
691 return PTR_ERR(tp);
692 }
693
694 /* parse arguments */
695 ret = 0;
696 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
697 /* Parse argument name */
698 arg = strchr(argv[i], '=');
699 if (arg)
700 *arg++ = '\0';
701 else
702 arg = argv[i];
703
704 if (conflict_field_name(argv[i], tp->args, i)) {
705 pr_info("Argument%d name '%s' conflicts with "
706 "another field.\n", i, argv[i]);
707 ret = -EINVAL;
708 goto error;
709 }
710
711 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
712 if (!tp->args[i].name) {
713 pr_info("Failed to allocate argument%d name '%s'.\n",
714 i, argv[i]);
715 ret = -ENOMEM;
716 goto error;
717 }
718
719 /* Parse fetch argument */
720 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
721 if (ret) {
722 pr_info("Parse error at argument%d. (%d)\n", i, ret);
723 kfree(tp->args[i].name);
724 goto error;
725 }
726
727 tp->nr_args++;
728 }
729
730 ret = register_trace_probe(tp);
731 if (ret)
732 goto error;
733 return 0;
734
735error:
736 free_trace_probe(tp);
737 return ret;
738}
739
740static void cleanup_all_probes(void)
741{
742 struct trace_probe *tp;
743
744 mutex_lock(&probe_lock);
745 /* TODO: Use batch unregistration */
746 while (!list_empty(&probe_list)) {
747 tp = list_entry(probe_list.next, struct trace_probe, list);
748 unregister_trace_probe(tp);
749 free_trace_probe(tp);
750 }
751 mutex_unlock(&probe_lock);
752}
753
754
755/* Probes listing interfaces */
756static void *probes_seq_start(struct seq_file *m, loff_t *pos)
757{
758 mutex_lock(&probe_lock);
759 return seq_list_start(&probe_list, *pos);
760}
761
762static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
763{
764 return seq_list_next(v, &probe_list, pos);
765}
766
767static void probes_seq_stop(struct seq_file *m, void *v)
768{
769 mutex_unlock(&probe_lock);
770}
771
772static int probes_seq_show(struct seq_file *m, void *v)
773{
774 struct trace_probe *tp = v;
775 int i, ret;
776 char buf[MAX_ARGSTR_LEN + 1];
777
778 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
779 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
780
781 if (!tp->symbol)
782 seq_printf(m, " 0x%p", tp->rp.kp.addr);
783 else if (tp->rp.kp.offset)
784 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
785 else
786 seq_printf(m, " %s", probe_symbol(tp));
787
788 for (i = 0; i < tp->nr_args; i++) {
789 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
790 if (ret < 0) {
791 pr_warning("Argument%d decoding error(%d).\n", i, ret);
792 return ret;
793 }
794 seq_printf(m, " %s=%s", tp->args[i].name, buf);
795 }
796 seq_printf(m, "\n");
797 return 0;
798}
799
800static const struct seq_operations probes_seq_op = {
801 .start = probes_seq_start,
802 .next = probes_seq_next,
803 .stop = probes_seq_stop,
804 .show = probes_seq_show
805};
806
807static int probes_open(struct inode *inode, struct file *file)
808{
809 if ((file->f_mode & FMODE_WRITE) &&
810 (file->f_flags & O_TRUNC))
811 cleanup_all_probes();
812
813 return seq_open(file, &probes_seq_op);
814}
815
816static int command_trace_probe(const char *buf)
817{
818 char **argv;
819 int argc = 0, ret = 0;
820
821 argv = argv_split(GFP_KERNEL, buf, &argc);
822 if (!argv)
823 return -ENOMEM;
824
825 if (argc)
826 ret = create_trace_probe(argc, argv);
827
828 argv_free(argv);
829 return ret;
830}
831
832#define WRITE_BUFSIZE 128
833
834static ssize_t probes_write(struct file *file, const char __user *buffer,
835 size_t count, loff_t *ppos)
836{
837 char *kbuf, *tmp;
838 int ret;
839 size_t done;
840 size_t size;
841
842 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
843 if (!kbuf)
844 return -ENOMEM;
845
846 ret = done = 0;
847 while (done < count) {
848 size = count - done;
849 if (size >= WRITE_BUFSIZE)
850 size = WRITE_BUFSIZE - 1;
851 if (copy_from_user(kbuf, buffer + done, size)) {
852 ret = -EFAULT;
853 goto out;
854 }
855 kbuf[size] = '\0';
856 tmp = strchr(kbuf, '\n');
857 if (tmp) {
858 *tmp = '\0';
859 size = tmp - kbuf + 1;
860 } else if (done + size < count) {
861 pr_warning("Line length is too long: "
862 "Should be less than %d.", WRITE_BUFSIZE);
863 ret = -EINVAL;
864 goto out;
865 }
866 done += size;
867 /* Remove comments */
868 tmp = strchr(kbuf, '#');
869 if (tmp)
870 *tmp = '\0';
871
872 ret = command_trace_probe(kbuf);
873 if (ret)
874 goto out;
875 }
876 ret = done;
877out:
878 kfree(kbuf);
879 return ret;
880}
881
882static const struct file_operations kprobe_events_ops = {
883 .owner = THIS_MODULE,
884 .open = probes_open,
885 .read = seq_read,
886 .llseek = seq_lseek,
887 .release = seq_release,
888 .write = probes_write,
889};
890
891/* Probes profiling interfaces */
892static int probes_profile_seq_show(struct seq_file *m, void *v)
893{
894 struct trace_probe *tp = v;
895
896 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
897 tp->rp.kp.nmissed);
898
899 return 0;
900}
901
902static const struct seq_operations profile_seq_op = {
903 .start = probes_seq_start,
904 .next = probes_seq_next,
905 .stop = probes_seq_stop,
906 .show = probes_profile_seq_show
907};
908
909static int profile_open(struct inode *inode, struct file *file)
910{
911 return seq_open(file, &profile_seq_op);
912}
913
914static const struct file_operations kprobe_profile_ops = {
915 .owner = THIS_MODULE,
916 .open = profile_open,
917 .read = seq_read,
918 .llseek = seq_lseek,
919 .release = seq_release,
920};
921
922/* Kprobe handler */
923static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
924{
925 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
926 struct kprobe_trace_entry *entry;
927 struct ring_buffer_event *event;
928 struct ring_buffer *buffer;
929 int size, i, pc;
930 unsigned long irq_flags;
931 struct ftrace_event_call *call = &tp->call;
932
933 tp->nhit++;
934
935 local_save_flags(irq_flags);
936 pc = preempt_count();
937
938 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
939
940 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
941 irq_flags, pc);
942 if (!event)
943 return 0;
944
945 entry = ring_buffer_event_data(event);
946 entry->nargs = tp->nr_args;
947 entry->ip = (unsigned long)kp->addr;
948 for (i = 0; i < tp->nr_args; i++)
949 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
950
951 if (!filter_current_check_discard(buffer, call, entry, event))
952 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
953 return 0;
954}
955
956/* Kretprobe handler */
957static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
958 struct pt_regs *regs)
959{
960 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
961 struct kretprobe_trace_entry *entry;
962 struct ring_buffer_event *event;
963 struct ring_buffer *buffer;
964 int size, i, pc;
965 unsigned long irq_flags;
966 struct ftrace_event_call *call = &tp->call;
967
968 local_save_flags(irq_flags);
969 pc = preempt_count();
970
971 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
972
973 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
974 irq_flags, pc);
975 if (!event)
976 return 0;
977
978 entry = ring_buffer_event_data(event);
979 entry->nargs = tp->nr_args;
980 entry->func = (unsigned long)tp->rp.kp.addr;
981 entry->ret_ip = (unsigned long)ri->ret_addr;
982 for (i = 0; i < tp->nr_args; i++)
983 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
984
985 if (!filter_current_check_discard(buffer, call, entry, event))
986 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
987
988 return 0;
989}
990
991/* Event entry printers */
992enum print_line_t
993print_kprobe_event(struct trace_iterator *iter, int flags)
994{
995 struct kprobe_trace_entry *field;
996 struct trace_seq *s = &iter->seq;
997 struct trace_event *event;
998 struct trace_probe *tp;
999 int i;
1000
1001 field = (struct kprobe_trace_entry *)iter->ent;
1002 event = ftrace_find_event(field->ent.type);
1003 tp = container_of(event, struct trace_probe, event);
1004
1005 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1006 goto partial;
1007
1008 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1009 goto partial;
1010
1011 if (!trace_seq_puts(s, ")"))
1012 goto partial;
1013
1014 for (i = 0; i < field->nargs; i++)
1015 if (!trace_seq_printf(s, " %s=%lx",
1016 tp->args[i].name, field->args[i]))
1017 goto partial;
1018
1019 if (!trace_seq_puts(s, "\n"))
1020 goto partial;
1021
1022 return TRACE_TYPE_HANDLED;
1023partial:
1024 return TRACE_TYPE_PARTIAL_LINE;
1025}
1026
1027enum print_line_t
1028print_kretprobe_event(struct trace_iterator *iter, int flags)
1029{
1030 struct kretprobe_trace_entry *field;
1031 struct trace_seq *s = &iter->seq;
1032 struct trace_event *event;
1033 struct trace_probe *tp;
1034 int i;
1035
1036 field = (struct kretprobe_trace_entry *)iter->ent;
1037 event = ftrace_find_event(field->ent.type);
1038 tp = container_of(event, struct trace_probe, event);
1039
1040 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1041 goto partial;
1042
1043 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1044 goto partial;
1045
1046 if (!trace_seq_puts(s, " <- "))
1047 goto partial;
1048
1049 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1050 goto partial;
1051
1052 if (!trace_seq_puts(s, ")"))
1053 goto partial;
1054
1055 for (i = 0; i < field->nargs; i++)
1056 if (!trace_seq_printf(s, " %s=%lx",
1057 tp->args[i].name, field->args[i]))
1058 goto partial;
1059
1060 if (!trace_seq_puts(s, "\n"))
1061 goto partial;
1062
1063 return TRACE_TYPE_HANDLED;
1064partial:
1065 return TRACE_TYPE_PARTIAL_LINE;
1066}
1067
1068static int probe_event_enable(struct ftrace_event_call *call)
1069{
1070 struct trace_probe *tp = (struct trace_probe *)call->data;
1071
1072 tp->flags |= TP_FLAG_TRACE;
1073 if (probe_is_return(tp))
1074 return enable_kretprobe(&tp->rp);
1075 else
1076 return enable_kprobe(&tp->rp.kp);
1077}
1078
1079static void probe_event_disable(struct ftrace_event_call *call)
1080{
1081 struct trace_probe *tp = (struct trace_probe *)call->data;
1082
1083 tp->flags &= ~TP_FLAG_TRACE;
1084 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1085 if (probe_is_return(tp))
1086 disable_kretprobe(&tp->rp);
1087 else
1088 disable_kprobe(&tp->rp.kp);
1089 }
1090}
1091
1092static int probe_event_raw_init(struct ftrace_event_call *event_call)
1093{
1094 INIT_LIST_HEAD(&event_call->fields);
1095
1096 return 0;
1097}
1098
1099#undef DEFINE_FIELD
1100#define DEFINE_FIELD(type, item, name, is_signed) \
1101 do { \
1102 ret = trace_define_field(event_call, #type, name, \
1103 offsetof(typeof(field), item), \
1104 sizeof(field.item), is_signed, \
1105 FILTER_OTHER); \
1106 if (ret) \
1107 return ret; \
1108 } while (0)
1109
1110static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1111{
1112 int ret, i;
1113 struct kprobe_trace_entry field;
1114 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1115
1116 ret = trace_define_common_fields(event_call);
 1117 if (ret)
1118 return ret;
1119
1120 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1121 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1122 /* Set argument names as fields */
1123 for (i = 0; i < tp->nr_args; i++)
1124 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1125 return 0;
1126}
1127
1128static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1129{
1130 int ret, i;
1131 struct kretprobe_trace_entry field;
1132 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1133
1134 ret = trace_define_common_fields(event_call);
 1135 if (ret)
1136 return ret;
1137
1138 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1139 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1141 /* Set argument names as fields */
1142 for (i = 0; i < tp->nr_args; i++)
1143 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1144 return 0;
1145}
1146
1147static int __probe_event_show_format(struct trace_seq *s,
1148 struct trace_probe *tp, const char *fmt,
1149 const char *arg)
1150{
1151 int i;
1152
1153 /* Show format */
1154 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1155 return 0;
1156
1157 for (i = 0; i < tp->nr_args; i++)
1158 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1159 return 0;
1160
1161 if (!trace_seq_printf(s, "\", %s", arg))
1162 return 0;
1163
1164 for (i = 0; i < tp->nr_args; i++)
1165 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1166 return 0;
1167
1168 return trace_seq_puts(s, "\n");
1169}
1170
1171#undef SHOW_FIELD
1172#define SHOW_FIELD(type, item, name) \
1173 do { \
1174 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1175 "offset:%u;\tsize:%u;\n", name, \
1176 (unsigned int)offsetof(typeof(field), item),\
1177 (unsigned int)sizeof(type)); \
1178 if (!ret) \
1179 return 0; \
1180 } while (0)
1181
1182static int kprobe_event_show_format(struct ftrace_event_call *call,
1183 struct trace_seq *s)
1184{
1185 struct kprobe_trace_entry field __attribute__((unused));
1186 int ret, i;
1187 struct trace_probe *tp = (struct trace_probe *)call->data;
1188
1189 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1190 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1191
1192 /* Show fields */
1193 for (i = 0; i < tp->nr_args; i++)
1194 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1195 trace_seq_puts(s, "\n");
1196
1197 return __probe_event_show_format(s, tp, "(%lx)",
1198 "REC->" FIELD_STRING_IP);
1199}
1200
1201static int kretprobe_event_show_format(struct ftrace_event_call *call,
1202 struct trace_seq *s)
1203{
1204 struct kretprobe_trace_entry field __attribute__((unused));
1205 int ret, i;
1206 struct trace_probe *tp = (struct trace_probe *)call->data;
1207
1208 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1209 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1210 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1211
1212 /* Show fields */
1213 for (i = 0; i < tp->nr_args; i++)
1214 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1215 trace_seq_puts(s, "\n");
1216
1217 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1218 "REC->" FIELD_STRING_FUNC
1219 ", REC->" FIELD_STRING_RETIP);
1220}
1221
1222#ifdef CONFIG_EVENT_PROFILE
1223
1224/* Kprobe profile handler */
1225static __kprobes int kprobe_profile_func(struct kprobe *kp,
1226 struct pt_regs *regs)
1227{
1228 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1229 struct ftrace_event_call *call = &tp->call;
1230 struct kprobe_trace_entry *entry;
1231 struct trace_entry *ent;
1232 int size, __size, i, pc, __cpu;
1233 unsigned long irq_flags;
1234 char *trace_buf;
1235 char *raw_data;
1236 int rctx;
1237
1238 pc = preempt_count();
1239 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1240 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1241 size -= sizeof(u32);
1242 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1243 "profile buffer not large enough"))
1244 return 0;
1245
1246 /*
1247 * Protect the non nmi buffer
1248 * This also protects the rcu read side
1249 */
1250 local_irq_save(irq_flags);
1251
1252 rctx = perf_swevent_get_recursion_context();
1253 if (rctx < 0)
1254 goto end_recursion;
1255
1256 __cpu = smp_processor_id();
1257
1258 if (in_nmi())
1259 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1260 else
1261 trace_buf = rcu_dereference(perf_trace_buf);
1262
1263 if (!trace_buf)
1264 goto end;
1265
1266 raw_data = per_cpu_ptr(trace_buf, __cpu);
1267
1268 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1269 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1270 entry = (struct kprobe_trace_entry *)raw_data;
1271 ent = &entry->ent;
1272
1273 tracing_generic_entry_update(ent, irq_flags, pc);
1274 ent->type = call->id;
1275 entry->nargs = tp->nr_args;
1276 entry->ip = (unsigned long)kp->addr;
1277 for (i = 0; i < tp->nr_args; i++)
1278 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1279 perf_tp_event(call->id, entry->ip, 1, entry, size);
1280
1281end:
1282 perf_swevent_put_recursion_context(rctx);
1283end_recursion:
1284 local_irq_restore(irq_flags);
1285
1286 return 0;
1287}
1288
1289/* Kretprobe profile handler */
1290static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1291 struct pt_regs *regs)
1292{
1293 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1294 struct ftrace_event_call *call = &tp->call;
1295 struct kretprobe_trace_entry *entry;
1296 struct trace_entry *ent;
1297 int size, __size, i, pc, __cpu;
1298 unsigned long irq_flags;
1299 char *trace_buf;
1300 char *raw_data;
1301 int rctx;
1302
1303 pc = preempt_count();
1304 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1305 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1306 size -= sizeof(u32);
1307 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1308 "profile buffer not large enough"))
1309 return 0;
1310
1311 /*
1312 * Protect the non nmi buffer
1313 * This also protects the rcu read side
1314 */
1315 local_irq_save(irq_flags);
1316
1317 rctx = perf_swevent_get_recursion_context();
1318 if (rctx < 0)
1319 goto end_recursion;
1320
1321 __cpu = smp_processor_id();
1322
1323 if (in_nmi())
1324 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1325 else
1326 trace_buf = rcu_dereference(perf_trace_buf);
1327
1328 if (!trace_buf)
1329 goto end;
1330
1331 raw_data = per_cpu_ptr(trace_buf, __cpu);
1332
1333 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1334 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1335 entry = (struct kretprobe_trace_entry *)raw_data;
1336 ent = &entry->ent;
1337
1338 tracing_generic_entry_update(ent, irq_flags, pc);
1339 ent->type = call->id;
1340 entry->nargs = tp->nr_args;
1341 entry->func = (unsigned long)tp->rp.kp.addr;
1342 entry->ret_ip = (unsigned long)ri->ret_addr;
1343 for (i = 0; i < tp->nr_args; i++)
1344 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1345 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1346
1347end:
1348 perf_swevent_put_recursion_context(rctx);
1349end_recursion:
1350 local_irq_restore(irq_flags);
1351
1352 return 0;
1353}
1354
1355static int probe_profile_enable(struct ftrace_event_call *call)
1356{
1357 struct trace_probe *tp = (struct trace_probe *)call->data;
1358
1359 tp->flags |= TP_FLAG_PROFILE;
1360
1361 if (probe_is_return(tp))
1362 return enable_kretprobe(&tp->rp);
1363 else
1364 return enable_kprobe(&tp->rp.kp);
1365}
1366
1367static void probe_profile_disable(struct ftrace_event_call *call)
1368{
1369 struct trace_probe *tp = (struct trace_probe *)call->data;
1370
1371 tp->flags &= ~TP_FLAG_PROFILE;
1372
1373 if (!(tp->flags & TP_FLAG_TRACE)) {
1374 if (probe_is_return(tp))
1375 disable_kretprobe(&tp->rp);
1376 else
1377 disable_kprobe(&tp->rp.kp);
1378 }
1379}
1380#endif /* CONFIG_EVENT_PROFILE */
1381
1382
1383static __kprobes
1384int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1385{
1386 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1387
1388 if (tp->flags & TP_FLAG_TRACE)
1389 kprobe_trace_func(kp, regs);
1390#ifdef CONFIG_EVENT_PROFILE
1391 if (tp->flags & TP_FLAG_PROFILE)
1392 kprobe_profile_func(kp, regs);
1393#endif /* CONFIG_EVENT_PROFILE */
 1394 return 0; /* We don't tweak the kernel, so just return 0 */
1395}
1396
1397static __kprobes
1398int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1399{
1400 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1401
1402 if (tp->flags & TP_FLAG_TRACE)
1403 kretprobe_trace_func(ri, regs);
1404#ifdef CONFIG_EVENT_PROFILE
1405 if (tp->flags & TP_FLAG_PROFILE)
1406 kretprobe_profile_func(ri, regs);
1407#endif /* CONFIG_EVENT_PROFILE */
 1408 return 0; /* We don't tweak the kernel, so just return 0 */
1409}
1410
1411static int register_probe_event(struct trace_probe *tp)
1412{
1413 struct ftrace_event_call *call = &tp->call;
1414 int ret;
1415
1416 /* Initialize ftrace_event_call */
1417 if (probe_is_return(tp)) {
1418 tp->event.trace = print_kretprobe_event;
1419 call->raw_init = probe_event_raw_init;
1420 call->show_format = kretprobe_event_show_format;
1421 call->define_fields = kretprobe_event_define_fields;
1422 } else {
1423 tp->event.trace = print_kprobe_event;
1424 call->raw_init = probe_event_raw_init;
1425 call->show_format = kprobe_event_show_format;
1426 call->define_fields = kprobe_event_define_fields;
1427 }
1428 call->event = &tp->event;
1429 call->id = register_ftrace_event(&tp->event);
1430 if (!call->id)
1431 return -ENODEV;
1432 call->enabled = 0;
1433 call->regfunc = probe_event_enable;
1434 call->unregfunc = probe_event_disable;
1435
1436#ifdef CONFIG_EVENT_PROFILE
1437 atomic_set(&call->profile_count, -1);
1438 call->profile_enable = probe_profile_enable;
1439 call->profile_disable = probe_profile_disable;
1440#endif
1441 call->data = tp;
1442 ret = trace_add_event_call(call);
1443 if (ret) {
1444 pr_info("Failed to register kprobe event: %s\n", call->name);
1445 unregister_ftrace_event(&tp->event);
1446 }
1447 return ret;
1448}
1449
1450static void unregister_probe_event(struct trace_probe *tp)
1451{
1452 /* tp->event is unregistered in trace_remove_event_call() */
1453 trace_remove_event_call(&tp->call);
1454}
1455
 1456/* Make a debugfs interface for controlling probe points */
1457static __init int init_kprobe_trace(void)
1458{
1459 struct dentry *d_tracer;
1460 struct dentry *entry;
1461
1462 d_tracer = tracing_init_dentry();
1463 if (!d_tracer)
1464 return 0;
1465
1466 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1467 NULL, &kprobe_events_ops);
1468
1469 /* Event list interface */
1470 if (!entry)
1471 pr_warning("Could not create debugfs "
1472 "'kprobe_events' entry\n");
1473
1474 /* Profile interface */
1475 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1476 NULL, &kprobe_profile_ops);
1477
1478 if (!entry)
1479 pr_warning("Could not create debugfs "
1480 "'kprobe_profile' entry\n");
1481 return 0;
1482}
1483fs_initcall(init_kprobe_trace);
1484
1485
1486#ifdef CONFIG_FTRACE_STARTUP_TEST
1487
1488static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1489 int a4, int a5, int a6)
1490{
1491 return a1 + a2 + a3 + a4 + a5 + a6;
1492}
1493
1494static __init int kprobe_trace_self_tests_init(void)
1495{
1496 int ret;
1497 int (*target)(int, int, int, int, int, int);
1498
1499 target = kprobe_trace_selftest_target;
1500
1501 pr_info("Testing kprobe tracing: ");
1502
1503 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1504 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1505 if (WARN_ON_ONCE(ret))
1506 pr_warning("error enabling function entry\n");
1507
1508 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1509 "$retval");
1510 if (WARN_ON_ONCE(ret))
1511 pr_warning("error enabling function return\n");
1512
1513 ret = target(1, 2, 3, 4, 5, 6);
1514
1515 cleanup_all_probes();
1516
1517 pr_cont("OK\n");
1518 return 0;
1519}
1520
1521late_initcall(kprobe_trace_self_tests_init);
1522
1523#endif
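The new trace_kprobe.c above is controlled through the kprobe_events debugfs file created in init_kprobe_trace(), using the definition syntax documented in create_trace_probe(). Below is a minimal user-space sketch of driving that interface; the event name "myprobe", the probed symbol do_sys_open, the fetch arguments and the /sys/kernel/debug mount point are illustrative assumptions, not part of the patch.

/* Sketch only: define and enable a kprobe event through the files added above. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Syntax per create_trace_probe(): p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] */
    const char *def = "p:myprobe do_sys_open $arg0 $arg1\n";
    int fd;

    fd = open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND);
    if (fd < 0)
        return 1;
    if (write(fd, def, strlen(def)) < 0) {
        close(fd);
        return 1;
    }
    close(fd);

    /* The event then shows up under events/kprobes/ and is enabled as usual. */
    fd = open("/sys/kernel/debug/tracing/events/kprobes/myprobe/enable", O_WRONLY);
    if (fd >= 0) {
        write(fd, "1", 1);
        close(fd);
    }
    return 0;
}

Each hit is rendered by print_kprobe_event() as a line of the form "myprobe: (do_sys_open+0x0/0x...) arg0=... arg1=...", and opening kprobe_events with O_TRUNC (a plain shell '>' redirect) removes all existing probes, as probes_open() shows.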
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..ddfa0fd43bc0
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,550 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
 36 * For now, restrict the number of symbols traced simultaneously to the
 37 * number of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter;
48#endif
49 struct hlist_node ksym_hlist;
50};
51
52static struct trace_array *ksym_trace_array;
53
54static unsigned int ksym_filter_entry_count;
55static unsigned int ksym_tracing_enabled;
56
57static HLIST_HEAD(ksym_filter_head);
58
59static DEFINE_MUTEX(ksym_tracer_mutex);
60
61#ifdef CONFIG_PROFILE_KSYM_TRACER
62
63#define MAX_UL_INT 0xffffffff
64
65void ksym_collect_stats(unsigned long hbp_hit_addr)
66{
67 struct hlist_node *node;
68 struct trace_ksym *entry;
69
70 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) &&
73 (entry->counter <= MAX_UL_INT)) {
74 entry->counter++;
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, void *data)
83{
84 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer;
88 int pc;
89
90 if (!ksym_tracing_enabled)
91 return;
92
93 buffer = ksym_trace_array->buffer;
94
95 pc = preempt_count();
96
97 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
98 sizeof(*entry), 0, pc);
99 if (!event)
100 return;
101
102 entry = ring_buffer_event_data(event);
103 entry->ip = instruction_pointer(regs);
104 entry->type = hw_breakpoint_type(hbp);
105 entry->addr = hw_breakpoint_addr(hbp);
106 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
107
108#ifdef CONFIG_PROFILE_KSYM_TRACER
109 ksym_collect_stats(hw_breakpoint_addr(hbp));
110#endif /* CONFIG_PROFILE_KSYM_TRACER */
111
112 trace_buffer_unlock_commit(buffer, event, 0, pc);
113}
114
115/* Valid access types are represented as
116 *
117 * rw- : Set Read/Write Access Breakpoint
118 * -w- : Set Write Access Breakpoint
119 * --- : Clear Breakpoints
120 * --x : Set Execution Break points (Not available yet)
121 *
122 */
123static int ksym_trace_get_access_type(char *str)
124{
125 int access = 0;
126
127 if (str[0] == 'r')
128 access |= HW_BREAKPOINT_R;
129
130 if (str[1] == 'w')
131 access |= HW_BREAKPOINT_W;
132
133 if (str[2] == 'x')
134 access |= HW_BREAKPOINT_X;
135
136 switch (access) {
137 case HW_BREAKPOINT_R:
138 case HW_BREAKPOINT_W:
139 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
140 return access;
141 default:
142 return -EINVAL;
143 }
144}
145
146/*
147 * There can be several possible malformed requests and we attempt to capture
148 * all of them. We enumerate some of the rules
 149 * 1. Kernel symbols containing ':' are not allowed, since ':' is used as the
 150 * delimiter; i.e. multiple ':' symbols are disallowed. A possible form is
 151 * <module>:<ksym_name>:<op>.
152 * 2. No delimiter symbol ':' in the input string
153 * 3. Spurious operator symbols or symbols not in their respective positions
154 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
155 * 5. Kernel symbol not a part of /proc/kallsyms
156 * 6. Duplicate requests
157 */
158static int parse_ksym_trace_str(char *input_string, char **ksymname,
159 unsigned long *addr)
160{
161 int ret;
162
163 *ksymname = strsep(&input_string, ":");
164 *addr = kallsyms_lookup_name(*ksymname);
165
166 /* Check for malformed request: (2), (1) and (5) */
167 if ((!input_string) ||
168 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
169 (*addr == 0))
 170 return -EINVAL;
171
172 ret = ksym_trace_get_access_type(input_string);
173
174 return ret;
175}
176
177int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
178{
179 struct trace_ksym *entry;
180 int ret = -ENOMEM;
181
182 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
183 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
184 " new requests for tracing can be accepted now.\n",
185 KSYM_TRACER_MAX);
186 return -ENOSPC;
187 }
188
189 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
190 if (!entry)
191 return -ENOMEM;
192
193 hw_breakpoint_init(&entry->attr);
194
195 entry->attr.bp_type = op;
196 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler);
202
203 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again"
206 " later!!\n");
207 goto err;
208 }
209
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212
213 return 0;
214
215err:
216 kfree(entry);
217
218 return ret;
219}
220
221static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
222 size_t count, loff_t *ppos)
223{
224 struct trace_ksym *entry;
225 struct hlist_node *node;
226 struct trace_seq *s;
227 ssize_t cnt = 0;
228 int ret;
229
230 s = kmalloc(sizeof(*s), GFP_KERNEL);
231 if (!s)
232 return -ENOMEM;
233 trace_seq_init(s);
234
235 mutex_lock(&ksym_tracer_mutex);
236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
242 ret = trace_seq_puts(s, "-w-\n");
243 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
244 ret = trace_seq_puts(s, "rw-\n");
245 WARN_ON_ONCE(!ret);
246 }
247
248 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
249
250 mutex_unlock(&ksym_tracer_mutex);
251
252 kfree(s);
253
254 return cnt;
255}
256
257static void __ksym_trace_reset(void)
258{
259 struct trace_ksym *entry;
260 struct hlist_node *node, *node1;
261
262 mutex_lock(&ksym_tracer_mutex);
263 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
264 ksym_hlist) {
265 unregister_wide_hw_breakpoint(entry->ksym_hbp);
266 ksym_filter_entry_count--;
267 hlist_del_rcu(&(entry->ksym_hlist));
268 synchronize_rcu();
269 kfree(entry);
270 }
271 mutex_unlock(&ksym_tracer_mutex);
272}
273
274static ssize_t ksym_trace_filter_write(struct file *file,
275 const char __user *buffer,
276 size_t count, loff_t *ppos)
277{
278 struct trace_ksym *entry;
279 struct hlist_node *node;
280 char *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0;
283
284 input_string = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string)
286 return -ENOMEM;
287
288 if (copy_from_user(input_string, buffer, count)) {
289 kfree(input_string);
290 return -EFAULT;
291 }
292 input_string[count] = '\0';
293
294 strstrip(input_string);
295
296 /*
297 * Clear all breakpoints if:
298 * 1: echo > ksym_trace_filter
299 * 2: echo 0 > ksym_trace_filter
300 * 3: echo "*:---" > ksym_trace_filter
301 */
302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset();
305 kfree(input_string);
306 return count;
307 }
308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) {
311 kfree(input_string);
312 return ret;
313 }
314
315 mutex_lock(&ksym_tracer_mutex);
316
317 ret = -EINVAL;
318 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
319 if (entry->attr.bp_addr == ksym_addr) {
320 /* Check for malformed request: (6) */
321 if (entry->attr.bp_type != op)
322 changed = 1;
323 else
324 goto out;
325 break;
326 }
327 }
328 if (changed) {
329 unregister_wide_hw_breakpoint(entry->ksym_hbp);
330 entry->attr.bp_type = op;
331 ret = 0;
332 if (op > 0) {
333 entry->ksym_hbp =
334 register_wide_hw_breakpoint(&entry->attr,
335 ksym_hbp_handler);
336 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp);
338 else
339 goto out;
340 }
341 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu();
345 kfree(entry);
346 goto out;
347 } else {
348 /* Check for malformed request: (4) */
349 if (op == 0)
350 goto out;
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 }
353out:
354 mutex_unlock(&ksym_tracer_mutex);
355
356 kfree(input_string);
357
358 if (!ret)
359 ret = count;
360 return ret;
361}
362
363static const struct file_operations ksym_tracing_fops = {
364 .open = tracing_open_generic,
365 .read = ksym_trace_filter_read,
366 .write = ksym_trace_filter_write,
367};
368
369static void ksym_trace_reset(struct trace_array *tr)
370{
371 ksym_tracing_enabled = 0;
372 __ksym_trace_reset();
373}
374
375static int ksym_trace_init(struct trace_array *tr)
376{
377 int cpu, ret = 0;
378
379 for_each_online_cpu(cpu)
380 tracing_reset(tr, cpu);
381 ksym_tracing_enabled = 1;
382 ksym_trace_array = tr;
383
384 return ret;
385}
386
387static void ksym_trace_print_header(struct seq_file *m)
388{
389 seq_puts(m,
390 "# TASK-PID CPU# Symbol "
391 "Type Function\n");
392 seq_puts(m,
393 "# | | | "
394 " | |\n");
395}
396
397static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
398{
399 struct trace_entry *entry = iter->ent;
400 struct trace_seq *s = &iter->seq;
401 struct ksym_trace_entry *field;
402 char str[KSYM_SYMBOL_LEN];
403 int ret;
404
405 if (entry->type != TRACE_KSYM)
406 return TRACE_TYPE_UNHANDLED;
407
408 trace_assign_type(field, entry);
409
410 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
411 entry->pid, iter->cpu, (char *)field->addr);
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 switch (field->type) {
416 case HW_BREAKPOINT_R:
417 ret = trace_seq_printf(s, " R ");
418 break;
419 case HW_BREAKPOINT_W:
420 ret = trace_seq_printf(s, " W ");
421 break;
422 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
423 ret = trace_seq_printf(s, " RW ");
424 break;
425 default:
426 return TRACE_TYPE_PARTIAL_LINE;
427 }
428
429 if (!ret)
430 return TRACE_TYPE_PARTIAL_LINE;
431
432 sprint_symbol(str, field->ip);
433 ret = trace_seq_printf(s, "%s\n", str);
434 if (!ret)
435 return TRACE_TYPE_PARTIAL_LINE;
436
437 return TRACE_TYPE_HANDLED;
438}
439
440struct tracer ksym_tracer __read_mostly =
441{
442 .name = "ksym_tracer",
443 .init = ksym_trace_init,
444 .reset = ksym_trace_reset,
445#ifdef CONFIG_FTRACE_SELFTEST
446 .selftest = trace_selftest_startup_ksym,
447#endif
448 .print_header = ksym_trace_print_header,
449 .print_line = ksym_trace_output
450};
451
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m)
473{
474 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480
481static int ksym_tracer_stat_show(struct seq_file *m, void *v)
482{
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
489
490 access_type = entry->attr.bp_type;
491
492 switch (access_type) {
493 case HW_BREAKPOINT_R:
494 seq_puts(m, " R ");
495 break;
496 case HW_BREAKPOINT_W:
497 seq_puts(m, " W ");
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 }
505
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511
512 return 0;
513}
514
515static void *ksym_tracer_stat_start(struct tracer_stat *trace)
516{
517 return ksym_filter_head.first;
518}
519
520static void *
521ksym_tracer_stat_next(void *v, int idx)
522{
523 struct hlist_node *stat = v;
524
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534};
535
536__init static int ksym_tracer_stat_init(void)
537{
538 int ret;
539
540 ret = register_stat_tracer(&ksym_tracer_stats);
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546
547 return 0;
548}
549fs_initcall(ksym_tracer_stat_init);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
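trace_ksym.c above arms hardware breakpoints on kernel data symbols through the ksym_trace_filter debugfs file, one "SYMBOL:<access>" request per line as parsed by parse_ksym_trace_str() and ksym_trace_get_access_type(). A minimal user-space sketch follows; the symbol pid_max and the /sys/kernel/debug mount point are illustrative assumptions, and hits are only recorded while ksym_tracer is the current tracer (ksym_trace_init() sets ksym_tracing_enabled).

/* Sketch only: request a read/write hardware breakpoint on a data symbol. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /*
     * "pid_max" is just an example; any data symbol resolvable via
     * kallsyms works, subject to the KSYM_TRACER_MAX limit above.
     */
    const char *req = "pid_max:rw-\n";
    int fd = open("/sys/kernel/debug/tracing/ksym_trace_filter", O_WRONLY);

    if (fd < 0)
        return 1;
    write(fd, req, strlen(req));
    close(fd);

    /*
     * An empty write, "0", or "*:---" clears every breakpoint again,
     * see ksym_trace_filter_write().
     */
    return 0;
}

When CONFIG_PROFILE_KSYM_TRACER is set, per-symbol hit counts are also exported through the stat tracer registered by ksym_tracer_stat_init().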
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
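
The selftest added above follows the usual pattern of the startup tests in this file: initialize the tracer, register a read-write breakpoint on a dummy variable, touch the variable once for a read and once for a write, stop tracing, and expect exactly two entries in the buffer. The stand-alone sketch below only models that accounting; the counter and trigger function are invented for the example and do not correspond to kernel symbols.

/* Stand-alone model of the ksym selftest logic: one read plus one
 * write on the watched variable should account for two trace entries.
 * Everything here is illustrative; no kernel API is involved. */
#include <stdio.h>

static int selftest_dummy;
static unsigned long trace_entries;	/* stands in for the trace buffer count */

static void breakpoint_hit(void)
{
	trace_entries++;		/* the real tracer logs one entry per access */
}

int main(void)
{
	/* read access */
	if (selftest_dummy == 0)
		breakpoint_hit();
	/* write access */
	selftest_dummy++;
	breakpoint_hit();

	if (trace_entries != 2) {
		printf("ksym selftest model failed: %lu entries\n", trace_entries);
		return 1;
	}
	printf("ksym selftest model passed: %lu entries\n", trace_entries);
	return 0;
}
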
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ddee9c593732..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,32 +51,6 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
51 return syscalls_metadata[nr]; 51 return syscalls_metadata[nr];
52} 52}
53 53
54int syscall_name_to_nr(char *name)
55{
56 int i;
57
58 if (!syscalls_metadata)
59 return -1;
60
61 for (i = 0; i < NR_syscalls; i++) {
62 if (syscalls_metadata[i]) {
63 if (!strcmp(syscalls_metadata[i]->name, name))
64 return i;
65 }
66 }
67 return -1;
68}
69
70void set_syscall_enter_id(int num, int id)
71{
72 syscalls_metadata[num]->enter_id = id;
73}
74
75void set_syscall_exit_id(int num, int id)
76{
77 syscalls_metadata[num]->exit_id = id;
78}
79
80enum print_line_t 54enum print_line_t
81print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
82{ 56{
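
The hunk above removes syscall_name_to_nr() and the set_syscall_enter_id()/set_syscall_exit_id() helpers: instead of resolving a syscall by scanning the metadata table for a matching name and stashing separate enter/exit ids, the later hunks make each ftrace_event_call carry a pointer to its syscall_metadata in call->data, and the metadata itself records syscall_nr along with its enter_event/exit_event. A simplified user-space contrast of the two lookups follows; the struct layouts and names are trimmed for the example and are not the kernel definitions.

/* Simplified contrast between the removed name-based lookup and the
 * direct metadata reference used after this series. */
#include <stdio.h>
#include <string.h>

#define NR_DEMO_SYSCALLS 3

struct demo_metadata {
	const char *name;
	int syscall_nr;		/* filled in once at init time */
};

static struct demo_metadata *demo_table[NR_DEMO_SYSCALLS];
static struct demo_metadata meta_open  = { "sys_open",  -1 };
static struct demo_metadata meta_close = { "sys_close", -1 };

/* Old style: linear scan by name, as syscall_name_to_nr() did */
static int name_to_nr(const char *name)
{
	int i;

	for (i = 0; i < NR_DEMO_SYSCALLS; i++)
		if (demo_table[i] && !strcmp(demo_table[i]->name, name))
			return i;
	return -1;
}

int main(void)
{
	int i;

	demo_table[0] = &meta_open;
	demo_table[2] = &meta_close;

	/* init_ftrace_syscalls()-style pass: record the number in the metadata */
	for (i = 0; i < NR_DEMO_SYSCALLS; i++)
		if (demo_table[i])
			demo_table[i]->syscall_nr = i;

	/* old: scan by name on every registration */
	printf("by name:     sys_close -> %d\n", name_to_nr("sys_close"));
	/* new: the metadata already knows its own number */
	printf("from struct: sys_close -> %d\n", meta_close.syscall_nr);
	return 0;
}
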
@@ -93,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
93 if (!entry) 67 if (!entry)
94 goto end; 68 goto end;
95 69
96 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
97 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
98 goto end; 72 goto end;
99 } 73 }
@@ -148,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
148 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
149 } 123 }
150 124
151 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
152 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
153 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
154 } 128 }
@@ -166,24 +140,19 @@ extern char *__bad_type_size(void);
166#define SYSCALL_FIELD(type, name) \ 140#define SYSCALL_FIELD(type, name) \
167 sizeof(type) != sizeof(trace.name) ? \ 141 sizeof(type) != sizeof(trace.name) ? \
168 __bad_type_size() : \ 142 __bad_type_size() : \
169 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type)
170 145
171int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
172{ 147{
173 int i; 148 int i;
174 int nr;
175 int ret; 149 int ret;
176 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
177 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
178 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
179 153
180 nr = syscall_name_to_nr(call->data); 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
181 entry = syscall_nr_to_meta(nr); 155 "\tsigned:%u;\n",
182
183 if (!entry)
184 return 0;
185
186 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
187 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
188 if (!ret) 157 if (!ret)
189 return 0; 158 return 0;
@@ -193,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
193 entry->args[i]); 162 entry->args[i]);
194 if (!ret) 163 if (!ret)
195 return 0; 164 return 0;
196 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
197 sizeof(unsigned long)); 166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
198 if (!ret) 169 if (!ret)
199 return 0; 170 return 0;
200 offset += sizeof(unsigned long); 171 offset += sizeof(unsigned long);
@@ -226,8 +197,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
226 struct syscall_trace_exit trace; 197 struct syscall_trace_exit trace;
227 198
228 ret = trace_seq_printf(s, 199 ret = trace_seq_printf(s,
229 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
230 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
231 SYSCALL_FIELD(int, nr), 204 SYSCALL_FIELD(int, nr),
232 SYSCALL_FIELD(long, ret)); 205 SYSCALL_FIELD(long, ret));
233 if (!ret) 206 if (!ret)
@@ -239,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
239int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
240{ 213{
241 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
242 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
243 int ret; 216 int ret;
244 int nr;
245 int i; 217 int i;
246 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
247 219
248 nr = syscall_name_to_nr(call->data);
249 meta = syscall_nr_to_meta(nr);
250
251 if (!meta)
252 return 0;
253
254 ret = trace_define_common_fields(call); 220 ret = trace_define_common_fields(call);
255 if (ret) 221 if (ret)
256 return ret; 222 return ret;
257 223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret)
226 return ret;
227
258 for (i = 0; i < meta->nb_args; i++) { 228 for (i = 0; i < meta->nb_args; i++) {
259 ret = trace_define_field(call, meta->types[i], 229 ret = trace_define_field(call, meta->types[i],
260 meta->args[i], offset, 230 meta->args[i], offset,
@@ -275,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
275 if (ret) 245 if (ret)
276 return ret; 246 return ret;
277 247
278 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, 248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret)
250 return ret;
251
252 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
279 FILTER_OTHER); 253 FILTER_OTHER);
280 254
281 return ret; 255 return ret;
@@ -302,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
302 276
303 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 277 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
304 278
305 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 279 event = trace_current_buffer_lock_reserve(&buffer,
306 size, 0, 0); 280 sys_data->enter_event->id, size, 0, 0);
307 if (!event) 281 if (!event)
308 return; 282 return;
309 283
@@ -334,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
334 if (!sys_data) 308 if (!sys_data)
335 return; 309 return;
336 310
337 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 311 event = trace_current_buffer_lock_reserve(&buffer,
338 sizeof(*entry), 0, 0); 312 sys_data->exit_event->id, sizeof(*entry), 0, 0);
339 if (!event) 313 if (!event)
340 return; 314 return;
341 315
@@ -348,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
348 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 322 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
349} 323}
350 324
351int reg_event_syscall_enter(void *ptr) 325int reg_event_syscall_enter(struct ftrace_event_call *call)
352{ 326{
353 int ret = 0; 327 int ret = 0;
354 int num; 328 int num;
355 char *name;
356 329
357 name = (char *)ptr; 330 num = ((struct syscall_metadata *)call->data)->syscall_nr;
358 num = syscall_name_to_nr(name);
359 if (num < 0 || num >= NR_syscalls) 331 if (num < 0 || num >= NR_syscalls)
360 return -ENOSYS; 332 return -ENOSYS;
361 mutex_lock(&syscall_trace_lock); 333 mutex_lock(&syscall_trace_lock);
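
reg_event_syscall_enter() and its three siblings below now take the ftrace_event_call itself and read the syscall number straight out of the metadata attached to call->data, rather than converting a name back into a number. A small user-space model of that opaque-pointer hand-off, with invented type names, is sketched here.

/* Model of the call->data hand-off: the event carries an opaque pointer
 * to its metadata and the registration path casts it back.  Types and
 * names are invented for the example. */
#include <stdio.h>

struct demo_metadata {
	int syscall_nr;
};

struct demo_event_call {
	const char *name;
	void *data;		/* points at the event's demo_metadata */
};

static int demo_reg_enter(struct demo_event_call *call)
{
	int num = ((struct demo_metadata *)call->data)->syscall_nr;

	if (num < 0)
		return -1;	/* the kernel code returns -ENOSYS here */

	printf("registering enter tracing for syscall %d (%s)\n", num, call->name);
	return 0;
}

int main(void)
{
	struct demo_metadata meta = { .syscall_nr = 2 };
	struct demo_event_call call = { .name = "sys_open", .data = &meta };

	return demo_reg_enter(&call);
}
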
@@ -372,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
372 return ret; 344 return ret;
373} 345}
374 346
375void unreg_event_syscall_enter(void *ptr) 347void unreg_event_syscall_enter(struct ftrace_event_call *call)
376{ 348{
377 int num; 349 int num;
378 char *name;
379 350
380 name = (char *)ptr; 351 num = ((struct syscall_metadata *)call->data)->syscall_nr;
381 num = syscall_name_to_nr(name);
382 if (num < 0 || num >= NR_syscalls) 352 if (num < 0 || num >= NR_syscalls)
383 return; 353 return;
384 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
@@ -389,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
389 mutex_unlock(&syscall_trace_lock); 359 mutex_unlock(&syscall_trace_lock);
390} 360}
391 361
392int reg_event_syscall_exit(void *ptr) 362int reg_event_syscall_exit(struct ftrace_event_call *call)
393{ 363{
394 int ret = 0; 364 int ret = 0;
395 int num; 365 int num;
396 char *name;
397 366
398 name = (char *)ptr; 367 num = ((struct syscall_metadata *)call->data)->syscall_nr;
399 num = syscall_name_to_nr(name);
400 if (num < 0 || num >= NR_syscalls) 368 if (num < 0 || num >= NR_syscalls)
401 return -ENOSYS; 369 return -ENOSYS;
402 mutex_lock(&syscall_trace_lock); 370 mutex_lock(&syscall_trace_lock);
@@ -413,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
413 return ret; 381 return ret;
414} 382}
415 383
416void unreg_event_syscall_exit(void *ptr) 384void unreg_event_syscall_exit(struct ftrace_event_call *call)
417{ 385{
418 int num; 386 int num;
419 char *name;
420 387
421 name = (char *)ptr; 388 num = ((struct syscall_metadata *)call->data)->syscall_nr;
422 num = syscall_name_to_nr(name);
423 if (num < 0 || num >= NR_syscalls) 389 if (num < 0 || num >= NR_syscalls)
424 return; 390 return;
425 mutex_lock(&syscall_trace_lock); 391 mutex_lock(&syscall_trace_lock);
@@ -430,13 +396,17 @@ void unreg_event_syscall_exit(void *ptr)
430 mutex_unlock(&syscall_trace_lock); 396 mutex_unlock(&syscall_trace_lock);
431} 397}
432 398
433struct trace_event event_syscall_enter = { 399int init_syscall_trace(struct ftrace_event_call *call)
434 .trace = print_syscall_enter, 400{
435}; 401 int id;
436 402
437struct trace_event event_syscall_exit = { 403 id = register_ftrace_event(call->event);
438 .trace = print_syscall_exit, 404 if (!id)
439}; 405 return -ENODEV;
406 call->id = id;
407 INIT_LIST_HEAD(&call->fields);
408 return 0;
409}
440 410
441int __init init_ftrace_syscalls(void) 411int __init init_ftrace_syscalls(void)
442{ 412{
@@ -454,6 +424,10 @@ int __init init_ftrace_syscalls(void)
454 for (i = 0; i < NR_syscalls; i++) { 424 for (i = 0; i < NR_syscalls; i++) {
455 addr = arch_syscall_addr(i); 425 addr = arch_syscall_addr(i);
456 meta = find_syscall_meta(addr); 426 meta = find_syscall_meta(addr);
427 if (!meta)
428 continue;
429
430 meta->syscall_nr = i;
457 syscalls_metadata[i] = meta; 431 syscalls_metadata[i] = meta;
458 } 432 }
459 433
@@ -473,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
473 struct syscall_metadata *sys_data; 447 struct syscall_metadata *sys_data;
474 struct syscall_trace_enter *rec; 448 struct syscall_trace_enter *rec;
475 unsigned long flags; 449 unsigned long flags;
450 char *trace_buf;
476 char *raw_data; 451 char *raw_data;
477 int syscall_nr; 452 int syscall_nr;
453 int rctx;
478 int size; 454 int size;
479 int cpu; 455 int cpu;
480 456
@@ -498,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
498 /* Protect the per cpu buffer, begin the rcu read side */ 474 /* Protect the per cpu buffer, begin the rcu read side */
499 local_irq_save(flags); 475 local_irq_save(flags);
500 476
477 rctx = perf_swevent_get_recursion_context();
478 if (rctx < 0)
479 goto end_recursion;
480
501 cpu = smp_processor_id(); 481 cpu = smp_processor_id();
502 482
503 if (in_nmi()) 483 trace_buf = rcu_dereference(perf_trace_buf);
504 raw_data = rcu_dereference(trace_profile_buf_nmi);
505 else
506 raw_data = rcu_dereference(trace_profile_buf);
507 484
508 if (!raw_data) 485 if (!trace_buf)
509 goto end; 486 goto end;
510 487
511 raw_data = per_cpu_ptr(raw_data, cpu); 488 raw_data = per_cpu_ptr(trace_buf, cpu);
512 489
513 /* zero the dead bytes from align to not leak stack to user */ 490 /* zero the dead bytes from align to not leak stack to user */
514 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
515 492
516 rec = (struct syscall_trace_enter *) raw_data; 493 rec = (struct syscall_trace_enter *) raw_data;
517 tracing_generic_entry_update(&rec->ent, 0, 0); 494 tracing_generic_entry_update(&rec->ent, 0, 0);
518 rec->ent.type = sys_data->enter_id; 495 rec->ent.type = sys_data->enter_event->id;
519 rec->nr = syscall_nr; 496 rec->nr = syscall_nr;
520 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 497 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
521 (unsigned long *)&rec->args); 498 (unsigned long *)&rec->args);
522 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
523 500
524end: 501end:
502 perf_swevent_put_recursion_context(rctx);
503end_recursion:
525 local_irq_restore(flags); 504 local_irq_restore(flags);
526} 505}
527 506
528int reg_prof_syscall_enter(char *name) 507int prof_sysenter_enable(struct ftrace_event_call *call)
529{ 508{
530 int ret = 0; 509 int ret = 0;
531 int num; 510 int num;
532 511
533 num = syscall_name_to_nr(name); 512 num = ((struct syscall_metadata *)call->data)->syscall_nr;
534 if (num < 0 || num >= NR_syscalls)
535 return -ENOSYS;
536 513
537 mutex_lock(&syscall_trace_lock); 514 mutex_lock(&syscall_trace_lock);
538 if (!sys_prof_refcount_enter) 515 if (!sys_prof_refcount_enter)
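
In the hunk above, the in_nmi() choice between two per-cpu profile buffers is replaced by a single perf_trace_buf guarded by the new software-event recursion context: the handler takes a recursion slot with perf_swevent_get_recursion_context(), bails out if it is already nested, and releases the slot with perf_swevent_put_recursion_context() on every exit path. The user-space analogy below only models that bracket; the guard and buffer names are invented and the real kernel calls are the ones named above.

/* User-space analogy of the recursion guard pattern: take a recursion
 * slot, use the shared buffer only if the slot was free, and always
 * release the slot on the way out. */
#include <stdio.h>

static int in_handler;			/* stands in for the per-context recursion count */
static char scratch_buf[64];		/* stands in for the per-cpu profile buffer */

static int get_recursion_context(void)
{
	if (in_handler)
		return -1;		/* already nested: refuse, as the kernel does */
	in_handler = 1;
	return 0;
}

static void put_recursion_context(void)
{
	in_handler = 0;
}

static void demo_prof_hook(const char *event)
{
	int rctx = get_recursion_context();

	if (rctx < 0)
		return;			/* mirrors the end_recursion: path */

	snprintf(scratch_buf, sizeof(scratch_buf), "event: %s", event);
	printf("%s\n", scratch_buf);

	put_recursion_context();	/* always released, mirroring the end: label */
}

int main(void)
{
	demo_prof_hook("sys_enter_open");
	return 0;
}
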
@@ -548,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
548 return ret; 525 return ret;
549} 526}
550 527
551void unreg_prof_syscall_enter(char *name) 528void prof_sysenter_disable(struct ftrace_event_call *call)
552{ 529{
553 int num; 530 int num;
554 531
555 num = syscall_name_to_nr(name); 532 num = ((struct syscall_metadata *)call->data)->syscall_nr;
556 if (num < 0 || num >= NR_syscalls)
557 return;
558 533
559 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
560 sys_prof_refcount_enter--; 535 sys_prof_refcount_enter--;
@@ -570,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
570 struct syscall_trace_exit *rec; 545 struct syscall_trace_exit *rec;
571 unsigned long flags; 546 unsigned long flags;
572 int syscall_nr; 547 int syscall_nr;
548 char *trace_buf;
573 char *raw_data; 549 char *raw_data;
550 int rctx;
574 int size; 551 int size;
575 int cpu; 552 int cpu;
576 553
@@ -596,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
596 573
597 /* Protect the per cpu buffer, begin the rcu read side */ 574 /* Protect the per cpu buffer, begin the rcu read side */
598 local_irq_save(flags); 575 local_irq_save(flags);
576
577 rctx = perf_swevent_get_recursion_context();
578 if (rctx < 0)
579 goto end_recursion;
580
599 cpu = smp_processor_id(); 581 cpu = smp_processor_id();
600 582
601 if (in_nmi()) 583 trace_buf = rcu_dereference(perf_trace_buf);
602 raw_data = rcu_dereference(trace_profile_buf_nmi);
603 else
604 raw_data = rcu_dereference(trace_profile_buf);
605 584
606 if (!raw_data) 585 if (!trace_buf)
607 goto end; 586 goto end;
608 587
609 raw_data = per_cpu_ptr(raw_data, cpu); 588 raw_data = per_cpu_ptr(trace_buf, cpu);
610 589
611 /* zero the dead bytes from align to not leak stack to user */ 590 /* zero the dead bytes from align to not leak stack to user */
612 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -614,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
614 rec = (struct syscall_trace_exit *)raw_data; 593 rec = (struct syscall_trace_exit *)raw_data;
615 594
616 tracing_generic_entry_update(&rec->ent, 0, 0); 595 tracing_generic_entry_update(&rec->ent, 0, 0);
617 rec->ent.type = sys_data->exit_id; 596 rec->ent.type = sys_data->exit_event->id;
618 rec->nr = syscall_nr; 597 rec->nr = syscall_nr;
619 rec->ret = syscall_get_return_value(current, regs); 598 rec->ret = syscall_get_return_value(current, regs);
620 599
621 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
622 601
623end: 602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
624 local_irq_restore(flags); 605 local_irq_restore(flags);
625} 606}
626 607
627int reg_prof_syscall_exit(char *name) 608int prof_sysexit_enable(struct ftrace_event_call *call)
628{ 609{
629 int ret = 0; 610 int ret = 0;
630 int num; 611 int num;
631 612
632 num = syscall_name_to_nr(name); 613 num = ((struct syscall_metadata *)call->data)->syscall_nr;
633 if (num < 0 || num >= NR_syscalls)
634 return -ENOSYS;
635 614
636 mutex_lock(&syscall_trace_lock); 615 mutex_lock(&syscall_trace_lock);
637 if (!sys_prof_refcount_exit) 616 if (!sys_prof_refcount_exit)
@@ -647,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
647 return ret; 626 return ret;
648} 627}
649 628
650void unreg_prof_syscall_exit(char *name) 629void prof_sysexit_disable(struct ftrace_event_call *call)
651{ 630{
652 int num; 631 int num;
653 632
654 num = syscall_name_to_nr(name); 633 num = ((struct syscall_metadata *)call->data)->syscall_nr;
655 if (num < 0 || num >= NR_syscalls)
656 return;
657 634
658 mutex_lock(&syscall_trace_lock); 635 mutex_lock(&syscall_trace_lock);
659 sys_prof_refcount_exit--; 636 sys_prof_refcount_exit--;