path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                     |    2
-rw-r--r--  kernel/exit.c                       |    5
-rw-r--r--  kernel/hw_breakpoint.c              |  423
-rw-r--r--  kernel/kallsyms.c                   |    1
-rw-r--r--  kernel/perf_event.c                 |  528
-rw-r--r--  kernel/signal.c                     |   27
-rw-r--r--  kernel/trace/Kconfig                |   21
-rw-r--r--  kernel/trace/Makefile               |    1
-rw-r--r--  kernel/trace/trace.h                |    9
-rw-r--r--  kernel/trace/trace_entries.h        |   16
-rw-r--r--  kernel/trace/trace_event_profile.c  |   14
-rw-r--r--  kernel/trace/trace_kprobe.c         |   94
-rw-r--r--  kernel/trace/trace_ksym.c           |  550
-rw-r--r--  kernel/trace/trace_selftest.c       |   55
-rw-r--r--  kernel/trace/trace_syscalls.c       |  191
15 files changed, 1573 insertions, 364 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..6b7ce8173dfd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
24endif 25endif
25 26
26obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -95,6 +96,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o 97obj-$(CONFIG_SLOW_WORK) += slow-work.o
97obj-$(CONFIG_PERF_EVENTS) += perf_event.o 98obj-$(CONFIG_PERF_EVENTS) += perf_event.o
99obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
98 100
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 101ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 102# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..3f45e3cf931d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -978,6 +979,10 @@ NORET_TYPE void do_exit(long code)
978 proc_exit_connector(tsk); 979 proc_exit_connector(tsk);
979 980
980 /* 981 /*
982 * FIXME: do that only when needed, using sched_exit tracepoint
983 */
984 flush_ptrace_hw_breakpoint(tsk);
985 /*
981 * Flush inherited counters to the parent - before the parent 986 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications. 987 * gets woken up by child-exit notifications.
983 */ 988 */
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..cf5ee1628411
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,423 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) IBM Corporation, 2009
18 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Thanks to Ingo Molnar for his many suggestions.
21 *
22 * Authors: Alan Stern <stern@rowland.harvard.edu>
23 * K.Prasad <prasad@linux.vnet.ibm.com>
24 * Frederic Weisbecker <fweisbec@gmail.com>
25 */
26
27/*
28 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
29 * using the CPU's debug registers.
30 * This file contains the arch-independent routines.
31 */
32
33#include <linux/irqflags.h>
34#include <linux/kallsyms.h>
35#include <linux/notifier.h>
36#include <linux/kprobes.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/percpu.h>
41#include <linux/sched.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44
45#include <linux/hw_breakpoint.h>
46
47/*
48 * Constraints data
49 */
50
51/* Number of pinned cpu breakpoints in a cpu */
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53
54/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
56
57/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
59
60/* Gather the number of total pinned and un-pinned bp in a cpuset */
61struct bp_busy_slots {
62 unsigned int pinned;
63 unsigned int flexible;
64};
65
66/* Serialize accesses to the above constraints */
67static DEFINE_MUTEX(nr_bp_mutex);
68
69/*
70 * Report the maximum number of pinned breakpoints a task
71 * can have on this cpu
72 */
73static unsigned int max_task_bp_pinned(int cpu)
74{
75 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
77
78 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0)
80 return i + 1;
81 }
82
83 return 0;
84}
85
86/*
87 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
91{
92 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96
97 return;
98 }
99
100 for_each_online_cpu(cpu) {
101 unsigned int nr;
102
103 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu);
105
106 if (nr > slots->pinned)
107 slots->pinned = nr;
108
109 nr = per_cpu(nr_bp_flexible, cpu);
110
111 if (nr > slots->flexible)
112 slots->flexible = nr;
113 }
114}
115
116/*
117 * Add a pinned breakpoint for the given task in our constraint table
118 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *tsk_pinned;
125 struct list_head *list;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143
144 spin_unlock_irqrestore(&ctx->lock, flags);
145
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
147 return;
148
149 tsk_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) {
151 tsk_pinned[count]++;
152 if (count > 0)
153 tsk_pinned[count-1]--;
154 } else {
155 tsk_pinned[count]--;
156 if (count > 0)
157 tsk_pinned[count-1]++;
158 }
159}
160
161/*
162 * Add/remove the given breakpoint in our constraint table
163 */
164static void toggle_bp_slot(struct perf_event *bp, bool enable)
165{
166 int cpu = bp->cpu;
167 struct task_struct *tsk = bp->ctx->task;
168
169 /* Pinned counter task profiling */
170 if (tsk) {
171 if (cpu >= 0) {
172 toggle_bp_task_slot(tsk, cpu, enable);
173 return;
174 }
175
176 for_each_online_cpu(cpu)
177 toggle_bp_task_slot(tsk, cpu, enable);
178 return;
179 }
180
181 /* Pinned counter cpu profiling */
182 if (enable)
183 per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
184 else
185 per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
186}
187
188/*
189 * Constraints to check before allowing this new breakpoint counter:
190 *
191 * == Non-pinned counter == (Considered as pinned for now)
192 *
193 * - If attached to a single cpu, check:
194 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
197 *
198 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them.
200 * Otherwise, we check that the maximum number of per-task
201 * breakpoints (for this cpu) plus the number of per-cpu breakpoints
202 * (for this cpu) doesn't use up every register.
203 *
204 * - If attached to every cpu, check:
205 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
208 *
209 * -> This is roughly the same, except we check the number of per-cpu
210 * bp for every cpu and we keep the max one. Same for the per-task
211 * breakpoints.
212 *
213 *
214 * == Pinned counter ==
215 *
216 * - If attached to a single cpu, check:
217 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
220 *
221 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
222 * at least one register (or they will never be fed).
223 *
224 * - If attached to every cpu, check:
225 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
228 */
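/*
 * Editorial illustration (not part of the patch): a worked instance of the
 * pinned check above, assuming HBP_NUM == 4 debug registers as on x86.
 * Suppose per_cpu(nr_cpu_bp_pinned, cpu) == 2 and the busiest task on that
 * cpu holds max_task_bp_pinned(cpu) == 1, so slots.pinned == 3.  If at least
 * one flexible counter exists, slots.pinned + !!slots.flexible == 4 == HBP_NUM
 * and reserve_bp_slot() below refuses the new pinned breakpoint with -ENOSPC,
 * keeping one register free for the flexible counters to run in.
 */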
229int reserve_bp_slot(struct perf_event *bp)
230{
231 struct bp_busy_slots slots = {0};
232 int ret = 0;
233
234 mutex_lock(&nr_bp_mutex);
235
236 fetch_bp_busy_slots(&slots, bp->cpu);
237
238 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
240 ret = -ENOSPC;
241 goto end;
242 }
243
244 toggle_bp_slot(bp, true);
245
246end:
247 mutex_unlock(&nr_bp_mutex);
248
249 return ret;
250}
251
252void release_bp_slot(struct perf_event *bp)
253{
254 mutex_lock(&nr_bp_mutex);
255
256 toggle_bp_slot(bp, false);
257
258 mutex_unlock(&nr_bp_mutex);
259}
260
261
262int __register_perf_hw_breakpoint(struct perf_event *bp)
263{
264 int ret;
265
266 ret = reserve_bp_slot(bp);
267 if (ret)
268 return ret;
269
270 /*
271 * Ptrace breakpoints can be temporary perf events only
272 * meant to reserve a slot. In that case they are created disabled and
273 * we don't want to check the params right now (as we put a null addr).
274 * But perf tools create events as disabled and we want to check
275 * the params for them.
276 * This is a quick hack that will be removed soon, once we remove
277 * the tmp breakpoints from ptrace.
278 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281
282 return ret;
283}
284
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288
289 return __register_perf_hw_breakpoint(bp);
290}
291
292/**
293 * register_user_hw_breakpoint - register a hardware breakpoint for user space
294 * @attr: breakpoint attributes
295 * @triggered: callback to trigger when we hit the breakpoint
296 * @tsk: pointer to 'task_struct' of the process to which the address belongs
297 */
298struct perf_event *
299register_user_hw_breakpoint(struct perf_event_attr *attr,
300 perf_callback_t triggered,
301 struct task_struct *tsk)
302{
303 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
304}
305EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
306
307/**
308 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
309 * @bp: the breakpoint structure to modify
310 * @attr: new breakpoint attributes
311 * @triggered: callback to trigger when we hit the breakpoint
312 * @tsk: pointer to 'task_struct' of the process to which the address belongs
313 */
314struct perf_event *
315modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
316 perf_callback_t triggered,
317 struct task_struct *tsk)
318{
319 /*
320 * FIXME: do it without unregistering
321 * - We don't want to lose our slot
322 * - If the new bp is incorrect, don't lose the older one
323 */
324 unregister_hw_breakpoint(bp);
325
326 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
327}
328EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
329
330/**
331 * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
332 * @bp: the breakpoint structure to unregister
333 */
334void unregister_hw_breakpoint(struct perf_event *bp)
335{
336 if (!bp)
337 return;
338 perf_event_release_kernel(bp);
339}
340EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
341
342/**
343 * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
344 * @attr: breakpoint attributes
345 * @triggered: callback to trigger when we hit the breakpoint
346 *
347 * @return a set of per_cpu pointers to perf events
348 */
349struct perf_event **
350register_wide_hw_breakpoint(struct perf_event_attr *attr,
351 perf_callback_t triggered)
352{
353 struct perf_event **cpu_events, **pevent, *bp;
354 long err;
355 int cpu;
356
357 cpu_events = alloc_percpu(typeof(*cpu_events));
358 if (!cpu_events)
359 return ERR_PTR(-ENOMEM);
360
361 for_each_possible_cpu(cpu) {
362 pevent = per_cpu_ptr(cpu_events, cpu);
363 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
364
365 *pevent = bp;
366
367 if (IS_ERR(bp)) {
368 err = PTR_ERR(bp);
369 goto fail;
370 }
371 }
372
373 return cpu_events;
374
375fail:
376 for_each_possible_cpu(cpu) {
377 pevent = per_cpu_ptr(cpu_events, cpu);
378 if (IS_ERR(*pevent))
379 break;
380 unregister_hw_breakpoint(*pevent);
381 }
382 free_percpu(cpu_events);
383 /* return the error if any */
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
387
388/**
389 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
390 * @cpu_events: the per cpu set of events to unregister
391 */
392void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
393{
394 int cpu;
395 struct perf_event **pevent;
396
397 for_each_possible_cpu(cpu) {
398 pevent = per_cpu_ptr(cpu_events, cpu);
399 unregister_hw_breakpoint(*pevent);
400 }
401 free_percpu(cpu_events);
402}
403EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
404
405static struct notifier_block hw_breakpoint_exceptions_nb = {
406 .notifier_call = hw_breakpoint_exceptions_notify,
407 /* we need to be notified first */
408 .priority = 0x7fffffff
409};
410
411static int __init init_hw_breakpoint(void)
412{
413 return register_die_notifier(&hw_breakpoint_exceptions_nb);
414}
415core_initcall(init_hw_breakpoint);
416
417
418struct pmu perf_ops_bp = {
419 .enable = arch_install_hw_breakpoint,
420 .disable = arch_uninstall_hw_breakpoint,
421 .read = hw_breakpoint_pmu_read,
422 .unthrottle = hw_breakpoint_pmu_unthrottle
423};
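The new file above gives the kernel an in-kernel breakpoint API built on perf events. A minimal usage sketch follows; it is illustrative only and assumes the perf_event_attr breakpoint fields (bp_addr, bp_type, bp_len) and the HW_BREAKPOINT_W / HW_BREAKPOINT_LEN_4 flags provided by the accompanying <linux/hw_breakpoint.h>, a perf_callback_t of the shape used by perf_bp_event() above, and "pid_max" as an arbitrary watched symbol:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/kallsyms.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static struct perf_event **wide_bp;

/* Runs on the cpu whose breakpoint fired (callback shape is an assumption). */
static void wide_bp_handler(struct perf_event *bp, void *data)
{
	printk(KERN_INFO "write to watched symbol at 0x%llx\n",
	       (unsigned long long)bp->attr.bp_addr);
}

static int __init wide_bp_init(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_BREAKPOINT,
		.size		= sizeof(attr),
		.bp_addr	= kallsyms_lookup_name("pid_max"), /* illustrative symbol */
		.bp_type	= HW_BREAKPOINT_W,                 /* assumed flag name */
		.bp_len		= HW_BREAKPOINT_LEN_4,             /* assumed flag name */
	};

	/* One counter per cpu; an ERR_PTR() means no slot could be reserved. */
	wide_bp = register_wide_hw_breakpoint(&attr, wide_bp_handler);
	return IS_ERR(wide_bp) ? PTR_ERR(wide_bp) : 0;
}

static void __exit wide_bp_exit(void)
{
	unregister_wide_hw_breakpoint(wide_bp);
}

module_init(wide_bp_init);
module_exit(wide_bp_exit);
MODULE_LICENSE("GPL");

register_user_hw_breakpoint() works the same way but binds the counter to a given task via tsk->pid instead of registering one counter per cpu, which is why register_wide_hw_breakpoint() hands back a percpu array that unregister_wide_hw_breakpoint() later frees.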
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
181 } 181 }
182 return module_kallsyms_lookup_name(name); 182 return module_kallsyms_lookup_name(name);
183} 183}
184EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
184 185
185int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, 186int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
186 unsigned long), 187 unsigned long),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3256e36ad251..6b7ddba1dd64 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -29,6 +29,7 @@
29#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h> 31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
32 33
33#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
34 35
@@ -245,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
245 put_ctx(ctx); 246 put_ctx(ctx);
246} 247}
247 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for a event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
248/* 292/*
249 * Add a event from the lists for its context. 293 * Add a event from the lists for its context.
250 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
@@ -293,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
293 if (event->group_leader != event) 337 if (event->group_leader != event)
294 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
295 339
340 update_event_times(event);
341
342 /*
343 * If event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
296 /* 352 /*
297 * If this was a group event with sibling events then 353 * If this was a group event with sibling events then
298 * upgrade the siblings to singleton events by adding them 354 * upgrade the siblings to singleton events by adding them
@@ -446,50 +502,11 @@ retry:
446 * can remove the event safely, if the call above did not 502 * can remove the event safely, if the call above did not
447 * succeed. 503 * succeed.
448 */ 504 */
449 if (!list_empty(&event->group_entry)) { 505 if (!list_empty(&event->group_entry))
450 list_del_event(event, ctx); 506 list_del_event(event, ctx);
451 }
452 spin_unlock_irq(&ctx->lock); 507 spin_unlock_irq(&ctx->lock);
453} 508}
454 509
455static inline u64 perf_clock(void)
456{
457 return cpu_clock(smp_processor_id());
458}
459
460/*
461 * Update the record of the current time in a context.
462 */
463static void update_context_time(struct perf_event_context *ctx)
464{
465 u64 now = perf_clock();
466
467 ctx->time += now - ctx->timestamp;
468 ctx->timestamp = now;
469}
470
471/*
472 * Update the total_time_enabled and total_time_running fields for a event.
473 */
474static void update_event_times(struct perf_event *event)
475{
476 struct perf_event_context *ctx = event->ctx;
477 u64 run_end;
478
479 if (event->state < PERF_EVENT_STATE_INACTIVE ||
480 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
481 return;
482
483 event->total_time_enabled = ctx->time - event->tstamp_enabled;
484
485 if (event->state == PERF_EVENT_STATE_INACTIVE)
486 run_end = event->tstamp_stopped;
487 else
488 run_end = ctx->time;
489
490 event->total_time_running = run_end - event->tstamp_running;
491}
492
493/* 510/*
494 * Update total_time_enabled and total_time_running for all events in a group. 511 * Update total_time_enabled and total_time_running for all events in a group.
495 */ 512 */
@@ -1032,10 +1049,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1032 update_context_time(ctx); 1049 update_context_time(ctx);
1033 1050
1034 perf_disable(); 1051 perf_disable();
1035 if (ctx->nr_active) 1052 if (ctx->nr_active) {
1036 list_for_each_entry(event, &ctx->group_list, group_entry) 1053 list_for_each_entry(event, &ctx->group_list, group_entry)
1037 group_sched_out(event, cpuctx, ctx); 1054 group_sched_out(event, cpuctx, ctx);
1038 1055 }
1039 perf_enable(); 1056 perf_enable();
1040 out: 1057 out:
1041 spin_unlock(&ctx->lock); 1058 spin_unlock(&ctx->lock);
@@ -1060,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1060 && !ctx1->pin_count && !ctx2->pin_count; 1077 && !ctx1->pin_count && !ctx2->pin_count;
1061} 1078}
1062 1079
1063static void __perf_event_read(void *event);
1064
1065static void __perf_event_sync_stat(struct perf_event *event, 1080static void __perf_event_sync_stat(struct perf_event *event,
1066 struct perf_event *next_event) 1081 struct perf_event *next_event)
1067{ 1082{
@@ -1079,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1079 */ 1094 */
1080 switch (event->state) { 1095 switch (event->state) {
1081 case PERF_EVENT_STATE_ACTIVE: 1096 case PERF_EVENT_STATE_ACTIVE:
1082 __perf_event_read(event); 1097 event->pmu->read(event);
1083 break; 1098 /* fall-through */
1084 1099
1085 case PERF_EVENT_STATE_INACTIVE: 1100 case PERF_EVENT_STATE_INACTIVE:
1086 update_event_times(event); 1101 update_event_times(event);
@@ -1119,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1119 if (!ctx->nr_stat) 1134 if (!ctx->nr_stat)
1120 return; 1135 return;
1121 1136
1137 update_context_time(ctx);
1138
1122 event = list_first_entry(&ctx->event_list, 1139 event = list_first_entry(&ctx->event_list,
1123 struct perf_event, event_entry); 1140 struct perf_event, event_entry);
1124 1141
@@ -1162,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1162 if (likely(!ctx || !cpuctx->task_ctx)) 1179 if (likely(!ctx || !cpuctx->task_ctx))
1163 return; 1180 return;
1164 1181
1165 update_context_time(ctx);
1166
1167 rcu_read_lock(); 1182 rcu_read_lock();
1168 parent = rcu_dereference(ctx->parent_ctx); 1183 parent = rcu_dereference(ctx->parent_ctx);
1169 next_ctx = next->perf_event_ctxp; 1184 next_ctx = next->perf_event_ctxp;
@@ -1516,7 +1531,6 @@ static void __perf_event_read(void *info)
1516 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1531 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1517 struct perf_event *event = info; 1532 struct perf_event *event = info;
1518 struct perf_event_context *ctx = event->ctx; 1533 struct perf_event_context *ctx = event->ctx;
1519 unsigned long flags;
1520 1534
1521 /* 1535 /*
1522 * If this is a task context, we need to check whether it is 1536 * If this is a task context, we need to check whether it is
@@ -1528,12 +1542,12 @@ static void __perf_event_read(void *info)
1528 if (ctx->task && cpuctx->task_ctx != ctx) 1542 if (ctx->task && cpuctx->task_ctx != ctx)
1529 return; 1543 return;
1530 1544
1531 local_irq_save(flags); 1545 spin_lock(&ctx->lock);
1532 if (ctx->is_active) 1546 update_context_time(ctx);
1533 update_context_time(ctx);
1534 event->pmu->read(event);
1535 update_event_times(event); 1547 update_event_times(event);
1536 local_irq_restore(flags); 1548 spin_unlock(&ctx->lock);
1549
1550 event->pmu->read(event);
1537} 1551}
1538 1552
1539static u64 perf_event_read(struct perf_event *event) 1553static u64 perf_event_read(struct perf_event *event)
@@ -1546,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
1546 smp_call_function_single(event->oncpu, 1560 smp_call_function_single(event->oncpu,
1547 __perf_event_read, event, 1); 1561 __perf_event_read, event, 1);
1548 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1562 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1563 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx);
1549 update_event_times(event); 1568 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags);
1550 } 1570 }
1551 1571
1552 return atomic64_read(&event->count); 1572 return atomic64_read(&event->count);
@@ -1700,16 +1720,10 @@ static void free_event(struct perf_event *event)
1700 call_rcu(&event->rcu_head, free_event_rcu); 1720 call_rcu(&event->rcu_head, free_event_rcu);
1701} 1721}
1702 1722
1703/* 1723int perf_event_release_kernel(struct perf_event *event)
1704 * Called when the last reference to the file is gone.
1705 */
1706static int perf_release(struct inode *inode, struct file *file)
1707{ 1724{
1708 struct perf_event *event = file->private_data;
1709 struct perf_event_context *ctx = event->ctx; 1725 struct perf_event_context *ctx = event->ctx;
1710 1726
1711 file->private_data = NULL;
1712
1713 WARN_ON_ONCE(ctx->parent_ctx); 1727 WARN_ON_ONCE(ctx->parent_ctx);
1714 mutex_lock(&ctx->mutex); 1728 mutex_lock(&ctx->mutex);
1715 perf_event_remove_from_context(event); 1729 perf_event_remove_from_context(event);
@@ -1724,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file)
1724 1738
1725 return 0; 1739 return 0;
1726} 1740}
1741EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1742
1743/*
1744 * Called when the last reference to the file is gone.
1745 */
1746static int perf_release(struct inode *inode, struct file *file)
1747{
1748 struct perf_event *event = file->private_data;
1749
1750 file->private_data = NULL;
1751
1752 return perf_event_release_kernel(event);
1753}
1727 1754
1728static int perf_event_read_size(struct perf_event *event) 1755static int perf_event_read_size(struct perf_event *event)
1729{ 1756{
@@ -1750,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event)
1750 return size; 1777 return size;
1751} 1778}
1752 1779
1753static u64 perf_event_read_value(struct perf_event *event) 1780u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1754{ 1781{
1755 struct perf_event *child; 1782 struct perf_event *child;
1756 u64 total = 0; 1783 u64 total = 0;
1757 1784
1785 *enabled = 0;
1786 *running = 0;
1787
1788 mutex_lock(&event->child_mutex);
1758 total += perf_event_read(event); 1789 total += perf_event_read(event);
1759 list_for_each_entry(child, &event->child_list, child_list) 1790 *enabled += event->total_time_enabled +
1791 atomic64_read(&event->child_total_time_enabled);
1792 *running += event->total_time_running +
1793 atomic64_read(&event->child_total_time_running);
1794
1795 list_for_each_entry(child, &event->child_list, child_list) {
1760 total += perf_event_read(child); 1796 total += perf_event_read(child);
1797 *enabled += child->total_time_enabled;
1798 *running += child->total_time_running;
1799 }
1800 mutex_unlock(&event->child_mutex);
1761 1801
1762 return total; 1802 return total;
1763} 1803}
1764 1804EXPORT_SYMBOL_GPL(perf_event_read_value);
1765static int perf_event_read_entry(struct perf_event *event,
1766 u64 read_format, char __user *buf)
1767{
1768 int n = 0, count = 0;
1769 u64 values[2];
1770
1771 values[n++] = perf_event_read_value(event);
1772 if (read_format & PERF_FORMAT_ID)
1773 values[n++] = primary_event_id(event);
1774
1775 count = n * sizeof(u64);
1776
1777 if (copy_to_user(buf, values, count))
1778 return -EFAULT;
1779
1780 return count;
1781}
1782 1805
1783static int perf_event_read_group(struct perf_event *event, 1806static int perf_event_read_group(struct perf_event *event,
1784 u64 read_format, char __user *buf) 1807 u64 read_format, char __user *buf)
1785{ 1808{
1786 struct perf_event *leader = event->group_leader, *sub; 1809 struct perf_event *leader = event->group_leader, *sub;
1787 int n = 0, size = 0, err = -EFAULT; 1810 int n = 0, size = 0, ret = -EFAULT;
1788 u64 values[3]; 1811 struct perf_event_context *ctx = leader->ctx;
1812 u64 values[5];
1813 u64 count, enabled, running;
1814
1815 mutex_lock(&ctx->mutex);
1816 count = perf_event_read_value(leader, &enabled, &running);
1789 1817
1790 values[n++] = 1 + leader->nr_siblings; 1818 values[n++] = 1 + leader->nr_siblings;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1792 values[n++] = leader->total_time_enabled + 1820 values[n++] = enabled;
1793 atomic64_read(&leader->child_total_time_enabled); 1821 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1794 } 1822 values[n++] = running;
1795 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1823 values[n++] = count;
1796 values[n++] = leader->total_time_running + 1824 if (read_format & PERF_FORMAT_ID)
1797 atomic64_read(&leader->child_total_time_running); 1825 values[n++] = primary_event_id(leader);
1798 }
1799 1826
1800 size = n * sizeof(u64); 1827 size = n * sizeof(u64);
1801 1828
1802 if (copy_to_user(buf, values, size)) 1829 if (copy_to_user(buf, values, size))
1803 return -EFAULT; 1830 goto unlock;
1804
1805 err = perf_event_read_entry(leader, read_format, buf + size);
1806 if (err < 0)
1807 return err;
1808 1831
1809 size += err; 1832 ret = size;
1810 1833
1811 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1812 err = perf_event_read_entry(sub, read_format, 1835 n = 0;
1813 buf + size); 1836
1814 if (err < 0) 1837 values[n++] = perf_event_read_value(sub, &enabled, &running);
1815 return err; 1838 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(sub);
1840
1841 size = n * sizeof(u64);
1842
1843 if (copy_to_user(buf + ret, values, size)) {
1844 ret = -EFAULT;
1845 goto unlock;
1846 }
1816 1847
1817 size += err; 1848 ret += size;
1818 } 1849 }
1850unlock:
1851 mutex_unlock(&ctx->mutex);
1819 1852
1820 return size; 1853 return ret;
1821} 1854}
1822 1855
1823static int perf_event_read_one(struct perf_event *event, 1856static int perf_event_read_one(struct perf_event *event,
1824 u64 read_format, char __user *buf) 1857 u64 read_format, char __user *buf)
1825{ 1858{
1859 u64 enabled, running;
1826 u64 values[4]; 1860 u64 values[4];
1827 int n = 0; 1861 int n = 0;
1828 1862
1829 values[n++] = perf_event_read_value(event); 1863 values[n++] = perf_event_read_value(event, &enabled, &running);
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1864 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1831 values[n++] = event->total_time_enabled + 1865 values[n++] = enabled;
1832 atomic64_read(&event->child_total_time_enabled); 1866 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1833 } 1867 values[n++] = running;
1834 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1835 values[n++] = event->total_time_running +
1836 atomic64_read(&event->child_total_time_running);
1837 }
1838 if (read_format & PERF_FORMAT_ID) 1868 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(event); 1869 values[n++] = primary_event_id(event);
1840 1870
@@ -1865,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1865 return -ENOSPC; 1895 return -ENOSPC;
1866 1896
1867 WARN_ON_ONCE(event->ctx->parent_ctx); 1897 WARN_ON_ONCE(event->ctx->parent_ctx);
1868 mutex_lock(&event->child_mutex);
1869 if (read_format & PERF_FORMAT_GROUP) 1898 if (read_format & PERF_FORMAT_GROUP)
1870 ret = perf_event_read_group(event, read_format, buf); 1899 ret = perf_event_read_group(event, read_format, buf);
1871 else 1900 else
1872 ret = perf_event_read_one(event, read_format, buf); 1901 ret = perf_event_read_one(event, read_format, buf);
1873 mutex_unlock(&event->child_mutex);
1874 1902
1875 return ret; 1903 return ret;
1876} 1904}
@@ -2182,6 +2210,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2182 perf_mmap_free_page((unsigned long)data->user_page); 2210 perf_mmap_free_page((unsigned long)data->user_page);
2183 for (i = 0; i < data->nr_pages; i++) 2211 for (i = 0; i < data->nr_pages; i++)
2184 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2212 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2213 kfree(data);
2185} 2214}
2186 2215
2187#else 2216#else
@@ -2222,6 +2251,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2222 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2251 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2223 2252
2224 vfree(base); 2253 vfree(base);
2254 kfree(data);
2225} 2255}
2226 2256
2227static void perf_mmap_data_free(struct perf_mmap_data *data) 2257static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2315,7 +2345,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2315 } 2345 }
2316 2346
2317 if (!data->watermark) 2347 if (!data->watermark)
2318 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2348 data->watermark = max_size / 2;
2319 2349
2320 2350
2321 rcu_assign_pointer(event->data, data); 2351 rcu_assign_pointer(event->data, data);
@@ -2327,7 +2357,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2327 2357
2328 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2358 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2329 perf_mmap_data_free(data); 2359 perf_mmap_data_free(data);
2330 kfree(data);
2331} 2360}
2332 2361
2333static void perf_mmap_data_release(struct perf_event *event) 2362static void perf_mmap_data_release(struct perf_event *event)
@@ -3245,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3245{ 3274{
3246 struct perf_event *event; 3275 struct perf_event *event;
3247 3276
3248 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3249 return;
3250
3251 rcu_read_lock();
3252 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3277 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3253 if (perf_event_task_match(event)) 3278 if (perf_event_task_match(event))
3254 perf_event_task_output(event, task_event); 3279 perf_event_task_output(event, task_event);
3255 } 3280 }
3256 rcu_read_unlock();
3257} 3281}
3258 3282
3259static void perf_event_task_event(struct perf_task_event *task_event) 3283static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3261,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3261 struct perf_cpu_context *cpuctx; 3285 struct perf_cpu_context *cpuctx;
3262 struct perf_event_context *ctx = task_event->task_ctx; 3286 struct perf_event_context *ctx = task_event->task_ctx;
3263 3287
3288 rcu_read_lock();
3264 cpuctx = &get_cpu_var(perf_cpu_context); 3289 cpuctx = &get_cpu_var(perf_cpu_context);
3265 perf_event_task_ctx(&cpuctx->ctx, task_event); 3290 perf_event_task_ctx(&cpuctx->ctx, task_event);
3266 put_cpu_var(perf_cpu_context); 3291 put_cpu_var(perf_cpu_context);
3267 3292
3268 rcu_read_lock();
3269 if (!ctx) 3293 if (!ctx)
3270 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3271 if (ctx) 3295 if (ctx)
@@ -3357,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3357{ 3381{
3358 struct perf_event *event; 3382 struct perf_event *event;
3359 3383
3360 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3361 return;
3362
3363 rcu_read_lock();
3364 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3384 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3365 if (perf_event_comm_match(event)) 3385 if (perf_event_comm_match(event))
3366 perf_event_comm_output(event, comm_event); 3386 perf_event_comm_output(event, comm_event);
3367 } 3387 }
3368 rcu_read_unlock();
3369} 3388}
3370 3389
3371static void perf_event_comm_event(struct perf_comm_event *comm_event) 3390static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3376,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3376 char comm[TASK_COMM_LEN]; 3395 char comm[TASK_COMM_LEN];
3377 3396
3378 memset(comm, 0, sizeof(comm)); 3397 memset(comm, 0, sizeof(comm));
3379 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3398 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3380 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3399 size = ALIGN(strlen(comm)+1, sizeof(u64));
3381 3400
3382 comm_event->comm = comm; 3401 comm_event->comm = comm;
@@ -3384,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3384 3403
3385 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3404 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3386 3405
3406 rcu_read_lock();
3387 cpuctx = &get_cpu_var(perf_cpu_context); 3407 cpuctx = &get_cpu_var(perf_cpu_context);
3388 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3389 put_cpu_var(perf_cpu_context); 3409 put_cpu_var(perf_cpu_context);
3390 3410
3391 rcu_read_lock();
3392 /* 3411 /*
3393 * doesn't really matter which of the child contexts the 3412 * doesn't really matter which of the child contexts the
3394 * events ends up in. 3413 * events ends up in.
@@ -3481,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3481{ 3500{
3482 struct perf_event *event; 3501 struct perf_event *event;
3483 3502
3484 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3485 return;
3486
3487 rcu_read_lock();
3488 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3503 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3489 if (perf_event_mmap_match(event, mmap_event)) 3504 if (perf_event_mmap_match(event, mmap_event))
3490 perf_event_mmap_output(event, mmap_event); 3505 perf_event_mmap_output(event, mmap_event);
3491 } 3506 }
3492 rcu_read_unlock();
3493} 3507}
3494 3508
3495static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3509static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3545,11 +3559,11 @@ got_name:
3545 3559
3546 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3560 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3547 3561
3562 rcu_read_lock();
3548 cpuctx = &get_cpu_var(perf_cpu_context); 3563 cpuctx = &get_cpu_var(perf_cpu_context);
3549 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3550 put_cpu_var(perf_cpu_context); 3565 put_cpu_var(perf_cpu_context);
3551 3566
3552 rcu_read_lock();
3553 /* 3567 /*
3554 * doesn't really matter which of the child contexts the 3568 * doesn't really matter which of the child contexts the
3555 * events ends up in. 3569 * events ends up in.
@@ -3688,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3688 perf_event_disable(event); 3702 perf_event_disable(event);
3689 } 3703 }
3690 3704
3691 perf_event_output(event, nmi, data, regs); 3705 if (event->overflow_handler)
3706 event->overflow_handler(event, nmi, data, regs);
3707 else
3708 perf_event_output(event, nmi, data, regs);
3709
3692 return ret; 3710 return ret;
3693} 3711}
3694 3712
@@ -3733,16 +3751,16 @@ again:
3733 return nr; 3751 return nr;
3734} 3752}
3735 3753
3736static void perf_swevent_overflow(struct perf_event *event, 3754static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3737 int nmi, struct perf_sample_data *data, 3755 int nmi, struct perf_sample_data *data,
3738 struct pt_regs *regs) 3756 struct pt_regs *regs)
3739{ 3757{
3740 struct hw_perf_event *hwc = &event->hw; 3758 struct hw_perf_event *hwc = &event->hw;
3741 int throttle = 0; 3759 int throttle = 0;
3742 u64 overflow;
3743 3760
3744 data->period = event->hw.last_period; 3761 data->period = event->hw.last_period;
3745 overflow = perf_swevent_set_period(event); 3762 if (!overflow)
3763 overflow = perf_swevent_set_period(event);
3746 3764
3747 if (hwc->interrupts == MAX_INTERRUPTS) 3765 if (hwc->interrupts == MAX_INTERRUPTS)
3748 return; 3766 return;
@@ -3775,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3775 3793
3776 atomic64_add(nr, &event->count); 3794 atomic64_add(nr, &event->count);
3777 3795
3796 if (!regs)
3797 return;
3798
3778 if (!hwc->sample_period) 3799 if (!hwc->sample_period)
3779 return; 3800 return;
3780 3801
3781 if (!regs) 3802 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3803 return perf_swevent_overflow(event, 1, nmi, data, regs);
3804
3805 if (atomic64_add_negative(nr, &hwc->period_left))
3782 return; 3806 return;
3783 3807
3784 if (!atomic64_add_negative(nr, &hwc->period_left)) 3808 perf_swevent_overflow(event, 0, nmi, data, regs);
3785 perf_swevent_overflow(event, nmi, data, regs);
3786} 3809}
3787 3810
3788static int perf_swevent_is_counting(struct perf_event *event) 3811static int perf_swevent_is_counting(struct perf_event *event)
@@ -3818,6 +3841,20 @@ static int perf_swevent_is_counting(struct perf_event *event)
3818static int perf_tp_event_match(struct perf_event *event, 3841static int perf_tp_event_match(struct perf_event *event,
3819 struct perf_sample_data *data); 3842 struct perf_sample_data *data);
3820 3843
3844static int perf_exclude_event(struct perf_event *event,
3845 struct pt_regs *regs)
3846{
3847 if (regs) {
3848 if (event->attr.exclude_user && user_mode(regs))
3849 return 1;
3850
3851 if (event->attr.exclude_kernel && !user_mode(regs))
3852 return 1;
3853 }
3854
3855 return 0;
3856}
3857
3821static int perf_swevent_match(struct perf_event *event, 3858static int perf_swevent_match(struct perf_event *event,
3822 enum perf_type_id type, 3859 enum perf_type_id type,
3823 u32 event_id, 3860 u32 event_id,
@@ -3829,16 +3866,12 @@ static int perf_swevent_match(struct perf_event *event,
3829 3866
3830 if (event->attr.type != type) 3867 if (event->attr.type != type)
3831 return 0; 3868 return 0;
3869
3832 if (event->attr.config != event_id) 3870 if (event->attr.config != event_id)
3833 return 0; 3871 return 0;
3834 3872
3835 if (regs) { 3873 if (perf_exclude_event(event, regs))
3836 if (event->attr.exclude_user && user_mode(regs)) 3874 return 0;
3837 return 0;
3838
3839 if (event->attr.exclude_kernel && !user_mode(regs))
3840 return 0;
3841 }
3842 3875
3843 if (event->attr.type == PERF_TYPE_TRACEPOINT && 3876 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3844 !perf_tp_event_match(event, data)) 3877 !perf_tp_event_match(event, data))
@@ -3855,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3855{ 3888{
3856 struct perf_event *event; 3889 struct perf_event *event;
3857 3890
3858 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3859 return;
3860
3861 rcu_read_lock();
3862 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3891 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3863 if (perf_swevent_match(event, type, event_id, data, regs)) 3892 if (perf_swevent_match(event, type, event_id, data, regs))
3864 perf_swevent_add(event, nr, nmi, data, regs); 3893 perf_swevent_add(event, nr, nmi, data, regs);
3865 } 3894 }
3866 rcu_read_unlock();
3867} 3895}
3868 3896
3869static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3897int perf_swevent_get_recursion_context(void)
3870{ 3898{
3899 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3900 int rctx;
3901
3871 if (in_nmi()) 3902 if (in_nmi())
3872 return &cpuctx->recursion[3]; 3903 rctx = 3;
3904 else if (in_irq())
3905 rctx = 2;
3906 else if (in_softirq())
3907 rctx = 1;
3908 else
3909 rctx = 0;
3873 3910
3874 if (in_irq()) 3911 if (cpuctx->recursion[rctx]) {
3875 return &cpuctx->recursion[2]; 3912 put_cpu_var(perf_cpu_context);
3913 return -1;
3914 }
3876 3915
3877 if (in_softirq()) 3916 cpuctx->recursion[rctx]++;
3878 return &cpuctx->recursion[1]; 3917 barrier();
3879 3918
3880 return &cpuctx->recursion[0]; 3919 return rctx;
3881} 3920}
3921EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3922
3923void perf_swevent_put_recursion_context(int rctx)
3924{
3925 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3926 barrier();
3927 cpuctx->recursion[rctx]--;
3928 put_cpu_var(perf_cpu_context);
3929}
3930EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3882 3931
3883static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3932static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3884 u64 nr, int nmi, 3933 u64 nr, int nmi,
3885 struct perf_sample_data *data, 3934 struct perf_sample_data *data,
3886 struct pt_regs *regs) 3935 struct pt_regs *regs)
3887{ 3936{
3888 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3937 struct perf_cpu_context *cpuctx;
3889 int *recursion = perf_swevent_recursion_context(cpuctx);
3890 struct perf_event_context *ctx; 3938 struct perf_event_context *ctx;
3891 3939
3892 if (*recursion) 3940 cpuctx = &__get_cpu_var(perf_cpu_context);
3893 goto out; 3941 rcu_read_lock();
3894
3895 (*recursion)++;
3896 barrier();
3897
3898 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3942 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3899 nr, nmi, data, regs); 3943 nr, nmi, data, regs);
3900 rcu_read_lock();
3901 /* 3944 /*
3902 * doesn't really matter which of the child contexts the 3945 * doesn't really matter which of the child contexts the
3903 * events ends up in. 3946 * events ends up in.
@@ -3906,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3906 if (ctx) 3949 if (ctx)
3907 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3950 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3908 rcu_read_unlock(); 3951 rcu_read_unlock();
3909
3910 barrier();
3911 (*recursion)--;
3912
3913out:
3914 put_cpu_var(perf_cpu_context);
3915} 3952}
3916 3953
3917void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3954void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3918 struct pt_regs *regs, u64 addr) 3955 struct pt_regs *regs, u64 addr)
3919{ 3956{
3920 struct perf_sample_data data = { 3957 struct perf_sample_data data;
3921 .addr = addr, 3958 int rctx;
3922 };
3923 3959
3924 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3960 rctx = perf_swevent_get_recursion_context();
3925 &data, regs); 3961 if (rctx < 0)
3962 return;
3963
3964 data.addr = addr;
3965 data.raw = NULL;
3966
3967 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3968
3969 perf_swevent_put_recursion_context(rctx);
3926} 3970}
3927 3971
3928static void perf_swevent_read(struct perf_event *event) 3972static void perf_swevent_read(struct perf_event *event)
@@ -3967,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3967 event->pmu->read(event); 4011 event->pmu->read(event);
3968 4012
3969 data.addr = 0; 4013 data.addr = 0;
4014 data.period = event->hw.last_period;
3970 regs = get_irq_regs(); 4015 regs = get_irq_regs();
3971 /* 4016 /*
3972 * In case we exclude kernel IPs or are somehow not in interrupt 4017 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4145,6 +4190,7 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4145 if (!regs) 4190 if (!regs)
4146 regs = task_pt_regs(current); 4191 regs = task_pt_regs(current);
4147 4192
4193 /* Trace events already protected against recursion */
4148 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4194 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4149 &data, regs); 4195 &data, regs);
4150} 4196}
@@ -4231,6 +4277,53 @@ static void perf_event_free_filter(struct perf_event *event)
4231 4277
4232#endif /* CONFIG_EVENT_PROFILE */ 4278#endif /* CONFIG_EVENT_PROFILE */
4233 4279
4280#ifdef CONFIG_HAVE_HW_BREAKPOINT
4281static void bp_perf_event_destroy(struct perf_event *event)
4282{
4283 release_bp_slot(event);
4284}
4285
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{
4288 int err;
4289 /*
4290 * The breakpoint is already filled if we haven't created the counter
4291 * through perf syscall
4292 * FIXME: manage to get trigerred to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err)
4299 return ERR_PTR(err);
4300
4301 bp->destroy = bp_perf_event_destroy;
4302
4303 return &perf_ops_bp;
4304}
4305
4306void perf_bp_event(struct perf_event *bp, void *data)
4307{
4308 struct perf_sample_data sample;
4309 struct pt_regs *regs = data;
4310
4311 sample.addr = bp->attr.bp_addr;
4312
4313 if (!perf_exclude_event(bp, regs))
4314 perf_swevent_add(bp, 1, 1, &sample, regs);
4315}
4316#else
4317static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4318{
4319 return NULL;
4320}
4321
4322void perf_bp_event(struct perf_event *bp, void *regs)
4323{
4324}
4325#endif
4326
4234atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4327atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4235 4328
4236static void sw_perf_event_destroy(struct perf_event *event) 4329static void sw_perf_event_destroy(struct perf_event *event)
@@ -4297,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4297 struct perf_event_context *ctx, 4390 struct perf_event_context *ctx,
4298 struct perf_event *group_leader, 4391 struct perf_event *group_leader,
4299 struct perf_event *parent_event, 4392 struct perf_event *parent_event,
4393 perf_callback_t callback,
4300 gfp_t gfpflags) 4394 gfp_t gfpflags)
4301{ 4395{
4302 const struct pmu *pmu; 4396 const struct pmu *pmu;
@@ -4339,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4339 4433
4340 event->state = PERF_EVENT_STATE_INACTIVE; 4434 event->state = PERF_EVENT_STATE_INACTIVE;
4341 4435
4436 if (!callback && parent_event)
4437 callback = parent_event->callback;
4438
4439 event->callback = callback;
4440
4342 if (attr->disabled) 4441 if (attr->disabled)
4343 event->state = PERF_EVENT_STATE_OFF; 4442 event->state = PERF_EVENT_STATE_OFF;
4344 4443
@@ -4373,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4373 pmu = tp_perf_event_init(event); 4472 pmu = tp_perf_event_init(event);
4374 break; 4473 break;
4375 4474
4475 case PERF_TYPE_BREAKPOINT:
4476 pmu = bp_perf_event_init(event);
4477 break;
4478
4479
4376 default: 4480 default:
4377 break; 4481 break;
4378 } 4482 }
@@ -4615,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open,
4615 } 4719 }
4616 4720
4617 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4721 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4618 NULL, GFP_KERNEL); 4722 NULL, NULL, GFP_KERNEL);
4619 err = PTR_ERR(event); 4723 err = PTR_ERR(event);
4620 if (IS_ERR(event)) 4724 if (IS_ERR(event))
4621 goto err_put_context; 4725 goto err_put_context;
@@ -4663,6 +4767,60 @@ err_put_context:
4663 return err; 4767 return err;
4664} 4768}
4665 4769
4770/**
4771 * perf_event_create_kernel_counter
4772 *
4773 * @attr: attributes of the counter to create
4774 * @cpu: cpu to which the counter is bound
4775 * @pid: task to profile
4776 */
4777struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback)
4780{
4781 struct perf_event *event;
4782 struct perf_event_context *ctx;
4783 int err;
4784
4785 /*
4786 * Get the target context (task or percpu):
4787 */
4788
4789 ctx = find_get_context(pid, cpu);
4790 if (IS_ERR(ctx)) {
4791 err = PTR_ERR(ctx);
4792 goto err_exit;
4793 }
4794
4795 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL);
4797 if (IS_ERR(event)) {
4798 err = PTR_ERR(event);
4799 goto err_put_context;
4800 }
4801
4802 event->filp = NULL;
4803 WARN_ON_ONCE(ctx->parent_ctx);
4804 mutex_lock(&ctx->mutex);
4805 perf_install_in_context(ctx, event, cpu);
4806 ++ctx->generation;
4807 mutex_unlock(&ctx->mutex);
4808
4809 event->owner = current;
4810 get_task_struct(current);
4811 mutex_lock(&current->perf_event_mutex);
4812 list_add_tail(&event->owner_entry, &current->perf_event_list);
4813 mutex_unlock(&current->perf_event_mutex);
4814
4815 return event;
4816
4817 err_put_context:
4818 put_ctx(ctx);
4819 err_exit:
4820 return ERR_PTR(err);
4821}
4822EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4823
4666/* 4824/*
4667 * inherit a event from parent task to child task: 4825 * inherit a event from parent task to child task:
4668 */ 4826 */
@@ -4688,7 +4846,7 @@ inherit_event(struct perf_event *parent_event,
4688 child_event = perf_event_alloc(&parent_event->attr, 4846 child_event = perf_event_alloc(&parent_event->attr,
4689 parent_event->cpu, child_ctx, 4847 parent_event->cpu, child_ctx,
4690 group_leader, parent_event, 4848 group_leader, parent_event,
4691 GFP_KERNEL); 4849 NULL, GFP_KERNEL);
4692 if (IS_ERR(child_event)) 4850 if (IS_ERR(child_event))
4693 return child_event; 4851 return child_event;
4694 get_ctx(child_ctx); 4852 get_ctx(child_ctx);
@@ -4706,6 +4864,8 @@ inherit_event(struct perf_event *parent_event,
4706 if (parent_event->attr.freq) 4864 if (parent_event->attr.freq)
4707 child_event->hw.sample_period = parent_event->hw.sample_period; 4865 child_event->hw.sample_period = parent_event->hw.sample_period;
4708 4866
4867 child_event->overflow_handler = parent_event->overflow_handler;
4868
4709 /* 4869 /*
4710 * Link it up in the child's context: 4870 * Link it up in the child's context:
4711 */ 4871 */
@@ -4795,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4795{ 4955{
4796 struct perf_event *parent_event; 4956 struct perf_event *parent_event;
4797 4957
4798 update_event_times(child_event);
4799 perf_event_remove_from_context(child_event); 4958 perf_event_remove_from_context(child_event);
4800 4959
4801 parent_event = child_event->parent; 4960 parent_event = child_event->parent;
@@ -4847,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child)
4847 * the events from it. 5006 * the events from it.
4848 */ 5007 */
4849 unclone_ctx(child_ctx); 5008 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx);
4850 spin_unlock_irqrestore(&child_ctx->lock, flags); 5010 spin_unlock_irqrestore(&child_ctx->lock, flags);
4851 5011
4852 /* 5012 /*
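Beyond the breakpoint hooks, perf_event.c now exports an in-kernel counter API: perf_event_create_kernel_counter(), perf_event_read_value() and perf_event_release_kernel(). A hedged sketch of a caller is below; the helper names are hypothetical, the attribute values are ordinary perf_event_attr fields, and whether a per-cpu counter (pid == -1, cpu >= 0) is accepted still depends on find_get_context()'s unchanged permission checks:

#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *cycles_event;

/* Hypothetical helper: count CPU cycles on cpu 0 from kernel code. */
static int start_cycle_counter(void)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(attr),
		.pinned	= 1,
	};

	/* cpu = 0, pid = -1: per-cpu counter; no overflow callback needed. */
	cycles_event = perf_event_create_kernel_counter(&attr, 0, -1, NULL);
	return IS_ERR(cycles_event) ? PTR_ERR(cycles_event) : 0;
}

static u64 read_cycle_counter(void)
{
	u64 enabled, running;

	/* Also folds in child counters and the enabled/running times. */
	return perf_event_read_value(cycles_event, &enabled, &running);
}

static void stop_cycle_counter(void)
{
	perf_event_release_kernel(cycles_event);
}

When an overflow_handler is set, __perf_event_overflow() above now invokes it instead of perf's default output path, which is what lets in-kernel users consume samples directly.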
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784fd..93e72e5feae6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,8 @@
27#include <linux/freezer.h> 27#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/events/sched.h> 30#define CREATE_TRACE_POINTS
31#include <trace/events/signal.h>
31 32
32#include <asm/param.h> 33#include <asm/param.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
@@ -834,7 +835,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 struct sigqueue *q; 835 struct sigqueue *q;
835 int override_rlimit; 836 int override_rlimit;
836 837
837 trace_sched_signal_send(sig, t); 838 trace_signal_generate(sig, info, t);
838 839
839 assert_spin_locked(&t->sighand->siglock); 840 assert_spin_locked(&t->sighand->siglock);
840 841
@@ -896,12 +897,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
896 break; 897 break;
897 } 898 }
898 } else if (!is_si_special(info)) { 899 } else if (!is_si_special(info)) {
899 if (sig >= SIGRTMIN && info->si_code != SI_USER) 900 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
900 /* 901 /*
901 * Queue overflow, abort. We may abort if the signal was rt 902 * Queue overflow, abort. We may abort if the
902 * and sent by user using something other than kill(). 903 * signal was rt and sent by user using something
903 */ 904 * other than kill().
905 */
906 trace_signal_overflow_fail(sig, group, info);
904 return -EAGAIN; 907 return -EAGAIN;
908 } else {
909 /*
910 * This is a silent loss of information. We still
911 * send the signal, but the *info bits are lost.
912 */
913 trace_signal_lose_info(sig, group, info);
914 }
905 } 915 }
906 916
907out_set: 917out_set:
@@ -1839,6 +1849,9 @@ relock:
1839 ka = &sighand->action[signr-1]; 1849 ka = &sighand->action[signr-1];
1840 } 1850 }
1841 1851
1852 /* Trace actually delivered signals. */
1853 trace_signal_deliver(signr, info, ka);
1854
1842 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1855 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1843 continue; 1856 continue;
1844 if (ka->sa.sa_handler != SIG_DFL) { 1857 if (ka->sa.sa_handler != SIG_DFL) {
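The signal.c hunks replace the old sched_signal_send tracepoint with the richer signal_generate / signal_deliver / signal_overflow_fail / signal_lose_info events from <trace/events/signal.h>. A sketch of an in-kernel probe is below; it assumes the tracepoint prototype mirrors the trace_signal_generate(sig, info, t) call site shown above and that the usual register_trace_<name>() helpers are generated for these events:

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/signal.h>

/* Prototype assumed from the trace_signal_generate(sig, info, t) call site. */
static void probe_signal_generate(int sig, struct siginfo *info,
				  struct task_struct *task)
{
	if (sig == SIGKILL)
		printk(KERN_INFO "SIGKILL queued for %s[%d]\n",
		       task->comm, task->pid);
}

static int __init signal_probe_init(void)
{
	return register_trace_signal_generate(probe_signal_generate);
}

static void __exit signal_probe_exit(void)
{
	unregister_trace_signal_generate(probe_signal_generate);
	tracepoint_synchronize_unregister();
}

module_init(signal_probe_init);
module_exit(signal_probe_exit);
MODULE_LICENSE("GPL");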
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f05671609a89..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
348 symbol, i.e. one listed in /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses on variables watched through the
355	  ksym tracer ftrace plugin. Depending upon the hardware, all read
356	  and write operations on the watched kernel variables can be
357	  monitored and counted.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
342 363
343config STACK_TRACER 364config STACK_TRACER
344 bool "Trace max stack" 365 bool "Trace max stack"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index edc3a3cca1a1..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
57obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
58 59
59libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b4e4212e66d7..1d7f4830a80d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,7 +100,7 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
104struct kprobe_trace_entry { 106struct kprobe_trace_entry {
@@ -232,6 +234,7 @@ extern void __ftrace_bad_type(void);
232 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
233 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
234 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
235 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
236 } while (0) 239 } while (0)
237 240
@@ -387,6 +390,8 @@ int register_tracer(struct tracer *type);
387void unregister_tracer(struct tracer *type); 390void unregister_tracer(struct tracer *type);
388int is_tracing_stopped(void); 391int is_tracing_stopped(void);
389 392
393extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
394
390extern unsigned long nsecs_to_usecs(unsigned long nsecs); 395extern unsigned long nsecs_to_usecs(unsigned long nsecs);
391 396
392#ifdef CONFIG_TRACER_MAX_TRACE 397#ifdef CONFIG_TRACER_MAX_TRACE
@@ -461,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
461 struct trace_array *tr); 466 struct trace_array *tr);
462extern int trace_selftest_startup_hw_branches(struct tracer *trace, 467extern int trace_selftest_startup_hw_branches(struct tracer *trace,
463 struct trace_array *tr); 468 struct trace_array *tr);
469extern int trace_selftest_startup_ksym(struct tracer *trace,
470 struct trace_array *tr);
464#endif /* CONFIG_FTRACE_STARTUP_TEST */ 471#endif /* CONFIG_FTRACE_STARTUP_TEST */
465 472
466extern void *head_page(struct trace_array_cpu *data); 473extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
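
For readers who do not expand FTRACE_ENTRY() mentally: the ksym_trace entry declared above corresponds roughly to the following struct layout, a sketch of the generated type with struct trace_entry as the common header the macro always prepends.

	#include <linux/sched.h>	/* TASK_COMM_LEN */

	/* Approximate expansion of FTRACE_ENTRY(ksym_trace, ksym_trace_entry, ...) */
	struct ksym_trace_entry {
		struct trace_entry	ent;			/* common header added by the macro */
		unsigned long		ip;			/* instruction pointer that touched the symbol */
		unsigned char		type;			/* HW_BREAKPOINT_R/W access type */
		char			cmd[TASK_COMM_LEN];	/* command name of the task */
		unsigned long		addr;			/* address of the watched symbol */
	};
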
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index e0d351b01f5a..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -9,31 +9,33 @@
9#include "trace.h" 9#include "trace.h"
10 10
11 11
12struct perf_trace_buf *perf_trace_buf; 12char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14 14
15struct perf_trace_buf *perf_trace_buf_nmi; 15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); 16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 17
 18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t;
19
18/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
19static int total_profile_count; 21static int total_profile_count;
20 22
21static int ftrace_profile_enable_event(struct ftrace_event_call *event) 23static int ftrace_profile_enable_event(struct ftrace_event_call *event)
22{ 24{
23 struct perf_trace_buf *buf; 25 char *buf;
24 int ret = -ENOMEM; 26 int ret = -ENOMEM;
25 27
26 if (atomic_inc_return(&event->profile_count)) 28 if (atomic_inc_return(&event->profile_count))
27 return 0; 29 return 0;
28 30
29 if (!total_profile_count) { 31 if (!total_profile_count) {
30 buf = alloc_percpu(struct perf_trace_buf); 32 buf = (char *)alloc_percpu(perf_trace_t);
31 if (!buf) 33 if (!buf)
32 goto fail_buf; 34 goto fail_buf;
33 35
34 rcu_assign_pointer(perf_trace_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
35 37
36 buf = alloc_percpu(struct perf_trace_buf); 38 buf = (char *)alloc_percpu(perf_trace_t);
37 if (!buf) 39 if (!buf)
38 goto fail_buf_nmi; 40 goto fail_buf_nmi;
39 41
@@ -79,7 +81,7 @@ int ftrace_profile_enable(int event_id)
79 81
80static void ftrace_profile_disable_event(struct ftrace_event_call *event) 82static void ftrace_profile_disable_event(struct ftrace_event_call *event)
81{ 83{
82 struct perf_trace_buf *buf, *nmi_buf; 84 char *buf, *nmi_buf;
83 85
84 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (!atomic_add_negative(-1, &event->profile_count))
85 return; 87 return;
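
The hunk above drops the old struct perf_trace_buf wrapper and allocates a flat per-CPU byte buffer instead; the perf_trace_t typedef exists only because alloc_percpu() wants a named type and "char[FTRACE_MAX_PROFILE_SIZE]" cannot be passed to it directly. A standalone sketch of that idiom, with the buffer size and names as illustrative stand-ins:

	#include <linux/percpu.h>
	#include <linux/smp.h>
	#include <linux/errno.h>

	#define SAMPLE_BUF_SIZE	2048	/* stand-in for FTRACE_MAX_PROFILE_SIZE */

	/* alloc_percpu() takes a type, so name the array type first. */
	typedef typeof(char [SAMPLE_BUF_SIZE]) sample_buf_t;

	static char *sample_buf;

	static int sample_buf_init(void)
	{
		sample_buf = (char *)alloc_percpu(sample_buf_t);
		if (!sample_buf)
			return -ENOMEM;
		return 0;
	}

	static char *sample_buf_this_cpu(void)
	{
		/* caller must have irqs disabled, as the profile handlers above do */
		return per_cpu_ptr(sample_buf, smp_processor_id());
	}
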
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3696476f307d..aff5f80b59b8 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -243,7 +243,11 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
243 ret = snprintf(buf, n, "@0x%p", ff->data); 243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) { 244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data; 245 struct symbol_cache *sc = ff->data;
246 ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); 246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
247 } else if (ff->func == fetch_retvalue) 251 } else if (ff->func == fetch_retvalue)
248 ret = snprintf(buf, n, "$retval"); 252 ret = snprintf(buf, n, "$retval");
249 else if (ff->func == fetch_stack_address) 253 else if (ff->func == fetch_stack_address)
@@ -479,7 +483,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
479 return ret; 483 return ret;
480} 484}
481 485
482static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 486/* Recursive argument parser */
487static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
483{ 488{
484 int ret = 0; 489 int ret = 0;
485 unsigned long param; 490 unsigned long param;
@@ -539,7 +544,7 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
539 if (!id) 544 if (!id)
540 return -ENOMEM; 545 return -ENOMEM;
541 id->offset = offset; 546 id->offset = offset;
542 ret = parse_probe_arg(arg, &id->orig, is_return); 547 ret = __parse_probe_arg(arg, &id->orig, is_return);
543 if (ret) 548 if (ret)
544 kfree(id); 549 kfree(id);
545 else { 550 else {
@@ -556,6 +561,16 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
556 return ret; 561 return ret;
557} 562}
558 563
564/* String length checking wrapper */
565static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
566{
567 if (strlen(arg) > MAX_ARGSTR_LEN) {
568 pr_info("Argument is too long.: %s\n", arg);
569 return -ENOSPC;
570 }
571 return __parse_probe_arg(arg, ff, is_return);
572}
573
559/* Return 1 if name is reserved or already used by another argument */ 574/* Return 1 if name is reserved or already used by another argument */
560static int conflict_field_name(const char *name, 575static int conflict_field_name(const char *name,
561 struct probe_arg *args, int narg) 576 struct probe_arg *args, int narg)
@@ -694,20 +709,23 @@ static int create_trace_probe(int argc, char **argv)
694 } 709 }
695 710
696 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 711 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
697 712 if (!tp->args[i].name) {
698 /* Parse fetch argument */ 713 pr_info("Failed to allocate argument%d name '%s'.\n",
699 if (strlen(arg) > MAX_ARGSTR_LEN) { 714 i, argv[i]);
700 pr_info("Argument%d(%s) is too long.\n", i, arg); 715 ret = -ENOMEM;
701 ret = -ENOSPC;
702 goto error; 716 goto error;
703 } 717 }
718
719 /* Parse fetch argument */
704 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 720 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
705 if (ret) { 721 if (ret) {
706 pr_info("Parse error at argument%d. (%d)\n", i, ret); 722 pr_info("Parse error at argument%d. (%d)\n", i, ret);
723 kfree(tp->args[i].name);
707 goto error; 724 goto error;
708 } 725 }
726
727 tp->nr_args++;
709 } 728 }
710 tp->nr_args = i;
711 729
712 ret = register_trace_probe(tp); 730 ret = register_trace_probe(tp);
713 if (ret) 731 if (ret)
@@ -758,12 +776,14 @@ static int probes_seq_show(struct seq_file *m, void *v)
758 char buf[MAX_ARGSTR_LEN + 1]; 776 char buf[MAX_ARGSTR_LEN + 1];
759 777
760 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 778 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
761 seq_printf(m, ":%s", tp->call.name); 779 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
762 780
763 if (tp->symbol) 781 if (!tp->symbol)
782 seq_printf(m, " 0x%p", tp->rp.kp.addr);
783 else if (tp->rp.kp.offset)
764 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 784 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
765 else 785 else
766 seq_printf(m, " 0x%p", tp->rp.kp.addr); 786 seq_printf(m, " %s", probe_symbol(tp));
767 787
768 for (i = 0; i < tp->nr_args; i++) { 788 for (i = 0; i < tp->nr_args; i++) {
769 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 789 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
@@ -1208,11 +1228,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1208 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1228 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1209 struct ftrace_event_call *call = &tp->call; 1229 struct ftrace_event_call *call = &tp->call;
1210 struct kprobe_trace_entry *entry; 1230 struct kprobe_trace_entry *entry;
1211 struct perf_trace_buf *trace_buf;
1212 struct trace_entry *ent; 1231 struct trace_entry *ent;
1213 int size, __size, i, pc, __cpu; 1232 int size, __size, i, pc, __cpu;
1214 unsigned long irq_flags; 1233 unsigned long irq_flags;
1234 char *trace_buf;
1215 char *raw_data; 1235 char *raw_data;
1236 int rctx;
1216 1237
1217 pc = preempt_count(); 1238 pc = preempt_count();
1218 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1239 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1227,6 +1248,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1227 * This also protects the rcu read side 1248 * This also protects the rcu read side
1228 */ 1249 */
1229 local_irq_save(irq_flags); 1250 local_irq_save(irq_flags);
1251
1252 rctx = perf_swevent_get_recursion_context();
1253 if (rctx < 0)
1254 goto end_recursion;
1255
1230 __cpu = smp_processor_id(); 1256 __cpu = smp_processor_id();
1231 1257
1232 if (in_nmi()) 1258 if (in_nmi())
@@ -1237,18 +1263,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1237 if (!trace_buf) 1263 if (!trace_buf)
1238 goto end; 1264 goto end;
1239 1265
1240 trace_buf = per_cpu_ptr(trace_buf, __cpu); 1266 raw_data = per_cpu_ptr(trace_buf, __cpu);
1241
1242 if (trace_buf->recursion++)
1243 goto end_recursion;
1244
1245 /*
1246 * Make recursion update visible before entering perf_tp_event
1247 * so that we protect from perf recursions.
1248 */
1249 barrier();
1250
1251 raw_data = trace_buf->buf;
1252 1267
1253 /* Zero dead bytes from alignment to avoid buffer leak to userspace */ 1268 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1254 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 1269 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1263,9 +1278,9 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1263 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1278 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1264 perf_tp_event(call->id, entry->ip, 1, entry, size); 1279 perf_tp_event(call->id, entry->ip, 1, entry, size);
1265 1280
1266end_recursion:
1267 trace_buf->recursion--;
1268end: 1281end:
1282 perf_swevent_put_recursion_context(rctx);
1283end_recursion:
1269 local_irq_restore(irq_flags); 1284 local_irq_restore(irq_flags);
1270 1285
1271 return 0; 1286 return 0;
@@ -1278,11 +1293,12 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1278 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1293 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1279 struct ftrace_event_call *call = &tp->call; 1294 struct ftrace_event_call *call = &tp->call;
1280 struct kretprobe_trace_entry *entry; 1295 struct kretprobe_trace_entry *entry;
1281 struct perf_trace_buf *trace_buf;
1282 struct trace_entry *ent; 1296 struct trace_entry *ent;
1283 int size, __size, i, pc, __cpu; 1297 int size, __size, i, pc, __cpu;
1284 unsigned long irq_flags; 1298 unsigned long irq_flags;
1299 char *trace_buf;
1285 char *raw_data; 1300 char *raw_data;
1301 int rctx;
1286 1302
1287 pc = preempt_count(); 1303 pc = preempt_count();
1288 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1304 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1297,6 +1313,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1297 * This also protects the rcu read side 1313 * This also protects the rcu read side
1298 */ 1314 */
1299 local_irq_save(irq_flags); 1315 local_irq_save(irq_flags);
1316
1317 rctx = perf_swevent_get_recursion_context();
1318 if (rctx < 0)
1319 goto end_recursion;
1320
1300 __cpu = smp_processor_id(); 1321 __cpu = smp_processor_id();
1301 1322
1302 if (in_nmi()) 1323 if (in_nmi())
@@ -1307,18 +1328,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1307 if (!trace_buf) 1328 if (!trace_buf)
1308 goto end; 1329 goto end;
1309 1330
1310 trace_buf = per_cpu_ptr(trace_buf, __cpu); 1331 raw_data = per_cpu_ptr(trace_buf, __cpu);
1311
1312 if (trace_buf->recursion++)
1313 goto end_recursion;
1314
1315 /*
1316 * Make recursion update visible before entering perf_tp_event
1317 * so that we protect from perf recursions.
1318 */
1319 barrier();
1320
1321 raw_data = trace_buf->buf;
1322 1332
1323 /* Zero dead bytes from alignment to avoid buffer leak to userspace */ 1333 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1324 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 1334 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1334,9 +1344,9 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1334 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1344 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1335 perf_tp_event(call->id, entry->ret_ip, 1, entry, size); 1345 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1336 1346
1337end_recursion:
1338 trace_buf->recursion--;
1339end: 1347end:
1348 perf_swevent_put_recursion_context(rctx);
1349end_recursion:
1340 local_irq_restore(irq_flags); 1350 local_irq_restore(irq_flags);
1341 1351
1342 return 0; 1352 return 0;
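
Both kprobe profile handlers above replace the open-coded trace_buf->recursion counter with the new perf_swevent_get/put_recursion_context() helpers. The bracketing they rely on looks like the following sketch; the function name and the elided buffer handling are illustrative.

	#include <linux/perf_event.h>
	#include <linux/irqflags.h>

	/* Sketch of the recursion bracket used by kprobe_profile_func() above. */
	static void sample_profile_path(void)
	{
		unsigned long flags;
		int rctx;

		local_irq_save(flags);		/* also protects the RCU read side */

		rctx = perf_swevent_get_recursion_context();
		if (rctx < 0)			/* already inside a swevent at this level */
			goto out;

		/* ... look up the per-CPU buffer, build the record, perf_tp_event() ... */

		perf_swevent_put_recursion_context(rctx);
	out:
		local_irq_restore(flags);
	}
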
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..ddfa0fd43bc0
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,550 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
36 * For now, let us restrict the number of symbols traced simultaneously to
37 * the number of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter;
48#endif
49 struct hlist_node ksym_hlist;
50};
51
52static struct trace_array *ksym_trace_array;
53
54static unsigned int ksym_filter_entry_count;
55static unsigned int ksym_tracing_enabled;
56
57static HLIST_HEAD(ksym_filter_head);
58
59static DEFINE_MUTEX(ksym_tracer_mutex);
60
61#ifdef CONFIG_PROFILE_KSYM_TRACER
62
63#define MAX_UL_INT 0xffffffff
64
65void ksym_collect_stats(unsigned long hbp_hit_addr)
66{
67 struct hlist_node *node;
68 struct trace_ksym *entry;
69
70 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) &&
73 (entry->counter <= MAX_UL_INT)) {
74 entry->counter++;
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, void *data)
83{
84 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer;
88 int pc;
89
90 if (!ksym_tracing_enabled)
91 return;
92
93 buffer = ksym_trace_array->buffer;
94
95 pc = preempt_count();
96
97 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
98 sizeof(*entry), 0, pc);
99 if (!event)
100 return;
101
102 entry = ring_buffer_event_data(event);
103 entry->ip = instruction_pointer(regs);
104 entry->type = hw_breakpoint_type(hbp);
105 entry->addr = hw_breakpoint_addr(hbp);
106 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
107
108#ifdef CONFIG_PROFILE_KSYM_TRACER
109 ksym_collect_stats(hw_breakpoint_addr(hbp));
110#endif /* CONFIG_PROFILE_KSYM_TRACER */
111
112 trace_buffer_unlock_commit(buffer, event, 0, pc);
113}
114
115/* Valid access types are represented as
116 *
117 * rw- : Set Read/Write Access Breakpoint
118 * -w- : Set Write Access Breakpoint
119 * --- : Clear Breakpoints
120 * --x : Set Execution Break points (Not available yet)
121 *
122 */
123static int ksym_trace_get_access_type(char *str)
124{
125 int access = 0;
126
127 if (str[0] == 'r')
128 access |= HW_BREAKPOINT_R;
129
130 if (str[1] == 'w')
131 access |= HW_BREAKPOINT_W;
132
133 if (str[2] == 'x')
134 access |= HW_BREAKPOINT_X;
135
136 switch (access) {
137 case HW_BREAKPOINT_R:
138 case HW_BREAKPOINT_W:
139 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
140 return access;
141 default:
142 return -EINVAL;
143 }
144}
145
146/*
147 * There can be several possible malformed requests and we attempt to capture
148 * all of them. We enumerate some of the rules
149 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
150 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
151 * <module>:<ksym_name>:<op>.
152 * 2. No delimiter symbol ':' in the input string
153 * 3. Spurious operator symbols or symbols not in their respective positions
154 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
155 * 5. Kernel symbol not a part of /proc/kallsyms
156 * 6. Duplicate requests
157 */
158static int parse_ksym_trace_str(char *input_string, char **ksymname,
159 unsigned long *addr)
160{
161 int ret;
162
163 *ksymname = strsep(&input_string, ":");
164 *addr = kallsyms_lookup_name(*ksymname);
165
166 /* Check for malformed request: (2), (1) and (5) */
167 if ((!input_string) ||
168 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
169 (*addr == 0))
170		return -EINVAL;
171
172 ret = ksym_trace_get_access_type(input_string);
173
174 return ret;
175}
176
177int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
178{
179 struct trace_ksym *entry;
180 int ret = -ENOMEM;
181
182 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
183 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
184 " new requests for tracing can be accepted now.\n",
185 KSYM_TRACER_MAX);
186 return -ENOSPC;
187 }
188
189 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
190 if (!entry)
191 return -ENOMEM;
192
193 hw_breakpoint_init(&entry->attr);
194
195 entry->attr.bp_type = op;
196 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler);
202
203 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again"
206 " later!!\n");
207 goto err;
208 }
209
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212
213 return 0;
214
215err:
216 kfree(entry);
217
218 return ret;
219}
220
221static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
222 size_t count, loff_t *ppos)
223{
224 struct trace_ksym *entry;
225 struct hlist_node *node;
226 struct trace_seq *s;
227 ssize_t cnt = 0;
228 int ret;
229
230 s = kmalloc(sizeof(*s), GFP_KERNEL);
231 if (!s)
232 return -ENOMEM;
233 trace_seq_init(s);
234
235 mutex_lock(&ksym_tracer_mutex);
236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
242 ret = trace_seq_puts(s, "-w-\n");
243 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
244 ret = trace_seq_puts(s, "rw-\n");
245 WARN_ON_ONCE(!ret);
246 }
247
248 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
249
250 mutex_unlock(&ksym_tracer_mutex);
251
252 kfree(s);
253
254 return cnt;
255}
256
257static void __ksym_trace_reset(void)
258{
259 struct trace_ksym *entry;
260 struct hlist_node *node, *node1;
261
262 mutex_lock(&ksym_tracer_mutex);
263 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
264 ksym_hlist) {
265 unregister_wide_hw_breakpoint(entry->ksym_hbp);
266 ksym_filter_entry_count--;
267 hlist_del_rcu(&(entry->ksym_hlist));
268 synchronize_rcu();
269 kfree(entry);
270 }
271 mutex_unlock(&ksym_tracer_mutex);
272}
273
274static ssize_t ksym_trace_filter_write(struct file *file,
275 const char __user *buffer,
276 size_t count, loff_t *ppos)
277{
278 struct trace_ksym *entry;
279 struct hlist_node *node;
280 char *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0;
283
284 input_string = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string)
286 return -ENOMEM;
287
288 if (copy_from_user(input_string, buffer, count)) {
289 kfree(input_string);
290 return -EFAULT;
291 }
292 input_string[count] = '\0';
293
294 strstrip(input_string);
295
296 /*
297 * Clear all breakpoints if:
298 * 1: echo > ksym_trace_filter
299 * 2: echo 0 > ksym_trace_filter
300 * 3: echo "*:---" > ksym_trace_filter
301 */
302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset();
305 kfree(input_string);
306 return count;
307 }
308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) {
311 kfree(input_string);
312 return ret;
313 }
314
315 mutex_lock(&ksym_tracer_mutex);
316
317 ret = -EINVAL;
318 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
319 if (entry->attr.bp_addr == ksym_addr) {
320 /* Check for malformed request: (6) */
321 if (entry->attr.bp_type != op)
322 changed = 1;
323 else
324 goto out;
325 break;
326 }
327 }
328 if (changed) {
329 unregister_wide_hw_breakpoint(entry->ksym_hbp);
330 entry->attr.bp_type = op;
331 ret = 0;
332 if (op > 0) {
333 entry->ksym_hbp =
334 register_wide_hw_breakpoint(&entry->attr,
335 ksym_hbp_handler);
336 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp);
338 else
339 goto out;
340 }
341 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu();
345 kfree(entry);
346 goto out;
347 } else {
348 /* Check for malformed request: (4) */
349 if (op == 0)
350 goto out;
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 }
353out:
354 mutex_unlock(&ksym_tracer_mutex);
355
356 kfree(input_string);
357
358 if (!ret)
359 ret = count;
360 return ret;
361}
362
363static const struct file_operations ksym_tracing_fops = {
364 .open = tracing_open_generic,
365 .read = ksym_trace_filter_read,
366 .write = ksym_trace_filter_write,
367};
368
369static void ksym_trace_reset(struct trace_array *tr)
370{
371 ksym_tracing_enabled = 0;
372 __ksym_trace_reset();
373}
374
375static int ksym_trace_init(struct trace_array *tr)
376{
377 int cpu, ret = 0;
378
379 for_each_online_cpu(cpu)
380 tracing_reset(tr, cpu);
381 ksym_tracing_enabled = 1;
382 ksym_trace_array = tr;
383
384 return ret;
385}
386
387static void ksym_trace_print_header(struct seq_file *m)
388{
389 seq_puts(m,
390 "# TASK-PID CPU# Symbol "
391 "Type Function\n");
392 seq_puts(m,
393 "# | | | "
394 " | |\n");
395}
396
397static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
398{
399 struct trace_entry *entry = iter->ent;
400 struct trace_seq *s = &iter->seq;
401 struct ksym_trace_entry *field;
402 char str[KSYM_SYMBOL_LEN];
403 int ret;
404
405 if (entry->type != TRACE_KSYM)
406 return TRACE_TYPE_UNHANDLED;
407
408 trace_assign_type(field, entry);
409
410 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
411 entry->pid, iter->cpu, (char *)field->addr);
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 switch (field->type) {
416 case HW_BREAKPOINT_R:
417 ret = trace_seq_printf(s, " R ");
418 break;
419 case HW_BREAKPOINT_W:
420 ret = trace_seq_printf(s, " W ");
421 break;
422 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
423 ret = trace_seq_printf(s, " RW ");
424 break;
425 default:
426 return TRACE_TYPE_PARTIAL_LINE;
427 }
428
429 if (!ret)
430 return TRACE_TYPE_PARTIAL_LINE;
431
432 sprint_symbol(str, field->ip);
433 ret = trace_seq_printf(s, "%s\n", str);
434 if (!ret)
435 return TRACE_TYPE_PARTIAL_LINE;
436
437 return TRACE_TYPE_HANDLED;
438}
439
440struct tracer ksym_tracer __read_mostly =
441{
442 .name = "ksym_tracer",
443 .init = ksym_trace_init,
444 .reset = ksym_trace_reset,
445#ifdef CONFIG_FTRACE_SELFTEST
446 .selftest = trace_selftest_startup_ksym,
447#endif
448 .print_header = ksym_trace_print_header,
449 .print_line = ksym_trace_output
450};
451
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m)
473{
474 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480
481static int ksym_tracer_stat_show(struct seq_file *m, void *v)
482{
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
489
490 access_type = entry->attr.bp_type;
491
492 switch (access_type) {
493 case HW_BREAKPOINT_R:
494 seq_puts(m, " R ");
495 break;
496 case HW_BREAKPOINT_W:
497 seq_puts(m, " W ");
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 }
505
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511
512 return 0;
513}
514
515static void *ksym_tracer_stat_start(struct tracer_stat *trace)
516{
517 return ksym_filter_head.first;
518}
519
520static void *
521ksym_tracer_stat_next(void *v, int idx)
522{
523 struct hlist_node *stat = v;
524
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534};
535
536__init static int ksym_tracer_stat_init(void)
537{
538 int ret;
539
540 ret = register_stat_tracer(&ksym_tracer_stats);
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546
547 return 0;
548}
549fs_initcall(ksym_tracer_stat_init);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
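
From userspace, the filter syntax accepted by ksym_trace_filter_write() above is "<symbol>:<rw->". A minimal sketch, assuming debugfs is mounted at /sys/kernel/debug (the Kconfig help refers to the same file relative to the debugfs mount point) and using pid_max purely as an example symbol:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Sketch: ask the ksym tracer to watch reads and writes of one symbol. */
	int main(void)
	{
		const char *filter = "/sys/kernel/debug/tracing/ksym_trace_filter";
		const char *req = "pid_max:rw-\n";	/* example symbol, assumed exported in kallsyms */
		int fd = open(filter, O_WRONLY);

		if (fd < 0) {
			perror("open ksym_trace_filter");
			return 1;
		}
		if (write(fd, req, strlen(req)) < 0)
			perror("write filter request");
		close(fd);
		return 0;
	}
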
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 51213b0aa81b..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,32 +51,6 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
51 return syscalls_metadata[nr]; 51 return syscalls_metadata[nr];
52} 52}
53 53
54int syscall_name_to_nr(char *name)
55{
56 int i;
57
58 if (!syscalls_metadata)
59 return -1;
60
61 for (i = 0; i < NR_syscalls; i++) {
62 if (syscalls_metadata[i]) {
63 if (!strcmp(syscalls_metadata[i]->name, name))
64 return i;
65 }
66 }
67 return -1;
68}
69
70void set_syscall_enter_id(int num, int id)
71{
72 syscalls_metadata[num]->enter_id = id;
73}
74
75void set_syscall_exit_id(int num, int id)
76{
77 syscalls_metadata[num]->exit_id = id;
78}
79
80enum print_line_t 54enum print_line_t
81print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
82{ 56{
@@ -93,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
93 if (!entry) 67 if (!entry)
94 goto end; 68 goto end;
95 69
96 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
97 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
98 goto end; 72 goto end;
99 } 73 }
@@ -148,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
148 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
149 } 123 }
150 124
151 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
152 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
153 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
154 } 128 }
@@ -172,18 +146,11 @@ extern char *__bad_type_size(void);
172int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
173{ 147{
174 int i; 148 int i;
175 int nr;
176 int ret; 149 int ret;
177 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
178 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
179 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
180 153
181 nr = syscall_name_to_nr(call->data);
182 entry = syscall_nr_to_meta(nr);
183
184 if (!entry)
185 return 0;
186
187 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
188 "\tsigned:%u;\n", 155 "\tsigned:%u;\n",
189 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
@@ -245,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
245int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
246{ 213{
247 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
248 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
249 int ret; 216 int ret;
250 int nr;
251 int i; 217 int i;
252 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
253 219
254 nr = syscall_name_to_nr(call->data);
255 meta = syscall_nr_to_meta(nr);
256
257 if (!meta)
258 return 0;
259
260 ret = trace_define_common_fields(call); 220 ret = trace_define_common_fields(call);
261 if (ret) 221 if (ret)
262 return ret; 222 return ret;
263 223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret)
226 return ret;
227
264 for (i = 0; i < meta->nb_args; i++) { 228 for (i = 0; i < meta->nb_args; i++) {
265 ret = trace_define_field(call, meta->types[i], 229 ret = trace_define_field(call, meta->types[i],
266 meta->args[i], offset, 230 meta->args[i], offset,
@@ -281,6 +245,10 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
281 if (ret) 245 if (ret)
282 return ret; 246 return ret;
283 247
248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret)
250 return ret;
251
284 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 252 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
285 FILTER_OTHER); 253 FILTER_OTHER);
286 254
@@ -308,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
308 276
309 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 277 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
310 278
311 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 279 event = trace_current_buffer_lock_reserve(&buffer,
312 size, 0, 0); 280 sys_data->enter_event->id, size, 0, 0);
313 if (!event) 281 if (!event)
314 return; 282 return;
315 283
@@ -340,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
340 if (!sys_data) 308 if (!sys_data)
341 return; 309 return;
342 310
343 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 311 event = trace_current_buffer_lock_reserve(&buffer,
344 sizeof(*entry), 0, 0); 312 sys_data->exit_event->id, sizeof(*entry), 0, 0);
345 if (!event) 313 if (!event)
346 return; 314 return;
347 315
@@ -358,10 +326,8 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
358{ 326{
359 int ret = 0; 327 int ret = 0;
360 int num; 328 int num;
361 char *name;
362 329
363 name = (char *)call->data; 330 num = ((struct syscall_metadata *)call->data)->syscall_nr;
364 num = syscall_name_to_nr(name);
365 if (num < 0 || num >= NR_syscalls) 331 if (num < 0 || num >= NR_syscalls)
366 return -ENOSYS; 332 return -ENOSYS;
367 mutex_lock(&syscall_trace_lock); 333 mutex_lock(&syscall_trace_lock);
@@ -381,10 +347,8 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
381void unreg_event_syscall_enter(struct ftrace_event_call *call) 347void unreg_event_syscall_enter(struct ftrace_event_call *call)
382{ 348{
383 int num; 349 int num;
384 char *name;
385 350
386 name = (char *)call->data; 351 num = ((struct syscall_metadata *)call->data)->syscall_nr;
387 num = syscall_name_to_nr(name);
388 if (num < 0 || num >= NR_syscalls) 352 if (num < 0 || num >= NR_syscalls)
389 return; 353 return;
390 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
@@ -399,10 +363,8 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
399{ 363{
400 int ret = 0; 364 int ret = 0;
401 int num; 365 int num;
402 char *name;
403 366
404 name = call->data; 367 num = ((struct syscall_metadata *)call->data)->syscall_nr;
405 num = syscall_name_to_nr(name);
406 if (num < 0 || num >= NR_syscalls) 368 if (num < 0 || num >= NR_syscalls)
407 return -ENOSYS; 369 return -ENOSYS;
408 mutex_lock(&syscall_trace_lock); 370 mutex_lock(&syscall_trace_lock);
@@ -422,10 +384,8 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
422void unreg_event_syscall_exit(struct ftrace_event_call *call) 384void unreg_event_syscall_exit(struct ftrace_event_call *call)
423{ 385{
424 int num; 386 int num;
425 char *name;
426 387
427 name = call->data; 388 num = ((struct syscall_metadata *)call->data)->syscall_nr;
428 num = syscall_name_to_nr(name);
429 if (num < 0 || num >= NR_syscalls) 389 if (num < 0 || num >= NR_syscalls)
430 return; 390 return;
431 mutex_lock(&syscall_trace_lock); 391 mutex_lock(&syscall_trace_lock);
@@ -436,13 +396,17 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
436 mutex_unlock(&syscall_trace_lock); 396 mutex_unlock(&syscall_trace_lock);
437} 397}
438 398
439struct trace_event event_syscall_enter = { 399int init_syscall_trace(struct ftrace_event_call *call)
440 .trace = print_syscall_enter, 400{
441}; 401 int id;
442 402
443struct trace_event event_syscall_exit = { 403 id = register_ftrace_event(call->event);
444 .trace = print_syscall_exit, 404 if (!id)
445}; 405 return -ENODEV;
406 call->id = id;
407 INIT_LIST_HEAD(&call->fields);
408 return 0;
409}
446 410
447int __init init_ftrace_syscalls(void) 411int __init init_ftrace_syscalls(void)
448{ 412{
@@ -460,6 +424,10 @@ int __init init_ftrace_syscalls(void)
460 for (i = 0; i < NR_syscalls; i++) { 424 for (i = 0; i < NR_syscalls; i++) {
461 addr = arch_syscall_addr(i); 425 addr = arch_syscall_addr(i);
462 meta = find_syscall_meta(addr); 426 meta = find_syscall_meta(addr);
427 if (!meta)
428 continue;
429
430 meta->syscall_nr = i;
463 syscalls_metadata[i] = meta; 431 syscalls_metadata[i] = meta;
464 } 432 }
465 433
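
The changes in this file replace the old name-based lookup (syscall_name_to_nr() on call->data treated as a string) with call->data pointing straight at the struct syscall_metadata, which now records its own syscall_nr and its enter/exit events. The access pattern repeated in the register/unregister helpers reduces to this sketch; the helper name is illustrative.

	#include <trace/syscall.h>
	#include <linux/ftrace_event.h>

	/* Sketch: with call->data holding the syscall metadata directly, the
	 * syscall number is one pointer dereference away instead of a string
	 * search over syscalls_metadata[].
	 */
	static inline int sample_syscall_nr(struct ftrace_event_call *call)
	{
		struct syscall_metadata *meta = call->data;

		return meta->syscall_nr;
	}
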
@@ -477,11 +445,12 @@ static int sys_prof_refcount_exit;
477static void prof_syscall_enter(struct pt_regs *regs, long id) 445static void prof_syscall_enter(struct pt_regs *regs, long id)
478{ 446{
479 struct syscall_metadata *sys_data; 447 struct syscall_metadata *sys_data;
480 struct perf_trace_buf *trace_buf;
481 struct syscall_trace_enter *rec; 448 struct syscall_trace_enter *rec;
482 unsigned long flags; 449 unsigned long flags;
450 char *trace_buf;
483 char *raw_data; 451 char *raw_data;
484 int syscall_nr; 452 int syscall_nr;
453 int rctx;
485 int size; 454 int size;
486 int cpu; 455 int cpu;
487 456
@@ -505,54 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
505 /* Protect the per cpu buffer, begin the rcu read side */ 474 /* Protect the per cpu buffer, begin the rcu read side */
506 local_irq_save(flags); 475 local_irq_save(flags);
507 476
477 rctx = perf_swevent_get_recursion_context();
478 if (rctx < 0)
479 goto end_recursion;
480
508 cpu = smp_processor_id(); 481 cpu = smp_processor_id();
509 482
510 if (in_nmi()) 483 trace_buf = rcu_dereference(perf_trace_buf);
511 trace_buf = rcu_dereference(perf_trace_buf_nmi);
512 else
513 trace_buf = rcu_dereference(perf_trace_buf);
514 484
515 if (!trace_buf) 485 if (!trace_buf)
516 goto end; 486 goto end;
517 487
518 trace_buf = per_cpu_ptr(trace_buf, cpu); 488 raw_data = per_cpu_ptr(trace_buf, cpu);
519
520 if (trace_buf->recursion++)
521 goto end_recursion;
522
523 /*
524 * Make recursion update visible before entering perf_tp_event
525 * so that we protect from perf recursions.
526 */
527 barrier();
528
529 raw_data = trace_buf->buf;
530 489
531 /* zero the dead bytes from align to not leak stack to user */ 490 /* zero the dead bytes from align to not leak stack to user */
532 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
533 492
534 rec = (struct syscall_trace_enter *) raw_data; 493 rec = (struct syscall_trace_enter *) raw_data;
535 tracing_generic_entry_update(&rec->ent, 0, 0); 494 tracing_generic_entry_update(&rec->ent, 0, 0);
536 rec->ent.type = sys_data->enter_id; 495 rec->ent.type = sys_data->enter_event->id;
537 rec->nr = syscall_nr; 496 rec->nr = syscall_nr;
538 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 497 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
539 (unsigned long *)&rec->args); 498 (unsigned long *)&rec->args);
540 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
541 500
542end_recursion:
543 trace_buf->recursion--;
544end: 501end:
502 perf_swevent_put_recursion_context(rctx);
503end_recursion:
545 local_irq_restore(flags); 504 local_irq_restore(flags);
546} 505}
547 506
548int reg_prof_syscall_enter(char *name) 507int prof_sysenter_enable(struct ftrace_event_call *call)
549{ 508{
550 int ret = 0; 509 int ret = 0;
551 int num; 510 int num;
552 511
553 num = syscall_name_to_nr(name); 512 num = ((struct syscall_metadata *)call->data)->syscall_nr;
554 if (num < 0 || num >= NR_syscalls)
555 return -ENOSYS;
556 513
557 mutex_lock(&syscall_trace_lock); 514 mutex_lock(&syscall_trace_lock);
558 if (!sys_prof_refcount_enter) 515 if (!sys_prof_refcount_enter)
@@ -568,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
568 return ret; 525 return ret;
569} 526}
570 527
571void unreg_prof_syscall_enter(char *name) 528void prof_sysenter_disable(struct ftrace_event_call *call)
572{ 529{
573 int num; 530 int num;
574 531
575 num = syscall_name_to_nr(name); 532 num = ((struct syscall_metadata *)call->data)->syscall_nr;
576 if (num < 0 || num >= NR_syscalls)
577 return;
578 533
579 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
580 sys_prof_refcount_enter--; 535 sys_prof_refcount_enter--;
@@ -588,10 +543,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
588{ 543{
589 struct syscall_metadata *sys_data; 544 struct syscall_metadata *sys_data;
590 struct syscall_trace_exit *rec; 545 struct syscall_trace_exit *rec;
591 struct perf_trace_buf *trace_buf;
592 unsigned long flags; 546 unsigned long flags;
593 int syscall_nr; 547 int syscall_nr;
548 char *trace_buf;
594 char *raw_data; 549 char *raw_data;
550 int rctx;
595 int size; 551 int size;
596 int cpu; 552 int cpu;
597 553
@@ -617,28 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
617 573
618 /* Protect the per cpu buffer, begin the rcu read side */ 574 /* Protect the per cpu buffer, begin the rcu read side */
619 local_irq_save(flags); 575 local_irq_save(flags);
576
577 rctx = perf_swevent_get_recursion_context();
578 if (rctx < 0)
579 goto end_recursion;
580
620 cpu = smp_processor_id(); 581 cpu = smp_processor_id();
621 582
622 if (in_nmi()) 583 trace_buf = rcu_dereference(perf_trace_buf);
623 trace_buf = rcu_dereference(perf_trace_buf_nmi);
624 else
625 trace_buf = rcu_dereference(perf_trace_buf);
626 584
627 if (!trace_buf) 585 if (!trace_buf)
628 goto end; 586 goto end;
629 587
630 trace_buf = per_cpu_ptr(trace_buf, cpu); 588 raw_data = per_cpu_ptr(trace_buf, cpu);
631
632 if (trace_buf->recursion++)
633 goto end_recursion;
634
635 /*
636 * Make recursion update visible before entering perf_tp_event
637 * so that we protect from perf recursions.
638 */
639 barrier();
640
641 raw_data = trace_buf->buf;
642 589
643 /* zero the dead bytes from align to not leak stack to user */ 590 /* zero the dead bytes from align to not leak stack to user */
644 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -646,26 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
646 rec = (struct syscall_trace_exit *)raw_data; 593 rec = (struct syscall_trace_exit *)raw_data;
647 594
648 tracing_generic_entry_update(&rec->ent, 0, 0); 595 tracing_generic_entry_update(&rec->ent, 0, 0);
649 rec->ent.type = sys_data->exit_id; 596 rec->ent.type = sys_data->exit_event->id;
650 rec->nr = syscall_nr; 597 rec->nr = syscall_nr;
651 rec->ret = syscall_get_return_value(current, regs); 598 rec->ret = syscall_get_return_value(current, regs);
652 599
653 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
654 601
655end_recursion:
656 trace_buf->recursion--;
657end: 602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
658 local_irq_restore(flags); 605 local_irq_restore(flags);
659} 606}
660 607
661int reg_prof_syscall_exit(char *name) 608int prof_sysexit_enable(struct ftrace_event_call *call)
662{ 609{
663 int ret = 0; 610 int ret = 0;
664 int num; 611 int num;
665 612
666 num = syscall_name_to_nr(name); 613 num = ((struct syscall_metadata *)call->data)->syscall_nr;
667 if (num < 0 || num >= NR_syscalls)
668 return -ENOSYS;
669 614
670 mutex_lock(&syscall_trace_lock); 615 mutex_lock(&syscall_trace_lock);
671 if (!sys_prof_refcount_exit) 616 if (!sys_prof_refcount_exit)
@@ -681,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
681 return ret; 626 return ret;
682} 627}
683 628
684void unreg_prof_syscall_exit(char *name) 629void prof_sysexit_disable(struct ftrace_event_call *call)
685{ 630{
686 int num; 631 int num;
687 632
688 num = syscall_name_to_nr(name); 633 num = ((struct syscall_metadata *)call->data)->syscall_nr;
689 if (num < 0 || num >= NR_syscalls)
690 return;
691 634
692 mutex_lock(&syscall_trace_lock); 635 mutex_lock(&syscall_trace_lock);
693 sys_prof_refcount_exit--; 636 sys_prof_refcount_exit--;