-rw-r--r--  Documentation/kernel-parameters.txt        7
-rw-r--r--  Documentation/trace/ftrace.txt          2097
-rw-r--r--  include/linux/ftrace.h                     6
-rw-r--r--  include/linux/ftrace_event.h             111
-rw-r--r--  include/linux/kernel.h                    70
-rw-r--r--  include/linux/ring_buffer.h                6
-rw-r--r--  include/linux/trace_clock.h                1
-rw-r--r--  include/trace/ftrace.h                    49
-rw-r--r--  kernel/trace/Kconfig                      49
-rw-r--r--  kernel/trace/blktrace.c                    4
-rw-r--r--  kernel/trace/ftrace.c                     98
-rw-r--r--  kernel/trace/ring_buffer.c               500
-rw-r--r--  kernel/trace/trace.c                    2204
-rw-r--r--  kernel/trace/trace.h                     144
-rw-r--r--  kernel/trace/trace_branch.c                8
-rw-r--r--  kernel/trace/trace_clock.c                10
-rw-r--r--  kernel/trace/trace_entries.h              23
-rw-r--r--  kernel/trace/trace_events.c             1397
-rw-r--r--  kernel/trace/trace_events_filter.c        34
-rw-r--r--  kernel/trace/trace_export.c                4
-rw-r--r--  kernel/trace/trace_functions.c           207
-rw-r--r--  kernel/trace/trace_functions_graph.c      12
-rw-r--r--  kernel/trace/trace_irqsoff.c              85
-rw-r--r--  kernel/trace/trace_kdb.c                  12
-rw-r--r--  kernel/trace/trace_mmiotrace.c            12
-rw-r--r--  kernel/trace/trace_output.c              119
-rw-r--r--  kernel/trace/trace_output.h                4
-rw-r--r--  kernel/trace/trace_sched_switch.c          8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c         87
-rw-r--r--  kernel/trace/trace_selftest.c             51
-rw-r--r--  kernel/trace/trace_stack.c                76
-rw-r--r--  kernel/trace/trace_stat.c                  2
-rw-r--r--  kernel/trace/trace_syscalls.c             90
-rw-r--r--  kernel/tracepoint.c                       21
34 files changed, 5686 insertions, 1922 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8ccbf27aead4..5abc09a93bc2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -320,6 +320,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
320 on: enable for both 32- and 64-bit processes 320 on: enable for both 32- and 64-bit processes
321 off: disable for both 32- and 64-bit processes 321 off: disable for both 32- and 64-bit processes
322 322
323 alloc_snapshot [FTRACE]
324 Allocate the ftrace snapshot buffer on boot up when the
325 main buffer is allocated. This is handy if debugging
326 and you need to use tracing_snapshot() on boot up, and
327 do not want to use tracing_snapshot_alloc() as it needs
328 to be done where GFP_KERNEL allocations are allowed.
329
323 amd_iommu= [HW,X86-64] 330 amd_iommu= [HW,X86-64]
324 Pass parameters to the AMD IOMMU driver in the system. 331 Pass parameters to the AMD IOMMU driver in the system.
325 Possible values are: 332 Possible values are:
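The snapshot interface named above (tracing_snapshot() and
tracing_snapshot_alloc()) is the in-kernel API this parameter works
around. As a rough sketch only (the initcall wrapper below is
hypothetical and not part of this patch), a boot-time capture could
look like:

    #include <linux/kernel.h>
    #include <linux/init.h>

    static int __init snapshot_demo_init(void)
    {
            /*
             * tracing_snapshot_alloc() allocates the snapshot buffer with
             * GFP_KERNEL, so it cannot run from early or atomic context.
             * Booting with "alloc_snapshot" pre-allocates that buffer, so
             * plain tracing_snapshot(), which never allocates, can be
             * called instead.
             */
            tracing_snapshot();     /* capture the current trace buffer */
            return 0;
    }
    late_initcall(snapshot_demo_init);
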
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index a372304aef10..bfe8c29b1f1d 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -8,6 +8,7 @@ Copyright 2008 Red Hat Inc.
8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton, 8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton,
9 John Kacur, and David Teigland. 9 John Kacur, and David Teigland.
10Written for: 2.6.28-rc2 10Written for: 2.6.28-rc2
11Updated for: 3.10
11 12
12Introduction 13Introduction
13------------ 14------------
@@ -17,13 +18,16 @@ designers of systems to find what is going on inside the kernel.
17It can be used for debugging or analyzing latencies and 18It can be used for debugging or analyzing latencies and
18performance issues that take place outside of user-space. 19performance issues that take place outside of user-space.
19 20
20Although ftrace is the function tracer, it also includes an 21Although ftrace is typically considered the function tracer, it
 21infrastructure that allows for other types of tracing. Some of 22is really a framework of several assorted tracing utilities.
 22the tracers that are currently in ftrace include a tracer to 23There's latency tracing to examine what occurs between interrupts
 23trace context switches, the time it takes for a high priority 24being disabled and enabled, as well as for preemption, and from the time
 24task to run after it was woken up, the time interrupts are 25a task is woken to the time it is actually scheduled in.
 25disabled, and more (ftrace allows for tracer plugins, which 26
 26means that the list of tracers can always grow). 27One of the most common uses of ftrace is event tracing.
 28Throughout the kernel are hundreds of static event points that
29can be enabled via the debugfs file system to see what is
30going on in certain parts of the kernel.
27 31
28 32
29Implementation Details 33Implementation Details
@@ -61,7 +65,7 @@ the extended "/sys/kernel/debug/tracing" path name.
61 65
62That's it! (assuming that you have ftrace configured into your kernel) 66That's it! (assuming that you have ftrace configured into your kernel)
63 67
64After mounting the debugfs, you can see a directory called 68After mounting debugfs, you can see a directory called
65"tracing". This directory contains the control and output files 69"tracing". This directory contains the control and output files
66of ftrace. Here is a list of some of the key files: 70of ftrace. Here is a list of some of the key files:
67 71
@@ -84,7 +88,9 @@ of ftrace. Here is a list of some of the key files:
84 88
85 This sets or displays whether writing to the trace 89 This sets or displays whether writing to the trace
86 ring buffer is enabled. Echo 0 into this file to disable 90 ring buffer is enabled. Echo 0 into this file to disable
87 the tracer or 1 to enable it. 91 the tracer or 1 to enable it. Note, this only disables
 92 writing to the ring buffer; the tracing overhead may
93 still be occurring.
88 94
89 trace: 95 trace:
90 96
@@ -109,7 +115,15 @@ of ftrace. Here is a list of some of the key files:
109 115
110 This file lets the user control the amount of data 116 This file lets the user control the amount of data
111 that is displayed in one of the above output 117 that is displayed in one of the above output
112 files. 118 files. Options also exist to modify how a tracer
119 or events work (stack traces, timestamps, etc).
120
121 options:
122
123 This is a directory that has a file for every available
124 trace option (also in trace_options). Options may also be set
125 or cleared by writing a "1" or "0" respectively into the
126 corresponding file with the option name.
113 127
114 tracing_max_latency: 128 tracing_max_latency:
115 129
@@ -121,10 +135,17 @@ of ftrace. Here is a list of some of the key files:
121 latency is greater than the value in this 135 latency is greater than the value in this
122 file. (in microseconds) 136 file. (in microseconds)
123 137
138 tracing_thresh:
139
140 Some latency tracers will record a trace whenever the
141 latency is greater than the number in this file.
142 Only active when the file contains a number greater than 0.
143 (in microseconds)
144
124 buffer_size_kb: 145 buffer_size_kb:
125 146
126 This sets or displays the number of kilobytes each CPU 147 This sets or displays the number of kilobytes each CPU
127 buffer can hold. The tracer buffers are the same size 148 buffer holds. By default, the trace buffers are the same size
128 for each CPU. The displayed number is the size of the 149 for each CPU. The displayed number is the size of the
129 CPU buffer and not total size of all buffers. The 150 CPU buffer and not total size of all buffers. The
130 trace buffers are allocated in pages (blocks of memory 151 trace buffers are allocated in pages (blocks of memory
@@ -133,16 +154,30 @@ of ftrace. Here is a list of some of the key files:
133 than requested, the rest of the page will be used, 154 than requested, the rest of the page will be used,
134 making the actual allocation bigger than requested. 155 making the actual allocation bigger than requested.
135 ( Note, the size may not be a multiple of the page size 156 ( Note, the size may not be a multiple of the page size
136 due to buffer management overhead. ) 157 due to buffer management meta-data. )
137 158
138 This can only be updated when the current_tracer 159 buffer_total_size_kb:
139 is set to "nop". 160
161 This displays the total combined size of all the trace buffers.
162
163 free_buffer:
164
 165 If a process is performing the tracing, and the ring buffer
 166 should be shrunk ("freed") when the process is finished, even
 167 if it were to be killed by a signal, this file can be used
 168 for that purpose. On close of this file, the ring buffer will
 169 be resized to its minimum size. Have the process that is tracing
 170 also open this file; when the process exits, its file descriptor
 171 for this file will be closed, and in doing so, the ring buffer
 172 will be "freed".
173
174 It may also stop tracing if disable_on_free option is set.
140 175
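A minimal user-space sketch of the pattern described above (paths
assume the usual debugfs mount point; error handling trimmed):

    #include <fcntl.h>
    #include <unistd.h>

    #define TRACE_DIR "/sys/kernel/debug/tracing/"

    int main(void)
    {
            /*
             * Hold free_buffer open for the life of the tracing process.
             * When the process exits (or is killed), the kernel sees the
             * descriptor close and shrinks the ring buffer back to its
             * minimum size.
             */
            int free_fd = open(TRACE_DIR "free_buffer", O_WRONLY);

            if (free_fd < 0)
                    return 1;

            /* ... enable tracing and run the workload here ... */

            return 0;   /* exiting closes free_fd, "freeing" the buffer */
    }
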
141 tracing_cpumask: 176 tracing_cpumask:
142 177
143 This is a mask that lets the user only trace 178 This is a mask that lets the user only trace
144 on specified CPUS. The format is a hex string 179 on specified CPUs. The format is a hex string
145 representing the CPUS. 180 representing the CPUs.
146 181
147 set_ftrace_filter: 182 set_ftrace_filter:
148 183
@@ -183,6 +218,261 @@ of ftrace. Here is a list of some of the key files:
183 "set_ftrace_notrace". (See the section "dynamic ftrace" 218 "set_ftrace_notrace". (See the section "dynamic ftrace"
184 below for more details.) 219 below for more details.)
185 220
221 enabled_functions:
222
223 This file is more for debugging ftrace, but can also be useful
224 in seeing if any function has a callback attached to it.
 225 Not only does the trace infrastructure use the ftrace function
226 trace utility, but other subsystems might too. This file
227 displays all functions that have a callback attached to them
228 as well as the number of callbacks that have been attached.
229 Note, a callback may also call multiple functions which will
230 not be listed in this count.
231
 232 If the callback was registered to be traced by a function with
 233 the "save regs" attribute (thus adding even more overhead), an 'R'
 234 will be displayed on the same line as the function that
 235 is returning registers.
236
237 function_profile_enabled:
238
 239 When set, it will enable profiling of all functions with either the function
240 tracer, or if enabled, the function graph tracer. It will
241 keep a histogram of the number of functions that were called
242 and if run with the function graph tracer, it will also keep
243 track of the time spent in those functions. The histogram
244 content can be displayed in the files:
245
246 trace_stats/function<cpu> ( function0, function1, etc).
247
248 trace_stats:
249
250 A directory that holds different tracing stats.
251
252 kprobe_events:
253
254 Enable dynamic trace points. See kprobetrace.txt.
255
256 kprobe_profile:
257
258 Dynamic trace points stats. See kprobetrace.txt.
259
260 max_graph_depth:
261
262 Used with the function graph tracer. This is the max depth
263 it will trace into a function. Setting this to a value of
264 one will show only the first kernel function that is called
265 from user space.
266
267 printk_formats:
268
269 This is for tools that read the raw format files. If an event in
270 the ring buffer references a string (currently only trace_printk()
271 does this), only a pointer to the string is recorded into the buffer
272 and not the string itself. This prevents tools from knowing what
273 that string was. This file displays the string and address for
274 the string allowing tools to map the pointers to what the
275 strings were.
276
277 saved_cmdlines:
278
279 Only the pid of the task is recorded in a trace event unless
280 the event specifically saves the task comm as well. Ftrace
281 makes a cache of pid mappings to comms to try to display
282 comms for events. If a pid for a comm is not listed, then
283 "<...>" is displayed in the output.
284
285 snapshot:
286
287 This displays the "snapshot" buffer and also lets the user
288 take a snapshot of the current running trace.
289 See the "Snapshot" section below for more details.
290
291 stack_max_size:
292
293 When the stack tracer is activated, this will display the
294 maximum stack size it has encountered.
295 See the "Stack Trace" section below.
296
297 stack_trace:
298
299 This displays the stack back trace of the largest stack
300 that was encountered when the stack tracer is activated.
301 See the "Stack Trace" section below.
302
303 stack_trace_filter:
304
305 This is similar to "set_ftrace_filter" but it limits what
306 functions the stack tracer will check.
307
308 trace_clock:
309
310 Whenever an event is recorded into the ring buffer, a
311 "timestamp" is added. This stamp comes from a specified
312 clock. By default, ftrace uses the "local" clock. This
313 clock is very fast and strictly per cpu, but on some
314 systems it may not be monotonic with respect to other
315 CPUs. In other words, the local clocks may not be in sync
316 with local clocks on other CPUs.
317
318 Usual clocks for tracing:
319
320 # cat trace_clock
321 [local] global counter x86-tsc
322
323 local: Default clock, but may not be in sync across CPUs
324
325 global: This clock is in sync with all CPUs but may
326 be a bit slower than the local clock.
327
328 counter: This is not a clock at all, but literally an atomic
329 counter. It counts up one by one, but is in sync
330 with all CPUs. This is useful when you need to
 331 know exactly the order in which events occurred with respect to
332 each other on different CPUs.
333
334 uptime: This uses the jiffies counter and the time stamp
335 is relative to the time since boot up.
336
337 perf: This makes ftrace use the same clock that perf uses.
338 Eventually perf will be able to read ftrace buffers
339 and this will help out in interleaving the data.
340
341 x86-tsc: Architectures may define their own clocks. For
342 example, x86 uses its own TSC cycle clock here.
343
344 To set a clock, simply echo the clock name into this file.
345
346 echo global > trace_clock
347
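The clock currently in use is the entry shown in brackets. A small
sketch (path assumed to be the standard debugfs mount) that reads
trace_clock and prints the selected clock:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char buf[256], *start, *end;
            FILE *f = fopen("/sys/kernel/debug/tracing/trace_clock", "r");

            if (!f)
                    return 1;
            if (!fgets(buf, sizeof(buf), f)) {
                    fclose(f);
                    return 1;
            }
            fclose(f);

            /* The active clock is the name wrapped in brackets. */
            start = strchr(buf, '[');
            end = start ? strchr(start, ']') : NULL;
            if (start && end) {
                    *end = '\0';
                    printf("current clock: %s\n", start + 1);
            }
            return 0;
    }
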
348 trace_marker:
349
350 This is a very useful file for synchronizing user space
 351 with events happening in the kernel. Strings written into
 352 this file are recorded into the ftrace buffer.
353
354 It is useful in applications to open this file at the start
355 of the application and just reference the file descriptor
356 for the file.
357
358 void trace_write(const char *fmt, ...)
359 {
360 va_list ap;
361 char buf[256];
362 int n;
363
364 if (trace_fd < 0)
365 return;
366
367 va_start(ap, fmt);
368 n = vsnprintf(buf, 256, fmt, ap);
369 va_end(ap);
370
371 write(trace_fd, buf, n);
372 }
373
374 start:
375
 376 trace_fd = open("trace_marker", O_WRONLY);
377
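The snippet above is a fragment; a self-contained version with the
headers it needs might look like the following sketch (the marker text
and paths are only examples):

    #include <stdarg.h>
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int trace_fd = -1;

    /* printf-style write into the ftrace buffer via trace_marker */
    static void trace_write(const char *fmt, ...)
    {
            va_list ap;
            char buf[256];
            int n;

            if (trace_fd < 0)
                    return;

            va_start(ap, fmt);
            n = vsnprintf(buf, sizeof(buf), fmt, ap);
            va_end(ap);

            if (n >= (int)sizeof(buf))
                    n = sizeof(buf) - 1;  /* vsnprintf reports the untruncated length */

            write(trace_fd, buf, n);
    }

    int main(void)
    {
            trace_fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);
            trace_write("hello from pid %d\n", getpid());
            return 0;
    }
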
378 uprobe_events:
379
380 Add dynamic tracepoints in programs.
381 See uprobetracer.txt
382
383 uprobe_profile:
384
385 Uprobe statistics. See uprobetrace.txt
386
387 instances:
388
389 This is a way to make multiple trace buffers where different
390 events can be recorded in different buffers.
391 See "Instances" section below.
392
393 events:
394
395 This is the trace event directory. It holds event tracepoints
396 (also known as static tracepoints) that have been compiled
397 into the kernel. It shows what event tracepoints exist
398 and how they are grouped by system. There are "enable"
399 files at various levels that can enable the tracepoints
400 when a "1" is written to them.
401
402 See events.txt for more information.
403
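A tracepoint can be enabled programmatically the same way as with echo;
a tiny sketch (sched_switch is used only as an example event, and the
path assumes the standard debugfs mount):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            /*
             * Writing "1" to an event's enable file turns that tracepoint
             * on; the enable files higher up enable whole groups.
             */
            int fd = open("/sys/kernel/debug/tracing/events/sched/sched_switch/enable",
                          O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, "1", 1);
            close(fd);
            return 0;
    }
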
404 per_cpu:
405
406 This is a directory that contains the trace per_cpu information.
407
408 per_cpu/cpu0/buffer_size_kb:
409
410 The ftrace buffer is defined per_cpu. That is, there's a separate
411 buffer for each CPU to allow writes to be done atomically,
 412 and free from cache bouncing. These buffers may be sized
 413 differently. This file is similar to the buffer_size_kb
414 file, but it only displays or sets the buffer size for the
415 specific CPU. (here cpu0).
416
417 per_cpu/cpu0/trace:
418
419 This is similar to the "trace" file, but it will only display
 420 the data specific to the CPU. If written to, it only clears
421 the specific CPU buffer.
422
423 per_cpu/cpu0/trace_pipe
424
425 This is similar to the "trace_pipe" file, and is a consuming
426 read, but it will only display (and consume) the data specific
 427 to the CPU.
428
429 per_cpu/cpu0/trace_pipe_raw
430
431 For tools that can parse the ftrace ring buffer binary format,
432 the trace_pipe_raw file can be used to extract the data
433 from the ring buffer directly. With the use of the splice()
434 system call, the buffer data can be quickly transferred to
435 a file or to the network where a server is collecting the
436 data.
437
438 Like trace_pipe, this is a consuming reader, where multiple
439 reads will always produce different data.
440
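A sketch of the splice() approach described above, pulling raw
sub-buffers for one CPU into a local file (the 4096-byte chunk size and
output name are arbitrary; splice(2) needs a pipe between the two files):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int raw, out, p[2];
            ssize_t n;

            raw = open("/sys/kernel/debug/tracing/per_cpu/cpu0/trace_pipe_raw",
                       O_RDONLY);
            out = open("cpu0.raw", O_WRONLY | O_CREAT | O_TRUNC, 0644);
            if (raw < 0 || out < 0 || pipe(p) < 0)
                    return 1;

            /*
             * Ring buffer pages go into the pipe, then from the pipe into
             * the output file, without a copy through user space.  Like
             * trace_pipe, reads block until data is available.
             */
            while ((n = splice(raw, NULL, p[1], NULL, 4096, SPLICE_F_MOVE)) > 0)
                    splice(p[0], NULL, out, NULL, n, SPLICE_F_MOVE);

            return 0;
    }
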
441 per_cpu/cpu0/snapshot:
442
443 This is similar to the main "snapshot" file, but will only
444 snapshot the current CPU (if supported). It only displays
445 the content of the snapshot for a given CPU, and if
446 written to, only clears this CPU buffer.
447
448 per_cpu/cpu0/snapshot_raw:
449
450 Similar to the trace_pipe_raw, but will read the binary format
451 from the snapshot buffer for the given CPU.
452
453 per_cpu/cpu0/stats:
454
455 This displays certain stats about the ring buffer:
456
457 entries: The number of events that are still in the buffer.
458
459 overrun: The number of lost events due to overwriting when
460 the buffer was full.
461
462 commit overrun: Should always be zero.
463 This gets set if so many events happened within a nested
 464 event (the ring buffer is re-entrant) that it fills the
465 buffer and starts dropping events.
466
467 bytes: Bytes actually read (not overwritten).
468
469 oldest event ts: The oldest timestamp in the buffer
470
471 now ts: The current timestamp
472
473 dropped events: Events lost due to overwrite option being off.
474
475 read events: The number of events read.
186 476
187The Tracers 477The Tracers
188----------- 478-----------
@@ -234,11 +524,6 @@ Here is the list of current tracers that may be configured.
234 RT tasks (as the current "wakeup" does). This is useful 524 RT tasks (as the current "wakeup" does). This is useful
235 for those interested in wake up timings of RT tasks. 525 for those interested in wake up timings of RT tasks.
236 526
237 "hw-branch-tracer"
238
239 Uses the BTS CPU feature on x86 CPUs to traces all
240 branches executed.
241
242 "nop" 527 "nop"
243 528
244 This is the "trace nothing" tracer. To remove all 529 This is the "trace nothing" tracer. To remove all
@@ -261,70 +546,100 @@ Here is an example of the output format of the file "trace"
261 -------- 546 --------
262# tracer: function 547# tracer: function
263# 548#
264# TASK-PID CPU# TIMESTAMP FUNCTION 549# entries-in-buffer/entries-written: 140080/250280 #P:4
265# | | | | | 550#
266 bash-4251 [01] 10152.583854: path_put <-path_walk 551# _-----=> irqs-off
267 bash-4251 [01] 10152.583855: dput <-path_put 552# / _----=> need-resched
268 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput 553# | / _---=> hardirq/softirq
554# || / _--=> preempt-depth
555# ||| / delay
556# TASK-PID CPU# |||| TIMESTAMP FUNCTION
557# | | | |||| | |
558 bash-1977 [000] .... 17284.993652: sys_close <-system_call_fastpath
559 bash-1977 [000] .... 17284.993653: __close_fd <-sys_close
560 bash-1977 [000] .... 17284.993653: _raw_spin_lock <-__close_fd
561 sshd-1974 [003] .... 17284.993653: __srcu_read_unlock <-fsnotify
562 bash-1977 [000] .... 17284.993654: add_preempt_count <-_raw_spin_lock
563 bash-1977 [000] ...1 17284.993655: _raw_spin_unlock <-__close_fd
564 bash-1977 [000] ...1 17284.993656: sub_preempt_count <-_raw_spin_unlock
565 bash-1977 [000] .... 17284.993657: filp_close <-__close_fd
566 bash-1977 [000] .... 17284.993657: dnotify_flush <-filp_close
567 sshd-1974 [003] .... 17284.993658: sys_select <-system_call_fastpath
269 -------- 568 --------
270 569
271A header is printed with the tracer name that is represented by 570A header is printed with the tracer name that is represented by
272the trace. In this case the tracer is "function". Then a header 571the trace. In this case the tracer is "function". Then it shows the
273showing the format. Task name "bash", the task PID "4251", the 572number of events in the buffer as well as the total number of entries
274CPU that it was running on "01", the timestamp in <secs>.<usecs> 573that were written. The difference is the number of entries that were
275format, the function name that was traced "path_put" and the 574lost due to the buffer filling up (250280 - 140080 = 110200 events
276parent function that called this function "path_walk". The 575lost).
277timestamp is the time at which the function was entered. 576
577The header explains the content of the events. Task name "bash", the task
578PID "1977", the CPU that it was running on "000", the latency format
579(explained below), the timestamp in <secs>.<usecs> format, the
580function name that was traced "sys_close" and the parent function that
581called this function "system_call_fastpath". The timestamp is the time
582at which the function was entered.
278 583
279Latency trace format 584Latency trace format
280-------------------- 585--------------------
281 586
282When the latency-format option is enabled, the trace file gives 587When the latency-format option is enabled or when one of the latency
283somewhat more information to see why a latency happened. 588tracers is set, the trace file gives somewhat more information to see
284Here is a typical trace. 589why a latency happened. Here is a typical trace.
285 590
286# tracer: irqsoff 591# tracer: irqsoff
287# 592#
288irqsoff latency trace v1.1.5 on 2.6.26-rc8 593# irqsoff latency trace v1.1.5 on 3.8.0-test+
289-------------------------------------------------------------------- 594# --------------------------------------------------------------------
290 latency: 97 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 595# latency: 259 us, #4/4, CPU#2 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
291 ----------------- 596# -----------------
292 | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) 597# | task: ps-6143 (uid:0 nice:0 policy:0 rt_prio:0)
293 ----------------- 598# -----------------
294 => started at: apic_timer_interrupt 599# => started at: __lock_task_sighand
295 => ended at: do_softirq 600# => ended at: _raw_spin_unlock_irqrestore
296 601#
297# _------=> CPU# 602#
298# / _-----=> irqs-off 603# _------=> CPU#
299# | / _----=> need-resched 604# / _-----=> irqs-off
300# || / _---=> hardirq/softirq 605# | / _----=> need-resched
301# ||| / _--=> preempt-depth 606# || / _---=> hardirq/softirq
302# |||| / 607# ||| / _--=> preempt-depth
303# ||||| delay 608# |||| / delay
304# cmd pid ||||| time | caller 609# cmd pid ||||| time | caller
305# \ / ||||| \ | / 610# \ / ||||| \ | /
306 <idle>-0 0d..1 0us+: trace_hardirqs_off_thunk (apic_timer_interrupt) 611 ps-6143 2d... 0us!: trace_hardirqs_off <-__lock_task_sighand
307 <idle>-0 0d.s. 97us : __do_softirq (do_softirq) 612 ps-6143 2d..1 259us+: trace_hardirqs_on <-_raw_spin_unlock_irqrestore
308 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq) 613 ps-6143 2d..1 263us+: time_hardirqs_on <-_raw_spin_unlock_irqrestore
614 ps-6143 2d..1 306us : <stack trace>
615 => trace_hardirqs_on_caller
616 => trace_hardirqs_on
617 => _raw_spin_unlock_irqrestore
618 => do_task_stat
619 => proc_tgid_stat
620 => proc_single_show
621 => seq_read
622 => vfs_read
623 => sys_read
624 => system_call_fastpath
309 625
310 626
311This shows that the current tracer is "irqsoff" tracing the time 627This shows that the current tracer is "irqsoff" tracing the time
312for which interrupts were disabled. It gives the trace version 628for which interrupts were disabled. It gives the trace version (which
 313and the version of the kernel upon which this was executed on 629never changes) and the version of the kernel on which this was executed
314(2.6.26-rc8). Then it displays the max latency in microsecs (97 630(3.10). Then it displays the max latency in microseconds (259 us). The number
315us). The number of trace entries displayed and the total number 631of trace entries displayed and the total number (both are four: #4/4).
316recorded (both are three: #3/3). The type of preemption that was 632VP, KP, SP, and HP are always zero and are reserved for later use.
317used (PREEMPT). VP, KP, SP, and HP are always zero and are 633#P is the number of online CPUs (#P:4).
318reserved for later use. #P is the number of online CPUS (#P:2).
319 634
320The task is the process that was running when the latency 635The task is the process that was running when the latency
321occurred. (swapper pid: 0). 636occurred. (ps pid: 6143).
322 637
323The start and stop (the functions in which the interrupts were 638The start and stop (the functions in which the interrupts were
324disabled and enabled respectively) that caused the latencies: 639disabled and enabled respectively) that caused the latencies:
325 640
326 apic_timer_interrupt is where the interrupts were disabled. 641 __lock_task_sighand is where the interrupts were disabled.
327 do_softirq is where they were enabled again. 642 _raw_spin_unlock_irqrestore is where they were enabled again.
328 643
329The next lines after the header are the trace itself. The header 644The next lines after the header are the trace itself. The header
330explains which is which. 645explains which is which.
@@ -367,16 +682,43 @@ The above is mostly meaningful for kernel developers.
367 682
368 The rest is the same as the 'trace' file. 683 The rest is the same as the 'trace' file.
369 684
685 Note, the latency tracers will usually end with a back trace
686 to easily find where the latency occurred.
370 687
371trace_options 688trace_options
372------------- 689-------------
373 690
374The trace_options file is used to control what gets printed in 691The trace_options file (or the options directory) is used to control
375the trace output. To see what is available, simply cat the file: 692what gets printed in the trace output, or manipulate the tracers.
693To see what is available, simply cat the file:
376 694
377 cat trace_options 695 cat trace_options
378 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 696print-parent
379 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj 697nosym-offset
698nosym-addr
699noverbose
700noraw
701nohex
702nobin
703noblock
704nostacktrace
705trace_printk
706noftrace_preempt
707nobranch
708annotate
709nouserstacktrace
710nosym-userobj
711noprintk-msg-only
712context-info
713latency-format
714sleep-time
715graph-time
716record-cmd
717overwrite
718nodisable_on_free
719irq-info
720markers
721function-trace
380 722
381To disable one of the options, echo in the option prepended with 723To disable one of the options, echo in the option prepended with
382"no". 724"no".
@@ -428,13 +770,34 @@ Here are the available options:
428 770
429 bin - This will print out the formats in raw binary. 771 bin - This will print out the formats in raw binary.
430 772
431 block - TBD (needs update) 773 block - When set, reading trace_pipe will not block when polled.
432 774
433 stacktrace - This is one of the options that changes the trace 775 stacktrace - This is one of the options that changes the trace
434 itself. When a trace is recorded, so is the stack 776 itself. When a trace is recorded, so is the stack
435 of functions. This allows for back traces of 777 of functions. This allows for back traces of
436 trace sites. 778 trace sites.
437 779
 780 trace_printk - When cleared, stops trace_printk() from writing into the buffer.
781
782 branch - Enable branch tracing with the tracer.
783
784 annotate - It is sometimes confusing when the CPU buffers are full
785 and one CPU buffer had a lot of events recently, thus
 786 a shorter time frame, where another CPU may have only had
787 a few events, which lets it have older events. When
788 the trace is reported, it shows the oldest events first,
789 and it may look like only one CPU ran (the one with the
790 oldest events). When the annotate option is set, it will
791 display when a new CPU buffer started:
792
793 <idle>-0 [001] dNs4 21169.031481: wake_up_idle_cpu <-add_timer_on
794 <idle>-0 [001] dNs4 21169.031482: _raw_spin_unlock_irqrestore <-add_timer_on
795 <idle>-0 [001] .Ns4 21169.031484: sub_preempt_count <-_raw_spin_unlock_irqrestore
796##### CPU 2 buffer started ####
797 <idle>-0 [002] .N.1 21169.031484: rcu_idle_exit <-cpu_idle
798 <idle>-0 [001] .Ns3 21169.031484: _raw_spin_unlock <-clocksource_watchdog
799 <idle>-0 [001] .Ns3 21169.031485: sub_preempt_count <-_raw_spin_unlock
800
438 userstacktrace - This option changes the trace. It records a 801 userstacktrace - This option changes the trace. It records a
439 stacktrace of the current userspace thread. 802 stacktrace of the current userspace thread.
440 803
@@ -451,9 +814,13 @@ Here are the available options:
451 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 814 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
452x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] 815x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
453 816
454 sched-tree - trace all tasks that are on the runqueue, at 817
455 every scheduling event. Will add overhead if 818 printk-msg-only - When set, trace_printk()s will only show the format
456 there's a lot of tasks running at once. 819 and not their parameters (if trace_bprintk() or
820 trace_bputs() was used to save the trace_printk()).
821
 822 context-info - When cleared, show only the event data, hiding the comm, PID,
 823 timestamp, CPU, and other context information.
457 824
458 latency-format - This option changes the trace. When 825 latency-format - This option changes the trace. When
459 it is enabled, the trace displays 826 it is enabled, the trace displays
@@ -461,31 +828,61 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
461 latencies, as described in "Latency 828 latencies, as described in "Latency
462 trace format". 829 trace format".
463 830
 831 sleep-time - When running the function graph tracer, include
 832 the time a task is scheduled out in its function.
 833 When enabled, it will account the time the task has been
 834 scheduled out as part of the function call.
835
 836 graph-time - When running the function graph tracer, include the
 837 time spent calling nested functions. When this is not set,
838 the time reported for the function will only include
839 the time the function itself executed for, not the time
840 for functions that it called.
841
842 record-cmd - When any event or tracer is enabled, a hook is enabled
 843 in the sched_switch trace point to fill the comm cache
844 with mapped pids and comms. But this may cause some
845 overhead, and if you only care about pids, and not the
846 name of the task, disabling this option can lower the
847 impact of tracing.
848
464 overwrite - This controls what happens when the trace buffer is 849 overwrite - This controls what happens when the trace buffer is
465 full. If "1" (default), the oldest events are 850 full. If "1" (default), the oldest events are
466 discarded and overwritten. If "0", then the newest 851 discarded and overwritten. If "0", then the newest
467 events are discarded. 852 events are discarded.
853 (see per_cpu/cpu0/stats for overrun and dropped)
468 854
469ftrace_enabled 855 disable_on_free - When the free_buffer is closed, tracing will
470-------------- 856 stop (tracing_on set to 0).
471 857
472The following tracers (listed below) give different output 858 irq-info - Shows the interrupt, preempt count, need resched data.
473depending on whether or not the sysctl ftrace_enabled is set. To 859 When disabled, the trace looks like:
474set ftrace_enabled, one can either use the sysctl function or
475set it via the proc file system interface.
476 860
477 sysctl kernel.ftrace_enabled=1 861# tracer: function
862#
863# entries-in-buffer/entries-written: 144405/9452052 #P:4
864#
865# TASK-PID CPU# TIMESTAMP FUNCTION
866# | | | | |
867 <idle>-0 [002] 23636.756054: ttwu_do_activate.constprop.89 <-try_to_wake_up
868 <idle>-0 [002] 23636.756054: activate_task <-ttwu_do_activate.constprop.89
869 <idle>-0 [002] 23636.756055: enqueue_task <-activate_task
478 870
479 or
480 871
481 echo 1 > /proc/sys/kernel/ftrace_enabled 872 markers - When set, the trace_marker is writable (only by root).
873 When disabled, the trace_marker will error with EINVAL
874 on write.
875
876
877 function-trace - The latency tracers will enable function tracing
 878 if this option is enabled (which it is by default). When
879 it is disabled, the latency tracers do not trace
880 functions. This keeps the overhead of the tracer down
881 when performing latency tests.
482 882
483To disable ftrace_enabled simply replace the '1' with '0' in the 883 Note: Some tracers have their own options. They only appear
484above commands. 884 when the tracer is active.
485 885
486When ftrace_enabled is set the tracers will also record the
487functions that are within the trace. The descriptions of the
488tracers will also show an example with ftrace enabled.
489 886
490 887
491irqsoff 888irqsoff
@@ -506,95 +903,133 @@ new trace is saved.
506To reset the maximum, echo 0 into tracing_max_latency. Here is 903To reset the maximum, echo 0 into tracing_max_latency. Here is
507an example: 904an example:
508 905
906 # echo 0 > options/function-trace
509 # echo irqsoff > current_tracer 907 # echo irqsoff > current_tracer
510 # echo latency-format > trace_options
511 # echo 0 > tracing_max_latency
512 # echo 1 > tracing_on 908 # echo 1 > tracing_on
909 # echo 0 > tracing_max_latency
513 # ls -ltr 910 # ls -ltr
514 [...] 911 [...]
515 # echo 0 > tracing_on 912 # echo 0 > tracing_on
516 # cat trace 913 # cat trace
517# tracer: irqsoff 914# tracer: irqsoff
518# 915#
519irqsoff latency trace v1.1.5 on 2.6.26 916# irqsoff latency trace v1.1.5 on 3.8.0-test+
520-------------------------------------------------------------------- 917# --------------------------------------------------------------------
521 latency: 12 us, #3/3, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 918# latency: 16 us, #4/4, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
522 ----------------- 919# -----------------
523 | task: bash-3730 (uid:0 nice:0 policy:0 rt_prio:0) 920# | task: swapper/0-0 (uid:0 nice:0 policy:0 rt_prio:0)
524 ----------------- 921# -----------------
525 => started at: sys_setpgid 922# => started at: run_timer_softirq
526 => ended at: sys_setpgid 923# => ended at: run_timer_softirq
527 924#
528# _------=> CPU# 925#
529# / _-----=> irqs-off 926# _------=> CPU#
530# | / _----=> need-resched 927# / _-----=> irqs-off
531# || / _---=> hardirq/softirq 928# | / _----=> need-resched
532# ||| / _--=> preempt-depth 929# || / _---=> hardirq/softirq
533# |||| / 930# ||| / _--=> preempt-depth
534# ||||| delay 931# |||| / delay
535# cmd pid ||||| time | caller 932# cmd pid ||||| time | caller
536# \ / ||||| \ | / 933# \ / ||||| \ | /
537 bash-3730 1d... 0us : _write_lock_irq (sys_setpgid) 934 <idle>-0 0d.s2 0us+: _raw_spin_lock_irq <-run_timer_softirq
538 bash-3730 1d..1 1us+: _write_unlock_irq (sys_setpgid) 935 <idle>-0 0dNs3 17us : _raw_spin_unlock_irq <-run_timer_softirq
539 bash-3730 1d..2 14us : trace_hardirqs_on (sys_setpgid) 936 <idle>-0 0dNs3 17us+: trace_hardirqs_on <-run_timer_softirq
540 937 <idle>-0 0dNs3 25us : <stack trace>
541 938 => _raw_spin_unlock_irq
542Here we see that that we had a latency of 12 microsecs (which is 939 => run_timer_softirq
543very good). The _write_lock_irq in sys_setpgid disabled 940 => __do_softirq
544interrupts. The difference between the 12 and the displayed 941 => call_softirq
545timestamp 14us occurred because the clock was incremented 942 => do_softirq
943 => irq_exit
944 => smp_apic_timer_interrupt
945 => apic_timer_interrupt
946 => rcu_idle_exit
947 => cpu_idle
948 => rest_init
949 => start_kernel
950 => x86_64_start_reservations
951 => x86_64_start_kernel
952
 953Here we see that we had a latency of 16 microseconds (which is
954very good). The _raw_spin_lock_irq in run_timer_softirq disabled
955interrupts. The difference between the 16 and the displayed
956timestamp 25us occurred because the clock was incremented
546between the time of recording the max latency and the time of 957between the time of recording the max latency and the time of
547recording the function that had that latency. 958recording the function that had that latency.
548 959
549Note the above example had ftrace_enabled not set. If we set the 960Note the above example had function-trace not set. If we set
550ftrace_enabled, we get a much larger output: 961function-trace, we get a much larger output:
962
963 with echo 1 > options/function-trace
551 964
552# tracer: irqsoff 965# tracer: irqsoff
553# 966#
554irqsoff latency trace v1.1.5 on 2.6.26-rc8 967# irqsoff latency trace v1.1.5 on 3.8.0-test+
555-------------------------------------------------------------------- 968# --------------------------------------------------------------------
556 latency: 50 us, #101/101, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 969# latency: 71 us, #168/168, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
557 ----------------- 970# -----------------
558 | task: ls-4339 (uid:0 nice:0 policy:0 rt_prio:0) 971# | task: bash-2042 (uid:0 nice:0 policy:0 rt_prio:0)
559 ----------------- 972# -----------------
560 => started at: __alloc_pages_internal 973# => started at: ata_scsi_queuecmd
561 => ended at: __alloc_pages_internal 974# => ended at: ata_scsi_queuecmd
562 975#
563# _------=> CPU# 976#
564# / _-----=> irqs-off 977# _------=> CPU#
565# | / _----=> need-resched 978# / _-----=> irqs-off
566# || / _---=> hardirq/softirq 979# | / _----=> need-resched
567# ||| / _--=> preempt-depth 980# || / _---=> hardirq/softirq
568# |||| / 981# ||| / _--=> preempt-depth
569# ||||| delay 982# |||| / delay
570# cmd pid ||||| time | caller 983# cmd pid ||||| time | caller
571# \ / ||||| \ | / 984# \ / ||||| \ | /
572 ls-4339 0...1 0us+: get_page_from_freelist (__alloc_pages_internal) 985 bash-2042 3d... 0us : _raw_spin_lock_irqsave <-ata_scsi_queuecmd
573 ls-4339 0d..1 3us : rmqueue_bulk (get_page_from_freelist) 986 bash-2042 3d... 0us : add_preempt_count <-_raw_spin_lock_irqsave
574 ls-4339 0d..1 3us : _spin_lock (rmqueue_bulk) 987 bash-2042 3d..1 1us : ata_scsi_find_dev <-ata_scsi_queuecmd
575 ls-4339 0d..1 4us : add_preempt_count (_spin_lock) 988 bash-2042 3d..1 1us : __ata_scsi_find_dev <-ata_scsi_find_dev
576 ls-4339 0d..2 4us : __rmqueue (rmqueue_bulk) 989 bash-2042 3d..1 2us : ata_find_dev.part.14 <-__ata_scsi_find_dev
577 ls-4339 0d..2 5us : __rmqueue_smallest (__rmqueue) 990 bash-2042 3d..1 2us : ata_qc_new_init <-__ata_scsi_queuecmd
578 ls-4339 0d..2 5us : __mod_zone_page_state (__rmqueue_smallest) 991 bash-2042 3d..1 3us : ata_sg_init <-__ata_scsi_queuecmd
579 ls-4339 0d..2 6us : __rmqueue (rmqueue_bulk) 992 bash-2042 3d..1 4us : ata_scsi_rw_xlat <-__ata_scsi_queuecmd
580 ls-4339 0d..2 6us : __rmqueue_smallest (__rmqueue) 993 bash-2042 3d..1 4us : ata_build_rw_tf <-ata_scsi_rw_xlat
581 ls-4339 0d..2 7us : __mod_zone_page_state (__rmqueue_smallest)
582 ls-4339 0d..2 7us : __rmqueue (rmqueue_bulk)
583 ls-4339 0d..2 8us : __rmqueue_smallest (__rmqueue)
584[...] 994[...]
585 ls-4339 0d..2 46us : __rmqueue_smallest (__rmqueue) 995 bash-2042 3d..1 67us : delay_tsc <-__delay
586 ls-4339 0d..2 47us : __mod_zone_page_state (__rmqueue_smallest) 996 bash-2042 3d..1 67us : add_preempt_count <-delay_tsc
587 ls-4339 0d..2 47us : __rmqueue (rmqueue_bulk) 997 bash-2042 3d..2 67us : sub_preempt_count <-delay_tsc
588 ls-4339 0d..2 48us : __rmqueue_smallest (__rmqueue) 998 bash-2042 3d..1 67us : add_preempt_count <-delay_tsc
589 ls-4339 0d..2 48us : __mod_zone_page_state (__rmqueue_smallest) 999 bash-2042 3d..2 68us : sub_preempt_count <-delay_tsc
590 ls-4339 0d..2 49us : _spin_unlock (rmqueue_bulk) 1000 bash-2042 3d..1 68us+: ata_bmdma_start <-ata_bmdma_qc_issue
591 ls-4339 0d..2 49us : sub_preempt_count (_spin_unlock) 1001 bash-2042 3d..1 71us : _raw_spin_unlock_irqrestore <-ata_scsi_queuecmd
592 ls-4339 0d..1 50us : get_page_from_freelist (__alloc_pages_internal) 1002 bash-2042 3d..1 71us : _raw_spin_unlock_irqrestore <-ata_scsi_queuecmd
593 ls-4339 0d..2 51us : trace_hardirqs_on (__alloc_pages_internal) 1003 bash-2042 3d..1 72us+: trace_hardirqs_on <-ata_scsi_queuecmd
594 1004 bash-2042 3d..1 120us : <stack trace>
595 1005 => _raw_spin_unlock_irqrestore
596 1006 => ata_scsi_queuecmd
597Here we traced a 50 microsecond latency. But we also see all the 1007 => scsi_dispatch_cmd
1008 => scsi_request_fn
1009 => __blk_run_queue_uncond
1010 => __blk_run_queue
1011 => blk_queue_bio
1012 => generic_make_request
1013 => submit_bio
1014 => submit_bh
1015 => __ext3_get_inode_loc
1016 => ext3_iget
1017 => ext3_lookup
1018 => lookup_real
1019 => __lookup_hash
1020 => walk_component
1021 => lookup_last
1022 => path_lookupat
1023 => filename_lookup
1024 => user_path_at_empty
1025 => user_path_at
1026 => vfs_fstatat
1027 => vfs_stat
1028 => sys_newstat
1029 => system_call_fastpath
1030
1031
1032Here we traced a 71 microsecond latency. But we also see all the
598functions that were called during that time. Note that by 1033functions that were called during that time. Note that by
599enabling function tracing, we incur an added overhead. This 1034enabling function tracing, we incur an added overhead. This
600overhead may extend the latency times. But nevertheless, this 1035overhead may extend the latency times. But nevertheless, this
@@ -614,120 +1049,122 @@ Like the irqsoff tracer, it records the maximum latency for
614which preemption was disabled. The control of preemptoff tracer 1049which preemption was disabled. The control of preemptoff tracer
615is much like the irqsoff tracer. 1050is much like the irqsoff tracer.
616 1051
1052 # echo 0 > options/function-trace
617 # echo preemptoff > current_tracer 1053 # echo preemptoff > current_tracer
618 # echo latency-format > trace_options
619 # echo 0 > tracing_max_latency
620 # echo 1 > tracing_on 1054 # echo 1 > tracing_on
1055 # echo 0 > tracing_max_latency
621 # ls -ltr 1056 # ls -ltr
622 [...] 1057 [...]
623 # echo 0 > tracing_on 1058 # echo 0 > tracing_on
624 # cat trace 1059 # cat trace
625# tracer: preemptoff 1060# tracer: preemptoff
626# 1061#
627preemptoff latency trace v1.1.5 on 2.6.26-rc8 1062# preemptoff latency trace v1.1.5 on 3.8.0-test+
628-------------------------------------------------------------------- 1063# --------------------------------------------------------------------
629 latency: 29 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 1064# latency: 46 us, #4/4, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
630 ----------------- 1065# -----------------
631 | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0) 1066# | task: sshd-1991 (uid:0 nice:0 policy:0 rt_prio:0)
632 ----------------- 1067# -----------------
633 => started at: do_IRQ 1068# => started at: do_IRQ
634 => ended at: __do_softirq 1069# => ended at: do_IRQ
635 1070#
636# _------=> CPU# 1071#
637# / _-----=> irqs-off 1072# _------=> CPU#
638# | / _----=> need-resched 1073# / _-----=> irqs-off
639# || / _---=> hardirq/softirq 1074# | / _----=> need-resched
640# ||| / _--=> preempt-depth 1075# || / _---=> hardirq/softirq
641# |||| / 1076# ||| / _--=> preempt-depth
642# ||||| delay 1077# |||| / delay
643# cmd pid ||||| time | caller 1078# cmd pid ||||| time | caller
644# \ / ||||| \ | / 1079# \ / ||||| \ | /
645 sshd-4261 0d.h. 0us+: irq_enter (do_IRQ) 1080 sshd-1991 1d.h. 0us+: irq_enter <-do_IRQ
646 sshd-4261 0d.s. 29us : _local_bh_enable (__do_softirq) 1081 sshd-1991 1d..1 46us : irq_exit <-do_IRQ
647 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq) 1082 sshd-1991 1d..1 47us+: trace_preempt_on <-do_IRQ
1083 sshd-1991 1d..1 52us : <stack trace>
1084 => sub_preempt_count
1085 => irq_exit
1086 => do_IRQ
1087 => ret_from_intr
648 1088
649 1089
650This has some more changes. Preemption was disabled when an 1090This has some more changes. Preemption was disabled when an
651interrupt came in (notice the 'h'), and was enabled while doing 1091interrupt came in (notice the 'h'), and was enabled on exit.
652a softirq. (notice the 's'). But we also see that interrupts 1092But we also see that interrupts have been disabled when entering
653have been disabled when entering the preempt off section and 1093the preempt off section and leaving it (the 'd'). We do not know if
 654leaving it (the 'd'). We do not know if interrupts were enabled 1094interrupts were enabled in the meantime or shortly after this
655in the mean time. 1095was over.
656 1096
657# tracer: preemptoff 1097# tracer: preemptoff
658# 1098#
659preemptoff latency trace v1.1.5 on 2.6.26-rc8 1099# preemptoff latency trace v1.1.5 on 3.8.0-test+
660-------------------------------------------------------------------- 1100# --------------------------------------------------------------------
661 latency: 63 us, #87/87, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 1101# latency: 83 us, #241/241, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
662 ----------------- 1102# -----------------
663 | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0) 1103# | task: bash-1994 (uid:0 nice:0 policy:0 rt_prio:0)
664 ----------------- 1104# -----------------
665 => started at: remove_wait_queue 1105# => started at: wake_up_new_task
666 => ended at: __do_softirq 1106# => ended at: task_rq_unlock
667 1107#
668# _------=> CPU# 1108#
669# / _-----=> irqs-off 1109# _------=> CPU#
670# | / _----=> need-resched 1110# / _-----=> irqs-off
671# || / _---=> hardirq/softirq 1111# | / _----=> need-resched
672# ||| / _--=> preempt-depth 1112# || / _---=> hardirq/softirq
673# |||| / 1113# ||| / _--=> preempt-depth
674# ||||| delay 1114# |||| / delay
675# cmd pid ||||| time | caller 1115# cmd pid ||||| time | caller
676# \ / ||||| \ | / 1116# \ / ||||| \ | /
677 sshd-4261 0d..1 0us : _spin_lock_irqsave (remove_wait_queue) 1117 bash-1994 1d..1 0us : _raw_spin_lock_irqsave <-wake_up_new_task
678 sshd-4261 0d..1 1us : _spin_unlock_irqrestore (remove_wait_queue) 1118 bash-1994 1d..1 0us : select_task_rq_fair <-select_task_rq
679 sshd-4261 0d..1 2us : do_IRQ (common_interrupt) 1119 bash-1994 1d..1 1us : __rcu_read_lock <-select_task_rq_fair
680 sshd-4261 0d..1 2us : irq_enter (do_IRQ) 1120 bash-1994 1d..1 1us : source_load <-select_task_rq_fair
681 sshd-4261 0d..1 2us : idle_cpu (irq_enter) 1121 bash-1994 1d..1 1us : source_load <-select_task_rq_fair
682 sshd-4261 0d..1 3us : add_preempt_count (irq_enter)
683 sshd-4261 0d.h1 3us : idle_cpu (irq_enter)
684 sshd-4261 0d.h. 4us : handle_fasteoi_irq (do_IRQ)
685[...] 1122[...]
686 sshd-4261 0d.h. 12us : add_preempt_count (_spin_lock) 1123 bash-1994 1d..1 12us : irq_enter <-smp_apic_timer_interrupt
687 sshd-4261 0d.h1 12us : ack_ioapic_quirk_irq (handle_fasteoi_irq) 1124 bash-1994 1d..1 12us : rcu_irq_enter <-irq_enter
688 sshd-4261 0d.h1 13us : move_native_irq (ack_ioapic_quirk_irq) 1125 bash-1994 1d..1 13us : add_preempt_count <-irq_enter
689 sshd-4261 0d.h1 13us : _spin_unlock (handle_fasteoi_irq) 1126 bash-1994 1d.h1 13us : exit_idle <-smp_apic_timer_interrupt
690 sshd-4261 0d.h1 14us : sub_preempt_count (_spin_unlock) 1127 bash-1994 1d.h1 13us : hrtimer_interrupt <-smp_apic_timer_interrupt
691 sshd-4261 0d.h1 14us : irq_exit (do_IRQ) 1128 bash-1994 1d.h1 13us : _raw_spin_lock <-hrtimer_interrupt
692 sshd-4261 0d.h1 15us : sub_preempt_count (irq_exit) 1129 bash-1994 1d.h1 14us : add_preempt_count <-_raw_spin_lock
693 sshd-4261 0d..2 15us : do_softirq (irq_exit) 1130 bash-1994 1d.h2 14us : ktime_get_update_offsets <-hrtimer_interrupt
694 sshd-4261 0d... 15us : __do_softirq (do_softirq)
695 sshd-4261 0d... 16us : __local_bh_disable (__do_softirq)
696 sshd-4261 0d... 16us+: add_preempt_count (__local_bh_disable)
697 sshd-4261 0d.s4 20us : add_preempt_count (__local_bh_disable)
698 sshd-4261 0d.s4 21us : sub_preempt_count (local_bh_enable)
699 sshd-4261 0d.s5 21us : sub_preempt_count (local_bh_enable)
700[...] 1131[...]
701 sshd-4261 0d.s6 41us : add_preempt_count (__local_bh_disable) 1132 bash-1994 1d.h1 35us : lapic_next_event <-clockevents_program_event
702 sshd-4261 0d.s6 42us : sub_preempt_count (local_bh_enable) 1133 bash-1994 1d.h1 35us : irq_exit <-smp_apic_timer_interrupt
703 sshd-4261 0d.s7 42us : sub_preempt_count (local_bh_enable) 1134 bash-1994 1d.h1 36us : sub_preempt_count <-irq_exit
704 sshd-4261 0d.s5 43us : add_preempt_count (__local_bh_disable) 1135 bash-1994 1d..2 36us : do_softirq <-irq_exit
705 sshd-4261 0d.s5 43us : sub_preempt_count (local_bh_enable_ip) 1136 bash-1994 1d..2 36us : __do_softirq <-call_softirq
706 sshd-4261 0d.s6 44us : sub_preempt_count (local_bh_enable_ip) 1137 bash-1994 1d..2 36us : __local_bh_disable <-__do_softirq
707 sshd-4261 0d.s5 44us : add_preempt_count (__local_bh_disable) 1138 bash-1994 1d.s2 37us : add_preempt_count <-_raw_spin_lock_irq
708 sshd-4261 0d.s5 45us : sub_preempt_count (local_bh_enable) 1139 bash-1994 1d.s3 38us : _raw_spin_unlock <-run_timer_softirq
1140 bash-1994 1d.s3 39us : sub_preempt_count <-_raw_spin_unlock
1141 bash-1994 1d.s2 39us : call_timer_fn <-run_timer_softirq
709[...] 1142[...]
710 sshd-4261 0d.s. 63us : _local_bh_enable (__do_softirq) 1143 bash-1994 1dNs2 81us : cpu_needs_another_gp <-rcu_process_callbacks
711 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq) 1144 bash-1994 1dNs2 82us : __local_bh_enable <-__do_softirq
1145 bash-1994 1dNs2 82us : sub_preempt_count <-__local_bh_enable
1146 bash-1994 1dN.2 82us : idle_cpu <-irq_exit
1147 bash-1994 1dN.2 83us : rcu_irq_exit <-irq_exit
1148 bash-1994 1dN.2 83us : sub_preempt_count <-irq_exit
1149 bash-1994 1.N.1 84us : _raw_spin_unlock_irqrestore <-task_rq_unlock
1150 bash-1994 1.N.1 84us+: trace_preempt_on <-task_rq_unlock
1151 bash-1994 1.N.1 104us : <stack trace>
1152 => sub_preempt_count
1153 => _raw_spin_unlock_irqrestore
1154 => task_rq_unlock
1155 => wake_up_new_task
1156 => do_fork
1157 => sys_clone
1158 => stub_clone
712 1159
713 1160
714The above is an example of the preemptoff trace with 1161The above is an example of the preemptoff trace with
715ftrace_enabled set. Here we see that interrupts were disabled 1162function-trace set. Here we see that interrupts were not disabled
716the entire time. The irq_enter code lets us know that we entered 1163the entire time. The irq_enter code lets us know that we entered
717an interrupt 'h'. Before that, the functions being traced still 1164an interrupt 'h'. Before that, the functions being traced still
718show that it is not in an interrupt, but we can see from the 1165show that it is not in an interrupt, but we can see from the
719functions themselves that this is not the case. 1166functions themselves that this is not the case.
720 1167
721Notice that __do_softirq when called does not have a
722preempt_count. It may seem that we missed a preempt enabling.
723What really happened is that the preempt count is held on the
724thread's stack and we switched to the softirq stack (4K stacks
725in effect). The code does not copy the preempt count, but
726because interrupts are disabled, we do not need to worry about
727it. Having a tracer like this is good for letting people know
728what really happens inside the kernel.
729
730
731preemptirqsoff 1168preemptirqsoff
732-------------- 1169--------------
733 1170
@@ -762,38 +1199,57 @@ tracer.
762Again, using this trace is much like the irqsoff and preemptoff 1199Again, using this trace is much like the irqsoff and preemptoff
763tracers. 1200tracers.
764 1201
1202 # echo 0 > options/function-trace
765 # echo preemptirqsoff > current_tracer 1203 # echo preemptirqsoff > current_tracer
766 # echo latency-format > trace_options
767 # echo 0 > tracing_max_latency
768 # echo 1 > tracing_on 1204 # echo 1 > tracing_on
1205 # echo 0 > tracing_max_latency
769 # ls -ltr 1206 # ls -ltr
770 [...] 1207 [...]
771 # echo 0 > tracing_on 1208 # echo 0 > tracing_on
772 # cat trace 1209 # cat trace
773# tracer: preemptirqsoff 1210# tracer: preemptirqsoff
774# 1211#
775preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 1212# preemptirqsoff latency trace v1.1.5 on 3.8.0-test+
776-------------------------------------------------------------------- 1213# --------------------------------------------------------------------
777 latency: 293 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 1214# latency: 100 us, #4/4, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
778 ----------------- 1215# -----------------
779 | task: ls-4860 (uid:0 nice:0 policy:0 rt_prio:0) 1216# | task: ls-2230 (uid:0 nice:0 policy:0 rt_prio:0)
780 ----------------- 1217# -----------------
781 => started at: apic_timer_interrupt 1218# => started at: ata_scsi_queuecmd
782 => ended at: __do_softirq 1219# => ended at: ata_scsi_queuecmd
783 1220#
784# _------=> CPU# 1221#
785# / _-----=> irqs-off 1222# _------=> CPU#
786# | / _----=> need-resched 1223# / _-----=> irqs-off
787# || / _---=> hardirq/softirq 1224# | / _----=> need-resched
788# ||| / _--=> preempt-depth 1225# || / _---=> hardirq/softirq
789# |||| / 1226# ||| / _--=> preempt-depth
790# ||||| delay 1227# |||| / delay
791# cmd pid ||||| time | caller 1228# cmd pid ||||| time | caller
792# \ / ||||| \ | / 1229# \ / ||||| \ | /
793 ls-4860 0d... 0us!: trace_hardirqs_off_thunk (apic_timer_interrupt) 1230 ls-2230 3d... 0us+: _raw_spin_lock_irqsave <-ata_scsi_queuecmd
794 ls-4860 0d.s. 294us : _local_bh_enable (__do_softirq) 1231 ls-2230 3...1 100us : _raw_spin_unlock_irqrestore <-ata_scsi_queuecmd
795 ls-4860 0d.s1 294us : trace_preempt_on (__do_softirq) 1232 ls-2230 3...1 101us+: trace_preempt_on <-ata_scsi_queuecmd
796 1233 ls-2230 3...1 111us : <stack trace>
1234 => sub_preempt_count
1235 => _raw_spin_unlock_irqrestore
1236 => ata_scsi_queuecmd
1237 => scsi_dispatch_cmd
1238 => scsi_request_fn
1239 => __blk_run_queue_uncond
1240 => __blk_run_queue
1241 => blk_queue_bio
1242 => generic_make_request
1243 => submit_bio
1244 => submit_bh
1245 => ext3_bread
1246 => ext3_dir_bread
1247 => htree_dirblock_to_tree
1248 => ext3_htree_fill_tree
1249 => ext3_readdir
1250 => vfs_readdir
1251 => sys_getdents
1252 => system_call_fastpath
797 1253
 798 1254
 799The trace_hardirqs_off_thunk is called from assembly on x86 when 1255The trace_hardirqs_off_thunk is called from assembly on x86 when
@@ -802,105 +1258,158 @@ function tracing, we do not know if interrupts were enabled
802within the preemption points. We do see that it started with 1258within the preemption points. We do see that it started with
803preemption enabled. 1259preemption enabled.
804 1260
805Here is a trace with ftrace_enabled set: 1261Here is a trace with function-trace set:
806
807 1262
808# tracer: preemptirqsoff 1263# tracer: preemptirqsoff
809# 1264#
810preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 1265# preemptirqsoff latency trace v1.1.5 on 3.8.0-test+
811-------------------------------------------------------------------- 1266# --------------------------------------------------------------------
812 latency: 105 us, #183/183, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 1267# latency: 161 us, #339/339, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
813 ----------------- 1268# -----------------
814 | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0) 1269# | task: ls-2269 (uid:0 nice:0 policy:0 rt_prio:0)
815 ----------------- 1270# -----------------
816 => started at: write_chan 1271# => started at: schedule
817 => ended at: __do_softirq 1272# => ended at: mutex_unlock
818 1273#
819# _------=> CPU# 1274#
820# / _-----=> irqs-off 1275# _------=> CPU#
821# | / _----=> need-resched 1276# / _-----=> irqs-off
822# || / _---=> hardirq/softirq 1277# | / _----=> need-resched
823# ||| / _--=> preempt-depth 1278# || / _---=> hardirq/softirq
824# |||| / 1279# ||| / _--=> preempt-depth
825# ||||| delay 1280# |||| / delay
826# cmd pid ||||| time | caller 1281# cmd pid ||||| time | caller
827# \ / ||||| \ | / 1282# \ / ||||| \ | /
828 ls-4473 0.N.. 0us : preempt_schedule (write_chan) 1283kworker/-59 3...1 0us : __schedule <-schedule
829 ls-4473 0dN.1 1us : _spin_lock (schedule) 1284kworker/-59 3d..1 0us : rcu_preempt_qs <-rcu_note_context_switch
830 ls-4473 0dN.1 2us : add_preempt_count (_spin_lock) 1285kworker/-59 3d..1 1us : add_preempt_count <-_raw_spin_lock_irq
831 ls-4473 0d..2 2us : put_prev_task_fair (schedule) 1286kworker/-59 3d..2 1us : deactivate_task <-__schedule
832[...] 1287kworker/-59 3d..2 1us : dequeue_task <-deactivate_task
833 ls-4473 0d..2 13us : set_normalized_timespec (ktime_get_ts) 1288kworker/-59 3d..2 2us : update_rq_clock <-dequeue_task
834 ls-4473 0d..2 13us : __switch_to (schedule) 1289kworker/-59 3d..2 2us : dequeue_task_fair <-dequeue_task
835 sshd-4261 0d..2 14us : finish_task_switch (schedule) 1290kworker/-59 3d..2 2us : update_curr <-dequeue_task_fair
836 sshd-4261 0d..2 14us : _spin_unlock_irq (finish_task_switch) 1291kworker/-59 3d..2 2us : update_min_vruntime <-update_curr
837 sshd-4261 0d..1 15us : add_preempt_count (_spin_lock_irqsave) 1292kworker/-59 3d..2 3us : cpuacct_charge <-update_curr
838 sshd-4261 0d..2 16us : _spin_unlock_irqrestore (hrtick_set) 1293kworker/-59 3d..2 3us : __rcu_read_lock <-cpuacct_charge
839 sshd-4261 0d..2 16us : do_IRQ (common_interrupt) 1294kworker/-59 3d..2 3us : __rcu_read_unlock <-cpuacct_charge
840 sshd-4261 0d..2 17us : irq_enter (do_IRQ) 1295kworker/-59 3d..2 3us : update_cfs_rq_blocked_load <-dequeue_task_fair
841 sshd-4261 0d..2 17us : idle_cpu (irq_enter) 1296kworker/-59 3d..2 4us : clear_buddies <-dequeue_task_fair
842 sshd-4261 0d..2 18us : add_preempt_count (irq_enter) 1297kworker/-59 3d..2 4us : account_entity_dequeue <-dequeue_task_fair
843 sshd-4261 0d.h2 18us : idle_cpu (irq_enter) 1298kworker/-59 3d..2 4us : update_min_vruntime <-dequeue_task_fair
844 sshd-4261 0d.h. 18us : handle_fasteoi_irq (do_IRQ) 1299kworker/-59 3d..2 4us : update_cfs_shares <-dequeue_task_fair
845 sshd-4261 0d.h. 19us : _spin_lock (handle_fasteoi_irq) 1300kworker/-59 3d..2 5us : hrtick_update <-dequeue_task_fair
846 sshd-4261 0d.h. 19us : add_preempt_count (_spin_lock) 1301kworker/-59 3d..2 5us : wq_worker_sleeping <-__schedule
847 sshd-4261 0d.h1 20us : _spin_unlock (handle_fasteoi_irq) 1302kworker/-59 3d..2 5us : kthread_data <-wq_worker_sleeping
848 sshd-4261 0d.h1 20us : sub_preempt_count (_spin_unlock) 1303kworker/-59 3d..2 5us : put_prev_task_fair <-__schedule
849[...] 1304kworker/-59 3d..2 6us : pick_next_task_fair <-pick_next_task
850 sshd-4261 0d.h1 28us : _spin_unlock (handle_fasteoi_irq) 1305kworker/-59 3d..2 6us : clear_buddies <-pick_next_task_fair
851 sshd-4261 0d.h1 29us : sub_preempt_count (_spin_unlock) 1306kworker/-59 3d..2 6us : set_next_entity <-pick_next_task_fair
852 sshd-4261 0d.h2 29us : irq_exit (do_IRQ) 1307kworker/-59 3d..2 6us : update_stats_wait_end <-set_next_entity
853 sshd-4261 0d.h2 29us : sub_preempt_count (irq_exit) 1308 ls-2269 3d..2 7us : finish_task_switch <-__schedule
854 sshd-4261 0d..3 30us : do_softirq (irq_exit) 1309 ls-2269 3d..2 7us : _raw_spin_unlock_irq <-finish_task_switch
855 sshd-4261 0d... 30us : __do_softirq (do_softirq) 1310 ls-2269 3d..2 8us : do_IRQ <-ret_from_intr
856 sshd-4261 0d... 31us : __local_bh_disable (__do_softirq) 1311 ls-2269 3d..2 8us : irq_enter <-do_IRQ
857 sshd-4261 0d... 31us+: add_preempt_count (__local_bh_disable) 1312 ls-2269 3d..2 8us : rcu_irq_enter <-irq_enter
858 sshd-4261 0d.s4 34us : add_preempt_count (__local_bh_disable) 1313 ls-2269 3d..2 9us : add_preempt_count <-irq_enter
1314 ls-2269 3d.h2 9us : exit_idle <-do_IRQ
859[...] 1315[...]
860 sshd-4261 0d.s3 43us : sub_preempt_count (local_bh_enable_ip) 1316 ls-2269 3d.h3 20us : sub_preempt_count <-_raw_spin_unlock
861 sshd-4261 0d.s4 44us : sub_preempt_count (local_bh_enable_ip) 1317 ls-2269 3d.h2 20us : irq_exit <-do_IRQ
862 sshd-4261 0d.s3 44us : smp_apic_timer_interrupt (apic_timer_interrupt) 1318 ls-2269 3d.h2 21us : sub_preempt_count <-irq_exit
863 sshd-4261 0d.s3 45us : irq_enter (smp_apic_timer_interrupt) 1319 ls-2269 3d..3 21us : do_softirq <-irq_exit
864 sshd-4261 0d.s3 45us : idle_cpu (irq_enter) 1320 ls-2269 3d..3 21us : __do_softirq <-call_softirq
865 sshd-4261 0d.s3 46us : add_preempt_count (irq_enter) 1321 ls-2269 3d..3 21us+: __local_bh_disable <-__do_softirq
866 sshd-4261 0d.H3 46us : idle_cpu (irq_enter) 1322 ls-2269 3d.s4 29us : sub_preempt_count <-_local_bh_enable_ip
867 sshd-4261 0d.H3 47us : hrtimer_interrupt (smp_apic_timer_interrupt) 1323 ls-2269 3d.s5 29us : sub_preempt_count <-_local_bh_enable_ip
868 sshd-4261 0d.H3 47us : ktime_get (hrtimer_interrupt) 1324 ls-2269 3d.s5 31us : do_IRQ <-ret_from_intr
1325 ls-2269 3d.s5 31us : irq_enter <-do_IRQ
1326 ls-2269 3d.s5 31us : rcu_irq_enter <-irq_enter
869[...] 1327[...]
870 sshd-4261 0d.H3 81us : tick_program_event (hrtimer_interrupt) 1328 ls-2269 3d.s5 31us : rcu_irq_enter <-irq_enter
871 sshd-4261 0d.H3 82us : ktime_get (tick_program_event) 1329 ls-2269 3d.s5 32us : add_preempt_count <-irq_enter
872 sshd-4261 0d.H3 82us : ktime_get_ts (ktime_get) 1330 ls-2269 3d.H5 32us : exit_idle <-do_IRQ
873 sshd-4261 0d.H3 83us : getnstimeofday (ktime_get_ts) 1331 ls-2269 3d.H5 32us : handle_irq <-do_IRQ
874 sshd-4261 0d.H3 83us : set_normalized_timespec (ktime_get_ts) 1332 ls-2269 3d.H5 32us : irq_to_desc <-handle_irq
875 sshd-4261 0d.H3 84us : clockevents_program_event (tick_program_event) 1333 ls-2269 3d.H5 33us : handle_fasteoi_irq <-handle_irq
876 sshd-4261 0d.H3 84us : lapic_next_event (clockevents_program_event)
877 sshd-4261 0d.H3 85us : irq_exit (smp_apic_timer_interrupt)
878 sshd-4261 0d.H3 85us : sub_preempt_count (irq_exit)
879 sshd-4261 0d.s4 86us : sub_preempt_count (irq_exit)
880 sshd-4261 0d.s3 86us : add_preempt_count (__local_bh_disable)
881[...] 1334[...]
882 sshd-4261 0d.s1 98us : sub_preempt_count (net_rx_action) 1335 ls-2269 3d.s5 158us : _raw_spin_unlock_irqrestore <-rtl8139_poll
883 sshd-4261 0d.s. 99us : add_preempt_count (_spin_lock_irq) 1336 ls-2269 3d.s3 158us : net_rps_action_and_irq_enable.isra.65 <-net_rx_action
884 sshd-4261 0d.s1 99us+: _spin_unlock_irq (run_timer_softirq) 1337 ls-2269 3d.s3 159us : __local_bh_enable <-__do_softirq
885 sshd-4261 0d.s. 104us : _local_bh_enable (__do_softirq) 1338 ls-2269 3d.s3 159us : sub_preempt_count <-__local_bh_enable
886 sshd-4261 0d.s. 104us : sub_preempt_count (_local_bh_enable) 1339 ls-2269 3d..3 159us : idle_cpu <-irq_exit
887 sshd-4261 0d.s. 105us : _local_bh_enable (__do_softirq) 1340 ls-2269 3d..3 159us : rcu_irq_exit <-irq_exit
888 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq) 1341 ls-2269 3d..3 160us : sub_preempt_count <-irq_exit
889 1342 ls-2269 3d... 161us : __mutex_unlock_slowpath <-mutex_unlock
890 1343 ls-2269 3d... 162us+: trace_hardirqs_on <-mutex_unlock
891This is a very interesting trace. It started with the preemption 1344 ls-2269 3d... 186us : <stack trace>
892of the ls task. We see that the task had the "need_resched" bit 1345 => __mutex_unlock_slowpath
893set via the 'N' in the trace. Interrupts were disabled before 1346 => mutex_unlock
894the spin_lock at the beginning of the trace. We see that a 1347 => process_output
895schedule took place to run sshd. When the interrupts were 1348 => n_tty_write
896enabled, we took an interrupt. On return from the interrupt 1349 => tty_write
897handler, the softirq ran. We took another interrupt while 1350 => vfs_write
898running the softirq as we see from the capital 'H'. 1351 => sys_write
1352 => system_call_fastpath
1353
1354This is an interesting trace. It started with kworker running and
1355scheduling out and ls taking over. But as soon as ls released the
1356rq lock and enabled interrupts (but not preemption), an interrupt
1357triggered. When the interrupt finished, it started running softirqs.
1358But while the softirq was running, another interrupt triggered.
1359When an interrupt is running inside a softirq, the annotation is 'H'.
899 1360
900 1361
901wakeup 1362wakeup
902------ 1363------
903 1364
1365One common case that people are interested in tracing is the
1366time it takes from when a task is woken to when it actually starts
1367running. For non Real-Time tasks, this can be arbitrary. But tracing
1368it nonetheless can be interesting.
1369
1370Without function tracing:
1371
1372 # echo 0 > options/function-trace
1373 # echo wakeup > current_tracer
1374 # echo 1 > tracing_on
1375 # echo 0 > tracing_max_latency
1376 # chrt -f 5 sleep 1
1377 # echo 0 > tracing_on
1378 # cat trace
1379# tracer: wakeup
1380#
1381# wakeup latency trace v1.1.5 on 3.8.0-test+
1382# --------------------------------------------------------------------
1383# latency: 15 us, #4/4, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
1384# -----------------
1385# | task: kworker/3:1H-312 (uid:0 nice:-20 policy:0 rt_prio:0)
1386# -----------------
1387#
1388# _------=> CPU#
1389# / _-----=> irqs-off
1390# | / _----=> need-resched
1391# || / _---=> hardirq/softirq
1392# ||| / _--=> preempt-depth
1393# |||| / delay
1394# cmd pid ||||| time | caller
1395# \ / ||||| \ | /
1396 <idle>-0 3dNs7 0us : 0:120:R + [003] 312:100:R kworker/3:1H
1397 <idle>-0 3dNs7 1us+: ttwu_do_activate.constprop.87 <-try_to_wake_up
1398 <idle>-0 3d..3 15us : __schedule <-schedule
1399 <idle>-0 3d..3 15us : 0:120:R ==> [003] 312:100:R kworker/3:1H
1400
1401The tracer only traces the highest priority task in the system
1402to avoid tracing the normal, uninteresting wakeups. Here we see that
1403the kworker with a nice priority of -20 (not very nice) took
1404just 15 microseconds from the time it woke up to the time it
1405ran.
1406
1407Non Real-Time tasks are not that interesting. A more interesting
1408trace is to concentrate only on Real-Time tasks.
1409
1410wakeup_rt
1411---------
1412
904In a Real-Time environment it is very important to know the 1413In a Real-Time environment it is very important to know the
905wakeup time it takes for the highest priority task that is woken 1414wakeup time it takes for the highest priority task that is woken
906up to the time that it executes. This is also known as "schedule 1415up to the time that it executes. This is also known as "schedule
@@ -914,124 +1423,229 @@ Real-Time environments are interested in the worst case latency.
914That is the longest latency it takes for something to happen, 1423That is the longest latency it takes for something to happen,
915and not the average. We can have a very fast scheduler that may 1424and not the average. We can have a very fast scheduler that may
916only have a large latency once in a while, but that would not 1425only have a large latency once in a while, but that would not
917work well with Real-Time tasks. The wakeup tracer was designed 1426work well with Real-Time tasks. The wakeup_rt tracer was designed
918to record the worst case wakeups of RT tasks. Non-RT tasks are 1427to record the worst case wakeups of RT tasks. Non-RT tasks are
919not recorded because the tracer only records one worst case and 1428not recorded because the tracer only records one worst case and
920tracing non-RT tasks that are unpredictable will overwrite the 1429tracing non-RT tasks that are unpredictable will overwrite the
921worst case latency of RT tasks. 1430worst case latency of RT tasks (just run the normal wakeup
1431tracer for a while to see that effect).
922 1432
923Since this tracer only deals with RT tasks, we will run this 1433Since this tracer only deals with RT tasks, we will run this
924slightly differently than we did with the previous tracers. 1434slightly differently than we did with the previous tracers.
925Instead of performing an 'ls', we will run 'sleep 1' under 1435Instead of performing an 'ls', we will run 'sleep 1' under
926'chrt' which changes the priority of the task. 1436'chrt' which changes the priority of the task.
927 1437
928 # echo wakeup > current_tracer 1438 # echo 0 > options/function-trace
929 # echo latency-format > trace_options 1439 # echo wakeup_rt > current_tracer
930 # echo 0 > tracing_max_latency
931 # echo 1 > tracing_on 1440 # echo 1 > tracing_on
1441 # echo 0 > tracing_max_latency
932 # chrt -f 5 sleep 1 1442 # chrt -f 5 sleep 1
933 # echo 0 > tracing_on 1443 # echo 0 > tracing_on
934 # cat trace 1444 # cat trace
935# tracer: wakeup 1445# tracer: wakeup
936# 1446#
937wakeup latency trace v1.1.5 on 2.6.26-rc8 1447# tracer: wakeup_rt
938-------------------------------------------------------------------- 1448#
939 latency: 4 us, #2/2, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 1449# wakeup_rt latency trace v1.1.5 on 3.8.0-test+
940 ----------------- 1450# --------------------------------------------------------------------
941 | task: sleep-4901 (uid:0 nice:0 policy:1 rt_prio:5) 1451# latency: 5 us, #4/4, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
942 ----------------- 1452# -----------------
943 1453# | task: sleep-2389 (uid:0 nice:0 policy:1 rt_prio:5)
944# _------=> CPU# 1454# -----------------
945# / _-----=> irqs-off 1455#
946# | / _----=> need-resched 1456# _------=> CPU#
947# || / _---=> hardirq/softirq 1457# / _-----=> irqs-off
948# ||| / _--=> preempt-depth 1458# | / _----=> need-resched
949# |||| / 1459# || / _---=> hardirq/softirq
950# ||||| delay 1460# ||| / _--=> preempt-depth
951# cmd pid ||||| time | caller 1461# |||| / delay
952# \ / ||||| \ | / 1462# cmd pid ||||| time | caller
953 <idle>-0 1d.h4 0us+: try_to_wake_up (wake_up_process) 1463# \ / ||||| \ | /
954 <idle>-0 1d..4 4us : schedule (cpu_idle) 1464 <idle>-0 3d.h4 0us : 0:120:R + [003] 2389: 94:R sleep
955 1465 <idle>-0 3d.h4 1us+: ttwu_do_activate.constprop.87 <-try_to_wake_up
956 1466 <idle>-0 3d..3 5us : __schedule <-schedule
957Running this on an idle system, we see that it only took 4 1467 <idle>-0 3d..3 5us : 0:120:R ==> [003] 2389: 94:R sleep
958microseconds to perform the task switch. Note, since the trace 1468
959marker in the schedule is before the actual "switch", we stop 1469
960the tracing when the recorded task is about to schedule in. This 1470Running this on an idle system, we see that it only took 5 microseconds
961may change if we add a new marker at the end of the scheduler. 1471to perform the task switch. Note, since the trace point in the schedule
962 1472is before the actual "switch", we stop the tracing when the recorded task
963Notice that the recorded task is 'sleep' with the PID of 4901 1473is about to schedule in. This may change if we add a new marker at the
1474end of the scheduler.
1475
1476Notice that the recorded task is 'sleep' with the PID of 2389
964and it has an rt_prio of 5. This priority is user-space priority 1477and it has an rt_prio of 5. This priority is user-space priority
965and not the internal kernel priority. The policy is 1 for 1478and not the internal kernel priority. The policy is 1 for
966SCHED_FIFO and 2 for SCHED_RR. 1479SCHED_FIFO and 2 for SCHED_RR.
967 1480
 968Doing the same with chrt -r 5 and ftrace_enabled set. 1481Note that the trace data shows the internal priority (99 - rtprio).
969 1482
970# tracer: wakeup 1483 <idle>-0 3d..3 5us : 0:120:R ==> [003] 2389: 94:R sleep
1484
1485The 0:120:R means idle was running with a nice priority of 0 (120 - 120 = 0)
1486and in the running state 'R'. The sleep task was scheduled in with
14872389: 94:R. That is, the priority is the kernel rtprio (99 - 5 = 94)
1488and it too is in the running state.
1489
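As a quick check of that mapping (illustrative numbers, not taken
from a captured trace), a task started with

  # chrt -f 40 sleep 1

would be reported with an internal priority of 99 - 40 = 59, so its
field would read something like 2389: 59:R instead.
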
1490Doing the same with chrt -f 5 and function-trace set.
1491
 1492 # echo 1 > options/function-trace
1493
1494# tracer: wakeup_rt
971# 1495#
972wakeup latency trace v1.1.5 on 2.6.26-rc8 1496# wakeup_rt latency trace v1.1.5 on 3.8.0-test+
973-------------------------------------------------------------------- 1497# --------------------------------------------------------------------
974 latency: 50 us, #60/60, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) 1498# latency: 29 us, #85/85, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
975 ----------------- 1499# -----------------
976 | task: sleep-4068 (uid:0 nice:0 policy:2 rt_prio:5) 1500# | task: sleep-2448 (uid:0 nice:0 policy:1 rt_prio:5)
977 ----------------- 1501# -----------------
978 1502#
979# _------=> CPU# 1503# _------=> CPU#
980# / _-----=> irqs-off 1504# / _-----=> irqs-off
981# | / _----=> need-resched 1505# | / _----=> need-resched
982# || / _---=> hardirq/softirq 1506# || / _---=> hardirq/softirq
983# ||| / _--=> preempt-depth 1507# ||| / _--=> preempt-depth
984# |||| / 1508# |||| / delay
985# ||||| delay 1509# cmd pid ||||| time | caller
986# cmd pid ||||| time | caller 1510# \ / ||||| \ | /
987# \ / ||||| \ | / 1511 <idle>-0 3d.h4 1us+: 0:120:R + [003] 2448: 94:R sleep
988ksoftirq-7 1d.H3 0us : try_to_wake_up (wake_up_process) 1512 <idle>-0 3d.h4 2us : ttwu_do_activate.constprop.87 <-try_to_wake_up
989ksoftirq-7 1d.H4 1us : sub_preempt_count (marker_probe_cb) 1513 <idle>-0 3d.h3 3us : check_preempt_curr <-ttwu_do_wakeup
990ksoftirq-7 1d.H3 2us : check_preempt_wakeup (try_to_wake_up) 1514 <idle>-0 3d.h3 3us : resched_task <-check_preempt_curr
991ksoftirq-7 1d.H3 3us : update_curr (check_preempt_wakeup) 1515 <idle>-0 3dNh3 4us : task_woken_rt <-ttwu_do_wakeup
992ksoftirq-7 1d.H3 4us : calc_delta_mine (update_curr) 1516 <idle>-0 3dNh3 4us : _raw_spin_unlock <-try_to_wake_up
993ksoftirq-7 1d.H3 5us : __resched_task (check_preempt_wakeup) 1517 <idle>-0 3dNh3 4us : sub_preempt_count <-_raw_spin_unlock
994ksoftirq-7 1d.H3 6us : task_wake_up_rt (try_to_wake_up) 1518 <idle>-0 3dNh2 5us : ttwu_stat <-try_to_wake_up
995ksoftirq-7 1d.H3 7us : _spin_unlock_irqrestore (try_to_wake_up) 1519 <idle>-0 3dNh2 5us : _raw_spin_unlock_irqrestore <-try_to_wake_up
996[...] 1520 <idle>-0 3dNh2 6us : sub_preempt_count <-_raw_spin_unlock_irqrestore
997ksoftirq-7 1d.H2 17us : irq_exit (smp_apic_timer_interrupt) 1521 <idle>-0 3dNh1 6us : _raw_spin_lock <-__run_hrtimer
998ksoftirq-7 1d.H2 18us : sub_preempt_count (irq_exit) 1522 <idle>-0 3dNh1 6us : add_preempt_count <-_raw_spin_lock
999ksoftirq-7 1d.s3 19us : sub_preempt_count (irq_exit) 1523 <idle>-0 3dNh2 7us : _raw_spin_unlock <-hrtimer_interrupt
1000ksoftirq-7 1..s2 20us : rcu_process_callbacks (__do_softirq) 1524 <idle>-0 3dNh2 7us : sub_preempt_count <-_raw_spin_unlock
1001[...] 1525 <idle>-0 3dNh1 7us : tick_program_event <-hrtimer_interrupt
1002ksoftirq-7 1..s2 26us : __rcu_process_callbacks (rcu_process_callbacks) 1526 <idle>-0 3dNh1 7us : clockevents_program_event <-tick_program_event
1003ksoftirq-7 1d.s2 27us : _local_bh_enable (__do_softirq) 1527 <idle>-0 3dNh1 8us : ktime_get <-clockevents_program_event
1004ksoftirq-7 1d.s2 28us : sub_preempt_count (_local_bh_enable) 1528 <idle>-0 3dNh1 8us : lapic_next_event <-clockevents_program_event
1005ksoftirq-7 1.N.3 29us : sub_preempt_count (ksoftirqd) 1529 <idle>-0 3dNh1 8us : irq_exit <-smp_apic_timer_interrupt
1006ksoftirq-7 1.N.2 30us : _cond_resched (ksoftirqd) 1530 <idle>-0 3dNh1 9us : sub_preempt_count <-irq_exit
1007ksoftirq-7 1.N.2 31us : __cond_resched (_cond_resched) 1531 <idle>-0 3dN.2 9us : idle_cpu <-irq_exit
1008ksoftirq-7 1.N.2 32us : add_preempt_count (__cond_resched) 1532 <idle>-0 3dN.2 9us : rcu_irq_exit <-irq_exit
1009ksoftirq-7 1.N.2 33us : schedule (__cond_resched) 1533 <idle>-0 3dN.2 10us : rcu_eqs_enter_common.isra.45 <-rcu_irq_exit
1010ksoftirq-7 1.N.2 33us : add_preempt_count (schedule) 1534 <idle>-0 3dN.2 10us : sub_preempt_count <-irq_exit
1011ksoftirq-7 1.N.3 34us : hrtick_clear (schedule) 1535 <idle>-0 3.N.1 11us : rcu_idle_exit <-cpu_idle
1012ksoftirq-7 1dN.3 35us : _spin_lock (schedule) 1536 <idle>-0 3dN.1 11us : rcu_eqs_exit_common.isra.43 <-rcu_idle_exit
1013ksoftirq-7 1dN.3 36us : add_preempt_count (_spin_lock) 1537 <idle>-0 3.N.1 11us : tick_nohz_idle_exit <-cpu_idle
1014ksoftirq-7 1d..4 37us : put_prev_task_fair (schedule) 1538 <idle>-0 3dN.1 12us : menu_hrtimer_cancel <-tick_nohz_idle_exit
1015ksoftirq-7 1d..4 38us : update_curr (put_prev_task_fair) 1539 <idle>-0 3dN.1 12us : ktime_get <-tick_nohz_idle_exit
1016[...] 1540 <idle>-0 3dN.1 12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit
1017ksoftirq-7 1d..5 47us : _spin_trylock (tracing_record_cmdline) 1541 <idle>-0 3dN.1 13us : update_cpu_load_nohz <-tick_nohz_idle_exit
1018ksoftirq-7 1d..5 48us : add_preempt_count (_spin_trylock) 1542 <idle>-0 3dN.1 13us : _raw_spin_lock <-update_cpu_load_nohz
1019ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline) 1543 <idle>-0 3dN.1 13us : add_preempt_count <-_raw_spin_lock
1020ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock) 1544 <idle>-0 3dN.2 13us : __update_cpu_load <-update_cpu_load_nohz
1021ksoftirq-7 1d..4 50us : schedule (__cond_resched) 1545 <idle>-0 3dN.2 14us : sched_avg_update <-__update_cpu_load
1022 1546 <idle>-0 3dN.2 14us : _raw_spin_unlock <-update_cpu_load_nohz
1023The interrupt went off while running ksoftirqd. This task runs 1547 <idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock
1024at SCHED_OTHER. Why did not we see the 'N' set early? This may 1548 <idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit
1025be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K 1549 <idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
1026stacks configured, the interrupt and softirq run with their own 1550 <idle>-0 3dN.1 15us : hrtimer_cancel <-tick_nohz_idle_exit
1027stack. Some information is held on the top of the task's stack 1551 <idle>-0 3dN.1 15us : hrtimer_try_to_cancel <-hrtimer_cancel
1028(need_resched and preempt_count are both stored there). The 1552 <idle>-0 3dN.1 16us : lock_hrtimer_base.isra.18 <-hrtimer_try_to_cancel
1029setting of the NEED_RESCHED bit is done directly to the task's 1553 <idle>-0 3dN.1 16us : _raw_spin_lock_irqsave <-lock_hrtimer_base.isra.18
1030stack, but the reading of the NEED_RESCHED is done by looking at 1554 <idle>-0 3dN.1 16us : add_preempt_count <-_raw_spin_lock_irqsave
1031the current stack, which in this case is the stack for the hard 1555 <idle>-0 3dN.2 17us : __remove_hrtimer <-remove_hrtimer.part.16
1032interrupt. This hides the fact that NEED_RESCHED has been set. 1556 <idle>-0 3dN.2 17us : hrtimer_force_reprogram <-__remove_hrtimer
1033We do not see the 'N' until we switch back to the task's 1557 <idle>-0 3dN.2 17us : tick_program_event <-hrtimer_force_reprogram
1034assigned stack. 1558 <idle>-0 3dN.2 18us : clockevents_program_event <-tick_program_event
1559 <idle>-0 3dN.2 18us : ktime_get <-clockevents_program_event
1560 <idle>-0 3dN.2 18us : lapic_next_event <-clockevents_program_event
1561 <idle>-0 3dN.2 19us : _raw_spin_unlock_irqrestore <-hrtimer_try_to_cancel
1562 <idle>-0 3dN.2 19us : sub_preempt_count <-_raw_spin_unlock_irqrestore
1563 <idle>-0 3dN.1 19us : hrtimer_forward <-tick_nohz_idle_exit
1564 <idle>-0 3dN.1 20us : ktime_add_safe <-hrtimer_forward
1565 <idle>-0 3dN.1 20us : ktime_add_safe <-hrtimer_forward
1566 <idle>-0 3dN.1 20us : hrtimer_start_range_ns <-hrtimer_start_expires.constprop.11
1567 <idle>-0 3dN.1 20us : __hrtimer_start_range_ns <-hrtimer_start_range_ns
1568 <idle>-0 3dN.1 21us : lock_hrtimer_base.isra.18 <-__hrtimer_start_range_ns
1569 <idle>-0 3dN.1 21us : _raw_spin_lock_irqsave <-lock_hrtimer_base.isra.18
1570 <idle>-0 3dN.1 21us : add_preempt_count <-_raw_spin_lock_irqsave
1571 <idle>-0 3dN.2 22us : ktime_add_safe <-__hrtimer_start_range_ns
1572 <idle>-0 3dN.2 22us : enqueue_hrtimer <-__hrtimer_start_range_ns
1573 <idle>-0 3dN.2 22us : tick_program_event <-__hrtimer_start_range_ns
1574 <idle>-0 3dN.2 23us : clockevents_program_event <-tick_program_event
1575 <idle>-0 3dN.2 23us : ktime_get <-clockevents_program_event
1576 <idle>-0 3dN.2 23us : lapic_next_event <-clockevents_program_event
1577 <idle>-0 3dN.2 24us : _raw_spin_unlock_irqrestore <-__hrtimer_start_range_ns
1578 <idle>-0 3dN.2 24us : sub_preempt_count <-_raw_spin_unlock_irqrestore
1579 <idle>-0 3dN.1 24us : account_idle_ticks <-tick_nohz_idle_exit
1580 <idle>-0 3dN.1 24us : account_idle_time <-account_idle_ticks
1581 <idle>-0 3.N.1 25us : sub_preempt_count <-cpu_idle
1582 <idle>-0 3.N.. 25us : schedule <-cpu_idle
1583 <idle>-0 3.N.. 25us : __schedule <-preempt_schedule
1584 <idle>-0 3.N.. 26us : add_preempt_count <-__schedule
1585 <idle>-0 3.N.1 26us : rcu_note_context_switch <-__schedule
1586 <idle>-0 3.N.1 26us : rcu_sched_qs <-rcu_note_context_switch
1587 <idle>-0 3dN.1 27us : rcu_preempt_qs <-rcu_note_context_switch
1588 <idle>-0 3.N.1 27us : _raw_spin_lock_irq <-__schedule
1589 <idle>-0 3dN.1 27us : add_preempt_count <-_raw_spin_lock_irq
1590 <idle>-0 3dN.2 28us : put_prev_task_idle <-__schedule
1591 <idle>-0 3dN.2 28us : pick_next_task_stop <-pick_next_task
1592 <idle>-0 3dN.2 28us : pick_next_task_rt <-pick_next_task
1593 <idle>-0 3dN.2 29us : dequeue_pushable_task <-pick_next_task_rt
1594 <idle>-0 3d..3 29us : __schedule <-preempt_schedule
1595 <idle>-0 3d..3 30us : 0:120:R ==> [003] 2448: 94:R sleep
1596
1597This isn't that big of a trace, even with function tracing enabled,
1598so I included the entire trace.
1599
1600The interrupt went off while the system was idle. Somewhere
1601before task_woken_rt() was called, the NEED_RESCHED flag was set;
1602this is indicated by the first occurrence of the 'N' flag.
1603
1604Latency tracing and events
1605--------------------------
1606Function tracing can induce a much larger latency, but without
1607seeing what happens within the latency it is hard to know what
1608caused it. There is a middle ground, and that is to enable
1609events.
1610
1611 # echo 0 > options/function-trace
1612 # echo wakeup_rt > current_tracer
1613 # echo 1 > events/enable
1614 # echo 1 > tracing_on
1615 # echo 0 > tracing_max_latency
1616 # chrt -f 5 sleep 1
1617 # echo 0 > tracing_on
1618 # cat trace
1619# tracer: wakeup_rt
1620#
1621# wakeup_rt latency trace v1.1.5 on 3.8.0-test+
1622# --------------------------------------------------------------------
1623# latency: 6 us, #12/12, CPU#2 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4)
1624# -----------------
1625# | task: sleep-5882 (uid:0 nice:0 policy:1 rt_prio:5)
1626# -----------------
1627#
1628# _------=> CPU#
1629# / _-----=> irqs-off
1630# | / _----=> need-resched
1631# || / _---=> hardirq/softirq
1632# ||| / _--=> preempt-depth
1633# |||| / delay
1634# cmd pid ||||| time | caller
1635# \ / ||||| \ | /
1636 <idle>-0 2d.h4 0us : 0:120:R + [002] 5882: 94:R sleep
1637 <idle>-0 2d.h4 0us : ttwu_do_activate.constprop.87 <-try_to_wake_up
1638 <idle>-0 2d.h4 1us : sched_wakeup: comm=sleep pid=5882 prio=94 success=1 target_cpu=002
1639 <idle>-0 2dNh2 1us : hrtimer_expire_exit: hrtimer=ffff88007796feb8
1640 <idle>-0 2.N.2 2us : power_end: cpu_id=2
1641 <idle>-0 2.N.2 3us : cpu_idle: state=4294967295 cpu_id=2
1642 <idle>-0 2dN.3 4us : hrtimer_cancel: hrtimer=ffff88007d50d5e0
1643 <idle>-0 2dN.3 4us : hrtimer_start: hrtimer=ffff88007d50d5e0 function=tick_sched_timer expires=34311211000000 softexpires=34311211000000
1644 <idle>-0 2.N.2 5us : rcu_utilization: Start context switch
1645 <idle>-0 2.N.2 5us : rcu_utilization: End context switch
1646 <idle>-0 2d..3 6us : __schedule <-schedule
1647 <idle>-0 2d..3 6us : 0:120:R ==> [002] 5882: 94:R sleep
1648
1035 1649
1036function 1650function
1037-------- 1651--------
@@ -1039,6 +1653,7 @@ function
1039This tracer is the function tracer. Enabling the function tracer 1653This tracer is the function tracer. Enabling the function tracer
1040can be done from the debug file system. Make sure the 1654can be done from the debug file system. Make sure the
1041ftrace_enabled is set; otherwise this tracer is a nop. 1655ftrace_enabled is set; otherwise this tracer is a nop.
1656See the "ftrace_enabled" section below.
1042 1657
1043 # sysctl kernel.ftrace_enabled=1 1658 # sysctl kernel.ftrace_enabled=1
1044 # echo function > current_tracer 1659 # echo function > current_tracer
@@ -1048,23 +1663,23 @@ ftrace_enabled is set; otherwise this tracer is a nop.
1048 # cat trace 1663 # cat trace
1049# tracer: function 1664# tracer: function
1050# 1665#
1051# TASK-PID CPU# TIMESTAMP FUNCTION 1666# entries-in-buffer/entries-written: 24799/24799 #P:4
1052# | | | | | 1667#
1053 bash-4003 [00] 123.638713: finish_task_switch <-schedule 1668# _-----=> irqs-off
1054 bash-4003 [00] 123.638714: _spin_unlock_irq <-finish_task_switch 1669# / _----=> need-resched
1055 bash-4003 [00] 123.638714: sub_preempt_count <-_spin_unlock_irq 1670# | / _---=> hardirq/softirq
1056 bash-4003 [00] 123.638715: hrtick_set <-schedule 1671# || / _--=> preempt-depth
1057 bash-4003 [00] 123.638715: _spin_lock_irqsave <-hrtick_set 1672# ||| / delay
1058 bash-4003 [00] 123.638716: add_preempt_count <-_spin_lock_irqsave 1673# TASK-PID CPU# |||| TIMESTAMP FUNCTION
1059 bash-4003 [00] 123.638716: _spin_unlock_irqrestore <-hrtick_set 1674# | | | |||| | |
1060 bash-4003 [00] 123.638717: sub_preempt_count <-_spin_unlock_irqrestore 1675 bash-1994 [002] .... 3082.063030: mutex_unlock <-rb_simple_write
1061 bash-4003 [00] 123.638717: hrtick_clear <-hrtick_set 1676 bash-1994 [002] .... 3082.063031: __mutex_unlock_slowpath <-mutex_unlock
1062 bash-4003 [00] 123.638718: sub_preempt_count <-schedule 1677 bash-1994 [002] .... 3082.063031: __fsnotify_parent <-fsnotify_modify
1063 bash-4003 [00] 123.638718: sub_preempt_count <-preempt_schedule 1678 bash-1994 [002] .... 3082.063032: fsnotify <-fsnotify_modify
1064 bash-4003 [00] 123.638719: wait_for_completion <-__stop_machine_run 1679 bash-1994 [002] .... 3082.063032: __srcu_read_lock <-fsnotify
1065 bash-4003 [00] 123.638719: wait_for_common <-wait_for_completion 1680 bash-1994 [002] .... 3082.063032: add_preempt_count <-__srcu_read_lock
1066 bash-4003 [00] 123.638720: _spin_lock_irq <-wait_for_common 1681 bash-1994 [002] ...1 3082.063032: sub_preempt_count <-__srcu_read_lock
1067 bash-4003 [00] 123.638720: add_preempt_count <-_spin_lock_irq 1682 bash-1994 [002] .... 3082.063033: __srcu_read_unlock <-fsnotify
1068[...] 1683[...]
1069 1684
1070 1685
@@ -1214,79 +1829,19 @@ int main (int argc, char **argv)
1214 return 0; 1829 return 0;
1215} 1830}
1216 1831
1832Or this simple script!
1217 1833
1218hw-branch-tracer (x86 only) 1834------
1219--------------------------- 1835#!/bin/bash
1220 1836
1221This tracer uses the x86 last branch tracing hardware feature to 1837debugfs=`sed -ne 's/^debugfs \(.*\) debugfs.*/\1/p' /proc/mounts`
1222collect a branch trace on all cpus with relatively low overhead. 1838echo nop > $debugfs/tracing/current_tracer
1223 1839echo 0 > $debugfs/tracing/tracing_on
1224The tracer uses a fixed-size circular buffer per cpu and only 1840echo $$ > $debugfs/tracing/set_ftrace_pid
1225traces ring 0 branches. The trace file dumps that buffer in the 1841echo function > $debugfs/tracing/current_tracer
1226following format: 1842echo 1 > $debugfs/tracing/tracing_on
1227 1843exec "$@"
1228# tracer: hw-branch-tracer 1844------
1229#
1230# CPU# TO <- FROM
1231 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6
1232 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a
1233 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf
1234 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf
1235 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a
1236 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf
1237
1238
1239The tracer may be used to dump the trace for the oops'ing cpu on
1240a kernel oops into the system log. To enable this,
1241ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1242can either use the sysctl function or set it via the proc system
1243interface.
1244
1245 sysctl kernel.ftrace_dump_on_oops=n
1246
1247or
1248
1249 echo n > /proc/sys/kernel/ftrace_dump_on_oops
1250
1251If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
1252only dump the buffer of the CPU that triggered the oops.
1253
1254Here's an example of such a dump after a null pointer
1255dereference in a kernel module:
1256
1257[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
1258[57848.106019] IP: [<ffffffffa0000006>] open+0x6/0x14 [oops]
1259[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0
1260[57848.106019] Oops: 0002 [#1] SMP
1261[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus
1262[57848.106019] Dumping ftrace buffer:
1263[57848.106019] ---------------------------------
1264[...]
1265[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24
1266[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165
1267[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165
1268[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165
1269[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165
1270[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops]
1271[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30
1272[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b
1273[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31
1274[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1
1275[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30
1276[...]
1277[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2
1278[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881
1279[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881
1280[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96
1281[...]
1282[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3
1283[57848.106019] ---------------------------------
1284[57848.106019] CPU 0
1285[57848.106019] Modules linked in: oops
1286[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23
1287[57848.106019] RIP: 0010:[<ffffffffa0000006>] [<ffffffffa0000006>] open+0x6/0x14 [oops]
1288[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246
1289[...]
1290 1845
1291 1846
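A usage sketch for the script above (assuming it is saved as
ftrace-wrap.sh and made executable; the file name is only an
illustration):

  # ./ftrace-wrap.sh ls -l

The script points set_ftrace_pid at its own PID and then exec's the
given command, so only that command's functions end up in the trace.
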
1292function graph tracer 1847function graph tracer
@@ -1473,16 +2028,18 @@ starts of pointing to a simple return. (Enabling FTRACE will
1473include the -pg switch in the compiling of the kernel.) 2028include the -pg switch in the compiling of the kernel.)
1474 2029
1475At compile time every C file object is run through the 2030At compile time every C file object is run through the
1476recordmcount.pl script (located in the scripts directory). This 2031recordmcount program (located in the scripts directory). This
1477script will process the C object using objdump to find all the 2032program will parse the ELF headers in the C object to find all
1478locations in the .text section that call mcount. (Note, only the 2033the locations in the .text section that call mcount. (Note, only
1479.text section is processed, since processing other sections like 2034whitelisted .text sections are processed, since processing other
1480.init.text may cause races due to those sections being freed). 2035sections like .init.text may cause races due to those sections
2036being freed unexpectedly).
1481 2037
1482A new section called "__mcount_loc" is created that holds 2038A new section called "__mcount_loc" is created that holds
1483references to all the mcount call sites in the .text section. 2039references to all the mcount call sites in the .text section.
1484This section is compiled back into the original object. The 2040The recordmcount program re-links this section back into the
1485final linker will add all these references into a single table. 2041original object. The final linking stage of the kernel will add all these
2042references into a single table.
1486 2043
1487On boot up, before SMP is initialized, the dynamic ftrace code 2044On boot up, before SMP is initialized, the dynamic ftrace code
1488scans this table and updates all the locations into nops. It 2045scans this table and updates all the locations into nops. It
@@ -1493,13 +2050,25 @@ unloaded, it also removes its functions from the ftrace function
1493list. This is automatic in the module unload code, and the 2050list. This is automatic in the module unload code, and the
1494module author does not need to worry about it. 2051module author does not need to worry about it.
1495 2052
1496When tracing is enabled, kstop_machine is called to prevent 2053When tracing is enabled, the process of modifying the function
1497races with the CPUS executing code being modified (which can 2054tracepoints is dependent on architecture. The old method is to use
1498cause the CPU to do undesirable things), and the nops are 2055kstop_machine to prevent races with the CPUs executing code being
2056modified (which can cause the CPU to do undesirable things, especially
2057if the modified code crosses cache (or page) boundaries), and the nops are
1499patched back to calls. But this time, they do not call mcount 2058patched back to calls. But this time, they do not call mcount
1500(which is just a function stub). They now call into the ftrace 2059(which is just a function stub). They now call into the ftrace
1501infrastructure. 2060infrastructure.
1502 2061
2062The new method of modifying the function tracepoints is to place
2063a breakpoint at the location to be modified, sync all CPUs, modify
2064the rest of the instruction not covered by the breakpoint. Sync
2065all CPUs again, and then replace the breakpoint with the finished
2066version of the ftrace call site.
2067
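To make the ordering of that breakpoint method concrete, here is a
tiny user-space sketch of the three-step sequence. It is purely
illustrative: the byte values, patch_site() and the empty sync_cpus()
stand-in are inventions for this example, not the kernel's actual
arch code.

#include <stdio.h>
#include <string.h>

#define BREAKPOINT 0xCC                 /* int3 on x86 */

static void sync_cpus(void)
{
        /* The kernel synchronizes every CPU here; nothing to do in a toy. */
}

static void patch_site(unsigned char *site, const unsigned char *new_insn,
                       size_t len)
{
        site[0] = BREAKPOINT;                    /* 1) breakpoint the first byte */
        sync_cpus();
        memcpy(site + 1, new_insn + 1, len - 1); /* 2) patch the rest of the instruction */
        sync_cpus();
        site[0] = new_insn[0];                   /* 3) swap the breakpoint for the final byte */
        sync_cpus();
}

int main(void)
{
        unsigned char text[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; /* 5-byte nop */
        unsigned char call[5] = { 0xe8, 0x12, 0x34, 0x56, 0x78 }; /* pretend "call ftrace_caller" */

        patch_site(text, call, sizeof(text));

        for (size_t i = 0; i < sizeof(text); i++)
                printf("%02x ", text[i]);
        printf("\n");
        return 0;
}
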
2068Some archs do not even need to monkey around with the synchronization,
2069and can just slap the new code on top of the old without any
2070problems with other CPUs executing it at the same time.
2071
1503One special side-effect to the recording of the functions being 2072One special side-effect to the recording of the functions being
1504traced is that we can now selectively choose which functions we 2073traced is that we can now selectively choose which functions we
1505wish to trace and which ones we want the mcount calls to remain 2074wish to trace and which ones we want the mcount calls to remain
@@ -1530,20 +2099,28 @@ mutex_lock
1530 2099
1531If I am only interested in sys_nanosleep and hrtimer_interrupt: 2100If I am only interested in sys_nanosleep and hrtimer_interrupt:
1532 2101
1533 # echo sys_nanosleep hrtimer_interrupt \ 2102 # echo sys_nanosleep hrtimer_interrupt > set_ftrace_filter
1534 > set_ftrace_filter
1535 # echo function > current_tracer 2103 # echo function > current_tracer
1536 # echo 1 > tracing_on 2104 # echo 1 > tracing_on
1537 # usleep 1 2105 # usleep 1
1538 # echo 0 > tracing_on 2106 # echo 0 > tracing_on
1539 # cat trace 2107 # cat trace
1540# tracer: ftrace 2108# tracer: function
2109#
2110# entries-in-buffer/entries-written: 5/5 #P:4
1541# 2111#
1542# TASK-PID CPU# TIMESTAMP FUNCTION 2112# _-----=> irqs-off
1543# | | | | | 2113# / _----=> need-resched
1544 usleep-4134 [00] 1317.070017: hrtimer_interrupt <-smp_apic_timer_interrupt 2114# | / _---=> hardirq/softirq
1545 usleep-4134 [00] 1317.070111: sys_nanosleep <-syscall_call 2115# || / _--=> preempt-depth
1546 <idle>-0 [00] 1317.070115: hrtimer_interrupt <-smp_apic_timer_interrupt 2116# ||| / delay
2117# TASK-PID CPU# |||| TIMESTAMP FUNCTION
2118# | | | |||| | |
2119 usleep-2665 [001] .... 4186.475355: sys_nanosleep <-system_call_fastpath
2120 <idle>-0 [001] d.h1 4186.475409: hrtimer_interrupt <-smp_apic_timer_interrupt
2121 usleep-2665 [001] d.h1 4186.475426: hrtimer_interrupt <-smp_apic_timer_interrupt
2122 <idle>-0 [003] d.h1 4186.475426: hrtimer_interrupt <-smp_apic_timer_interrupt
2123 <idle>-0 [002] d.h1 4186.475427: hrtimer_interrupt <-smp_apic_timer_interrupt
1547 2124
1548To see which functions are being traced, you can cat the file: 2125To see which functions are being traced, you can cat the file:
1549 2126
@@ -1571,20 +2148,25 @@ Note: It is better to use quotes to enclose the wild cards,
1571 2148
1572Produces: 2149Produces:
1573 2150
1574# tracer: ftrace 2151# tracer: function
1575# 2152#
1576# TASK-PID CPU# TIMESTAMP FUNCTION 2153# entries-in-buffer/entries-written: 897/897 #P:4
1577# | | | | | 2154#
1578 bash-4003 [00] 1480.611794: hrtimer_init <-copy_process 2155# _-----=> irqs-off
1579 bash-4003 [00] 1480.611941: hrtimer_start <-hrtick_set 2156# / _----=> need-resched
1580 bash-4003 [00] 1480.611956: hrtimer_cancel <-hrtick_clear 2157# | / _---=> hardirq/softirq
1581 bash-4003 [00] 1480.611956: hrtimer_try_to_cancel <-hrtimer_cancel 2158# || / _--=> preempt-depth
1582 <idle>-0 [00] 1480.612019: hrtimer_get_next_event <-get_next_timer_interrupt 2159# ||| / delay
1583 <idle>-0 [00] 1480.612025: hrtimer_get_next_event <-get_next_timer_interrupt 2160# TASK-PID CPU# |||| TIMESTAMP FUNCTION
1584 <idle>-0 [00] 1480.612032: hrtimer_get_next_event <-get_next_timer_interrupt 2161# | | | |||| | |
1585 <idle>-0 [00] 1480.612037: hrtimer_get_next_event <-get_next_timer_interrupt 2162 <idle>-0 [003] dN.1 4228.547803: hrtimer_cancel <-tick_nohz_idle_exit
1586 <idle>-0 [00] 1480.612382: hrtimer_get_next_event <-get_next_timer_interrupt 2163 <idle>-0 [003] dN.1 4228.547804: hrtimer_try_to_cancel <-hrtimer_cancel
1587 2164 <idle>-0 [003] dN.2 4228.547805: hrtimer_force_reprogram <-__remove_hrtimer
2165 <idle>-0 [003] dN.1 4228.547805: hrtimer_forward <-tick_nohz_idle_exit
2166 <idle>-0 [003] dN.1 4228.547805: hrtimer_start_range_ns <-hrtimer_start_expires.constprop.11
2167 <idle>-0 [003] d..1 4228.547858: hrtimer_get_next_event <-get_next_timer_interrupt
2168 <idle>-0 [003] d..1 4228.547859: hrtimer_start <-__tick_nohz_idle_enter
2169 <idle>-0 [003] d..2 4228.547860: hrtimer_force_reprogram <-__rem
1588 2170
1589Notice that we lost the sys_nanosleep. 2171Notice that we lost the sys_nanosleep.
1590 2172
@@ -1651,19 +2233,29 @@ traced.
1651 2233
1652Produces: 2234Produces:
1653 2235
1654# tracer: ftrace 2236# tracer: function
2237#
2238# entries-in-buffer/entries-written: 39608/39608 #P:4
1655# 2239#
1656# TASK-PID CPU# TIMESTAMP FUNCTION 2240# _-----=> irqs-off
1657# | | | | | 2241# / _----=> need-resched
1658 bash-4043 [01] 115.281644: finish_task_switch <-schedule 2242# | / _---=> hardirq/softirq
1659 bash-4043 [01] 115.281645: hrtick_set <-schedule 2243# || / _--=> preempt-depth
1660 bash-4043 [01] 115.281645: hrtick_clear <-hrtick_set 2244# ||| / delay
1661 bash-4043 [01] 115.281646: wait_for_completion <-__stop_machine_run 2245# TASK-PID CPU# |||| TIMESTAMP FUNCTION
1662 bash-4043 [01] 115.281647: wait_for_common <-wait_for_completion 2246# | | | |||| | |
1663 bash-4043 [01] 115.281647: kthread_stop <-stop_machine_run 2247 bash-1994 [000] .... 4342.324896: file_ra_state_init <-do_dentry_open
1664 bash-4043 [01] 115.281648: init_waitqueue_head <-kthread_stop 2248 bash-1994 [000] .... 4342.324897: open_check_o_direct <-do_last
1665 bash-4043 [01] 115.281648: wake_up_process <-kthread_stop 2249 bash-1994 [000] .... 4342.324897: ima_file_check <-do_last
1666 bash-4043 [01] 115.281649: try_to_wake_up <-wake_up_process 2250 bash-1994 [000] .... 4342.324898: process_measurement <-ima_file_check
2251 bash-1994 [000] .... 4342.324898: ima_get_action <-process_measurement
2252 bash-1994 [000] .... 4342.324898: ima_match_policy <-ima_get_action
2253 bash-1994 [000] .... 4342.324899: do_truncate <-do_last
2254 bash-1994 [000] .... 4342.324899: should_remove_suid <-do_truncate
2255 bash-1994 [000] .... 4342.324899: notify_change <-do_truncate
2256 bash-1994 [000] .... 4342.324900: current_fs_time <-notify_change
2257 bash-1994 [000] .... 4342.324900: current_kernel_time <-current_fs_time
2258 bash-1994 [000] .... 4342.324900: timespec_trunc <-current_fs_time
1667 2259
1668We can see that there's no more lock or preempt tracing. 2260We can see that there's no more lock or preempt tracing.
1669 2261
@@ -1729,6 +2321,28 @@ this special filter via:
1729 echo > set_graph_function 2321 echo > set_graph_function
1730 2322
1731 2323
2324ftrace_enabled
2325--------------
2326
2327Note, the proc sysctl ftrace_enabled is a big on/off switch for the
2328function tracer. By default it is enabled (when function tracing is
2329enabled in the kernel). If it is disabled, all function tracing is
2330disabled. This includes not only the function tracers for ftrace,
2331but also any other users of function tracing (perf, kprobes, stack tracing, profiling, etc.).
2332
2333Please disable this with care.
2334
2335This can be disabled (and enabled) with:
2336
2337 sysctl kernel.ftrace_enabled=0
2338 sysctl kernel.ftrace_enabled=1
2339
2340 or
2341
2342 echo 0 > /proc/sys/kernel/ftrace_enabled
2343 echo 1 > /proc/sys/kernel/ftrace_enabled
2344
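To verify the current setting (a trivial illustration):

 # cat /proc/sys/kernel/ftrace_enabled
1
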
2345
1732Filter commands 2346Filter commands
1733--------------- 2347---------------
1734 2348
@@ -1763,12 +2377,58 @@ The following commands are supported:
1763 2377
1764 echo '__schedule_bug:traceoff:5' > set_ftrace_filter 2378 echo '__schedule_bug:traceoff:5' > set_ftrace_filter
1765 2379
2380 To always disable tracing when __schedule_bug is hit:
2381
2382 echo '__schedule_bug:traceoff' > set_ftrace_filter
2383
1766 These commands are cumulative whether or not they are appended 2384 These commands are cumulative whether or not they are appended
1767 to set_ftrace_filter. To remove a command, prepend it by '!' 2385 to set_ftrace_filter. To remove a command, prepend it by '!'
1768 and drop the parameter: 2386 and drop the parameter:
1769 2387
2388 echo '!__schedule_bug:traceoff:0' > set_ftrace_filter
2389
2390 The above removes the traceoff command for __schedule_bug
 2391 that has a counter. To remove commands without counters:
2392
1770 echo '!__schedule_bug:traceoff' > set_ftrace_filter 2393 echo '!__schedule_bug:traceoff' > set_ftrace_filter
1771 2394
2395- snapshot
2396 Will cause a snapshot to be triggered when the function is hit.
2397
2398 echo 'native_flush_tlb_others:snapshot' > set_ftrace_filter
2399
2400 To only snapshot once:
2401
2402 echo 'native_flush_tlb_others:snapshot:1' > set_ftrace_filter
2403
2404 To remove the above commands:
2405
2406 echo '!native_flush_tlb_others:snapshot' > set_ftrace_filter
2407 echo '!native_flush_tlb_others:snapshot:0' > set_ftrace_filter
2408
2409- enable_event/disable_event
2410 These commands can enable or disable a trace event. Note, because
2411 function tracing callbacks are very sensitive, when these commands
2412 are registered, the trace point is activated, but disabled in
2413 a "soft" mode. That is, the tracepoint will be called, but
2414 just will not be traced. The event tracepoint stays in this mode
2415 as long as there's a command that triggers it.
2416
2417 echo 'try_to_wake_up:enable_event:sched:sched_switch:2' > \
2418 set_ftrace_filter
2419
2420 The format is:
2421
2422 <function>:enable_event:<system>:<event>[:count]
2423 <function>:disable_event:<system>:<event>[:count]
2424
2425 To remove the events commands:
2426
2427
2428 echo '!try_to_wake_up:enable_event:sched:sched_switch:0' > \
2429 set_ftrace_filter
2430 echo '!schedule:disable_event:sched:sched_switch' > \
2431 set_ftrace_filter
1772 2432
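For completeness, the disable_event counterpart that the removal
example above refers to would be registered like this (mirroring the
enable_event example; not an excerpt from a live session):

 echo 'schedule:disable_event:sched:sched_switch' > set_ftrace_filter
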
1773trace_pipe 2433trace_pipe
1774---------- 2434----------
@@ -1787,28 +2447,31 @@ different. The trace is live.
1787 # cat trace 2447 # cat trace
1788# tracer: function 2448# tracer: function
1789# 2449#
1790# TASK-PID CPU# TIMESTAMP FUNCTION 2450# entries-in-buffer/entries-written: 0/0 #P:4
1791# | | | | | 2451#
2452# _-----=> irqs-off
2453# / _----=> need-resched
2454# | / _---=> hardirq/softirq
2455# || / _--=> preempt-depth
2456# ||| / delay
2457# TASK-PID CPU# |||| TIMESTAMP FUNCTION
2458# | | | |||| | |
1792 2459
1793 # 2460 #
1794 # cat /tmp/trace.out 2461 # cat /tmp/trace.out
1795 bash-4043 [00] 41.267106: finish_task_switch <-schedule 2462 bash-1994 [000] .... 5281.568961: mutex_unlock <-rb_simple_write
1796 bash-4043 [00] 41.267106: hrtick_set <-schedule 2463 bash-1994 [000] .... 5281.568963: __mutex_unlock_slowpath <-mutex_unlock
1797 bash-4043 [00] 41.267107: hrtick_clear <-hrtick_set 2464 bash-1994 [000] .... 5281.568963: __fsnotify_parent <-fsnotify_modify
1798 bash-4043 [00] 41.267108: wait_for_completion <-__stop_machine_run 2465 bash-1994 [000] .... 5281.568964: fsnotify <-fsnotify_modify
1799 bash-4043 [00] 41.267108: wait_for_common <-wait_for_completion 2466 bash-1994 [000] .... 5281.568964: __srcu_read_lock <-fsnotify
1800 bash-4043 [00] 41.267109: kthread_stop <-stop_machine_run 2467 bash-1994 [000] .... 5281.568964: add_preempt_count <-__srcu_read_lock
1801 bash-4043 [00] 41.267109: init_waitqueue_head <-kthread_stop 2468 bash-1994 [000] ...1 5281.568965: sub_preempt_count <-__srcu_read_lock
1802 bash-4043 [00] 41.267110: wake_up_process <-kthread_stop 2469 bash-1994 [000] .... 5281.568965: __srcu_read_unlock <-fsnotify
1803 bash-4043 [00] 41.267110: try_to_wake_up <-wake_up_process 2470 bash-1994 [000] .... 5281.568967: sys_dup2 <-system_call_fastpath
1804 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up
1805 2471
1806 2472
1807Note, reading the trace_pipe file will block until more input is 2473Note, reading the trace_pipe file will block until more input is
1808added. By changing the tracer, trace_pipe will issue an EOF. We 2474added.
1809needed to set the function tracer _before_ we "cat" the
1810trace_pipe file.
1811
1812 2475
1813trace entries 2476trace entries
1814------------- 2477-------------
@@ -1817,31 +2480,50 @@ Having too much or not enough data can be troublesome in
1817diagnosing an issue in the kernel. The file buffer_size_kb is 2480diagnosing an issue in the kernel. The file buffer_size_kb is
1818used to modify the size of the internal trace buffers. The 2481used to modify the size of the internal trace buffers. The
1819number listed is the number of entries that can be recorded per 2482number listed is the number of entries that can be recorded per
1820CPU. To know the full size, multiply the number of possible CPUS 2483CPU. To know the full size, multiply the number of possible CPUs
1821with the number of entries. 2484with the number of entries.
1822 2485
1823 # cat buffer_size_kb 2486 # cat buffer_size_kb
18241408 (units kilobytes) 24871408 (units kilobytes)
1825 2488
1826Note, to modify this, you must have tracing completely disabled. 2489Or simply read buffer_total_size_kb
1827To do that, echo "nop" into the current_tracer. If the 2490
1828current_tracer is not set to "nop", an EINVAL error will be 2491 # cat buffer_total_size_kb
1829returned. 24925632
2493
2494To modify the buffer, simply echo in a number (in 1024 byte segments).
1830 2495
1831 # echo nop > current_tracer
1832 # echo 10000 > buffer_size_kb 2496 # echo 10000 > buffer_size_kb
1833 # cat buffer_size_kb 2497 # cat buffer_size_kb
183410000 (units kilobytes) 249810000 (units kilobytes)
1835 2499
1836The number of pages which will be allocated is limited to a 2500It will try to allocate as much as possible. If you allocate too
1837percentage of available memory. Allocating too much will produce 2501much, it can trigger the Out-Of-Memory killer.
1838an error.
1839 2502
1840 # echo 1000000000000 > buffer_size_kb 2503 # echo 1000000000000 > buffer_size_kb
1841-bash: echo: write error: Cannot allocate memory 2504-bash: echo: write error: Cannot allocate memory
1842 # cat buffer_size_kb 2505 # cat buffer_size_kb
184385 250685
1844 2507
2508The per_cpu buffers can be changed individually as well:
2509
2510 # echo 10000 > per_cpu/cpu0/buffer_size_kb
2511 # echo 100 > per_cpu/cpu1/buffer_size_kb
2512
2513When the per_cpu buffers are not the same, the buffer_size_kb
2514at the top level will just show an X:
2515
2516 # cat buffer_size_kb
2517X
2518
2519This is where the buffer_total_size_kb is useful:
2520
2521 # cat buffer_total_size_kb
252212916
2523
2524Writing to the top level buffer_size_kb will reset all the buffers
2525to be the same again.
2526
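For instance (an illustrative sequence following the uneven sizes
above, not captured output):

 # echo 4096 > buffer_size_kb

after which per_cpu/cpu0/buffer_size_kb and per_cpu/cpu1/buffer_size_kb
both read back 4096 again, and buffer_total_size_kb reports
4 x 4096 = 16384 on this four-CPU machine.
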
1845Snapshot 2527Snapshot
1846-------- 2528--------
1847CONFIG_TRACER_SNAPSHOT makes a generic snapshot feature 2529CONFIG_TRACER_SNAPSHOT makes a generic snapshot feature
@@ -1925,7 +2607,188 @@ bash: echo: write error: Device or resource busy
1925 # cat snapshot 2607 # cat snapshot
1926cat: snapshot: Device or resource busy 2608cat: snapshot: Device or resource busy
1927 2609
2610
2611Instances
2612---------
2613In the debugfs tracing directory is a directory called "instances".
2614This directory can have new directories created inside of it using
2615mkdir, and removed with rmdir. The directory created
2616with mkdir in this directory will already contain files and other
2617directories after it is created.
2618
2619 # mkdir instances/foo
2620 # ls instances/foo
2621buffer_size_kb buffer_total_size_kb events free_buffer per_cpu
2622set_event snapshot trace trace_clock trace_marker trace_options
2623trace_pipe tracing_on
2624
2625As you can see, the new directory looks similar to the tracing directory
2626itself. In fact, it is very similar, except that the buffer and
2627events are agnostic of the main directory, and of any other
2628instances that are created.
2629
2630The files in the new directory work just like the files with the
2631same name in the tracing directory except the buffer that is used
2632is a separate and new buffer. The files affect that buffer but do not
2633affect the main buffer with the exception of trace_options. Currently,
2634the trace_options affect all instances and the top level buffer
2635the same, but this may change in future releases. That is, options
2636may become specific to the instance they reside in.
2637
2638Notice that none of the function tracer files are there, nor are
2639current_tracer and available_tracers. This is because the buffers
2640can currently only have events enabled for them.
2641
2642 # mkdir instances/foo
2643 # mkdir instances/bar
2644 # mkdir instances/zoot
2645 # echo 100000 > buffer_size_kb
2646 # echo 1000 > instances/foo/buffer_size_kb
2647 # echo 5000 > instances/bar/per_cpu/cpu1/buffer_size_kb
 2648 # echo function > current_tracer
2649 # echo 1 > instances/foo/events/sched/sched_wakeup/enable
2650 # echo 1 > instances/foo/events/sched/sched_wakeup_new/enable
2651 # echo 1 > instances/foo/events/sched/sched_switch/enable
2652 # echo 1 > instances/bar/events/irq/enable
2653 # echo 1 > instances/zoot/events/syscalls/enable
2654 # cat trace_pipe
2655CPU:2 [LOST 11745 EVENTS]
2656 bash-2044 [002] .... 10594.481032: _raw_spin_lock_irqsave <-get_page_from_freelist
2657 bash-2044 [002] d... 10594.481032: add_preempt_count <-_raw_spin_lock_irqsave
2658 bash-2044 [002] d..1 10594.481032: __rmqueue <-get_page_from_freelist
2659 bash-2044 [002] d..1 10594.481033: _raw_spin_unlock <-get_page_from_freelist
2660 bash-2044 [002] d..1 10594.481033: sub_preempt_count <-_raw_spin_unlock
2661 bash-2044 [002] d... 10594.481033: get_pageblock_flags_group <-get_pageblock_migratetype
2662 bash-2044 [002] d... 10594.481034: __mod_zone_page_state <-get_page_from_freelist
2663 bash-2044 [002] d... 10594.481034: zone_statistics <-get_page_from_freelist
2664 bash-2044 [002] d... 10594.481034: __inc_zone_state <-zone_statistics
2665 bash-2044 [002] d... 10594.481034: __inc_zone_state <-zone_statistics
2666 bash-2044 [002] .... 10594.481035: arch_dup_task_struct <-copy_process
2667[...]
2668
2669 # cat instances/foo/trace_pipe
2670 bash-1998 [000] d..4 136.676759: sched_wakeup: comm=kworker/0:1 pid=59 prio=120 success=1 target_cpu=000
2671 bash-1998 [000] dN.4 136.676760: sched_wakeup: comm=bash pid=1998 prio=120 success=1 target_cpu=000
2672 <idle>-0 [003] d.h3 136.676906: sched_wakeup: comm=rcu_preempt pid=9 prio=120 success=1 target_cpu=003
2673 <idle>-0 [003] d..3 136.676909: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=rcu_preempt next_pid=9 next_prio=120
2674 rcu_preempt-9 [003] d..3 136.676916: sched_switch: prev_comm=rcu_preempt prev_pid=9 prev_prio=120 prev_state=S ==> next_comm=swapper/3 next_pid=0 next_prio=120
2675 bash-1998 [000] d..4 136.677014: sched_wakeup: comm=kworker/0:1 pid=59 prio=120 success=1 target_cpu=000
2676 bash-1998 [000] dN.4 136.677016: sched_wakeup: comm=bash pid=1998 prio=120 success=1 target_cpu=000
2677 bash-1998 [000] d..3 136.677018: sched_switch: prev_comm=bash prev_pid=1998 prev_prio=120 prev_state=R+ ==> next_comm=kworker/0:1 next_pid=59 next_prio=120
2678 kworker/0:1-59 [000] d..4 136.677022: sched_wakeup: comm=sshd pid=1995 prio=120 success=1 target_cpu=001
2679 kworker/0:1-59 [000] d..3 136.677025: sched_switch: prev_comm=kworker/0:1 prev_pid=59 prev_prio=120 prev_state=S ==> next_comm=bash next_pid=1998 next_prio=120
2680[...]
2681
2682 # cat instances/bar/trace_pipe
2683 migration/1-14 [001] d.h3 138.732674: softirq_raise: vec=3 [action=NET_RX]
2684 <idle>-0 [001] dNh3 138.732725: softirq_raise: vec=3 [action=NET_RX]
2685 bash-1998 [000] d.h1 138.733101: softirq_raise: vec=1 [action=TIMER]
2686 bash-1998 [000] d.h1 138.733102: softirq_raise: vec=9 [action=RCU]
2687 bash-1998 [000] ..s2 138.733105: softirq_entry: vec=1 [action=TIMER]
2688 bash-1998 [000] ..s2 138.733106: softirq_exit: vec=1 [action=TIMER]
2689 bash-1998 [000] ..s2 138.733106: softirq_entry: vec=9 [action=RCU]
2690 bash-1998 [000] ..s2 138.733109: softirq_exit: vec=9 [action=RCU]
2691 sshd-1995 [001] d.h1 138.733278: irq_handler_entry: irq=21 name=uhci_hcd:usb4
2692 sshd-1995 [001] d.h1 138.733280: irq_handler_exit: irq=21 ret=unhandled
2693 sshd-1995 [001] d.h1 138.733281: irq_handler_entry: irq=21 name=eth0
2694 sshd-1995 [001] d.h1 138.733283: irq_handler_exit: irq=21 ret=handled
2695[...]
2696
2697 # cat instances/zoot/trace
2698# tracer: nop
2699#
2700# entries-in-buffer/entries-written: 18996/18996 #P:4
2701#
2702# _-----=> irqs-off
2703# / _----=> need-resched
2704# | / _---=> hardirq/softirq
2705# || / _--=> preempt-depth
2706# ||| / delay
2707# TASK-PID CPU# |||| TIMESTAMP FUNCTION
2708# | | | |||| | |
2709 bash-1998 [000] d... 140.733501: sys_write -> 0x2
2710 bash-1998 [000] d... 140.733504: sys_dup2(oldfd: a, newfd: 1)
2711 bash-1998 [000] d... 140.733506: sys_dup2 -> 0x1
2712 bash-1998 [000] d... 140.733508: sys_fcntl(fd: a, cmd: 1, arg: 0)
2713 bash-1998 [000] d... 140.733509: sys_fcntl -> 0x1
2714 bash-1998 [000] d... 140.733510: sys_close(fd: a)
2715 bash-1998 [000] d... 140.733510: sys_close -> 0x0
2716 bash-1998 [000] d... 140.733514: sys_rt_sigprocmask(how: 0, nset: 0, oset: 6e2768, sigsetsize: 8)
2717 bash-1998 [000] d... 140.733515: sys_rt_sigprocmask -> 0x0
2718 bash-1998 [000] d... 140.733516: sys_rt_sigaction(sig: 2, act: 7fff718846f0, oact: 7fff71884650, sigsetsize: 8)
2719 bash-1998 [000] d... 140.733516: sys_rt_sigaction -> 0x0
2720
2721You can see that the trace of the topmost trace buffer shows only
2722function tracing. The foo instance displays wakeups and task
2723switches, bar shows softirq and irq events, and zoot shows syscalls.
2724
2725To remove the instances, simply delete their directories:
2726
2727 # rmdir instances/foo
2728 # rmdir instances/bar
2729 # rmdir instances/zoot
2730
2731Note, if a process has a trace file open in one of the instance
2732directories, the rmdir will fail with EBUSY.
2733
2734
2735Stack trace
1928----------- 2736-----------
2737Since the kernel has a fixed-size stack, it is important not to
2738waste it in functions. A kernel developer must be conscious of
2739what they allocate on the stack. If they add too much, the system
2740can be in danger of a stack overflow, and corruption will occur,
2741usually leading to a system panic.
2742
2743There are some tools that check this, usually with interrupts
2744periodically checking usage. But being able to perform a check
2745at every function call is far more useful. Since ftrace provides
2746a function tracer, it is convenient to have it check the stack size
2747at each function call. This is enabled via the stack tracer.
2748
2749CONFIG_STACK_TRACER enables the ftrace stack tracing functionality.
2750To enable it, write a '1' into /proc/sys/kernel/stack_tracer_enabled.
2751
2752 # echo 1 > /proc/sys/kernel/stack_tracer_enabled
2753
2754You can also enable it from the kernel command line to trace
2755the stack size of the kernel during boot up, by adding "stacktrace"
2756to the kernel command line.
2757
2758After running it for a few minutes, the output looks like:
2759
2760 # cat stack_max_size
27612928
2762
2763 # cat stack_trace
2764 Depth Size Location (18 entries)
2765 ----- ---- --------
2766 0) 2928 224 update_sd_lb_stats+0xbc/0x4ac
2767 1) 2704 160 find_busiest_group+0x31/0x1f1
2768 2) 2544 256 load_balance+0xd9/0x662
2769 3) 2288 80 idle_balance+0xbb/0x130
2770 4) 2208 128 __schedule+0x26e/0x5b9
2771 5) 2080 16 schedule+0x64/0x66
2772 6) 2064 128 schedule_timeout+0x34/0xe0
2773 7) 1936 112 wait_for_common+0x97/0xf1
2774 8) 1824 16 wait_for_completion+0x1d/0x1f
2775 9) 1808 128 flush_work+0xfe/0x119
2776 10) 1680 16 tty_flush_to_ldisc+0x1e/0x20
2777 11) 1664 48 input_available_p+0x1d/0x5c
2778 12) 1616 48 n_tty_poll+0x6d/0x134
2779 13) 1568 64 tty_poll+0x64/0x7f
2780 14) 1504 880 do_select+0x31e/0x511
2781 15) 624 400 core_sys_select+0x177/0x216
2782 16) 224 96 sys_select+0x91/0xb9
2783 17) 128 128 system_call_fastpath+0x16/0x1b
2784
2785Note, if -mfentry is being used by gcc, functions get traced before
2786they set up the stack frame. This means that leaf-level functions
2787are not tested by the stack tracer when -mfentry is used.
2788
2789Currently, -mfentry is used by gcc 4.6.0 and above on x86 only.
2790
2791---------
1929 2792
1930More details can be found in the source code, in the 2793More details can be found in the source code, in the
1931kernel/trace/*.c files. 2794kernel/trace/*.c files.
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 52da2a250795..f83e17a40e8b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -261,8 +261,10 @@ struct ftrace_probe_ops {
261 void (*func)(unsigned long ip, 261 void (*func)(unsigned long ip,
262 unsigned long parent_ip, 262 unsigned long parent_ip,
263 void **data); 263 void **data);
264 int (*callback)(unsigned long ip, void **data); 264 int (*init)(struct ftrace_probe_ops *ops,
265 void (*free)(void **data); 265 unsigned long ip, void **data);
266 void (*free)(struct ftrace_probe_ops *ops,
267 unsigned long ip, void **data);
266 int (*print)(struct seq_file *m, 268 int (*print)(struct seq_file *m,
267 unsigned long ip, 269 unsigned long ip,
268 struct ftrace_probe_ops *ops, 270 struct ftrace_probe_ops *ops,
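The hunk above replaces the single 'callback' hook of struct ftrace_probe_ops with paired init/free callbacks that also receive the ops and the traced instruction pointer, so per-ip data can be set up and torn down symmetrically. As a rough sketch written against those signatures (not taken from the patch; the my_count_* names and the kzalloc/kfree bookkeeping are illustrative assumptions):

  #include <linux/ftrace.h>
  #include <linux/slab.h>

  /* Sketch only: a probe that counts hits per traced function. */
  static int my_count_init(struct ftrace_probe_ops *ops,
                           unsigned long ip, void **data)
  {
          long *count = kzalloc(sizeof(*count), GFP_KERNEL);

          if (!count)
                  return -ENOMEM;         /* a negative return skips this ip */
          *data = count;
          return 0;
  }

  static void my_count_free(struct ftrace_probe_ops *ops,
                            unsigned long ip, void **data)
  {
          kfree(*data);                   /* called when the probe is removed */
          *data = NULL;
  }

  static void my_count_func(unsigned long ip, unsigned long parent_ip,
                            void **data)
  {
          (*(long *)*data)++;             /* runs on every hit of ip */
  }

  static struct ftrace_probe_ops my_count_probe_ops = {
          .func   = my_count_func,
          .init   = my_count_init,        /* formerly the .callback hook */
          .free   = my_count_free,
  };

Registration still goes through register_ftrace_function_probe(), which, as the kernel/trace/ftrace.c hunks further down show, now calls ops->init() for every matching record and ops->free() when the probe is unregistered.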
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 13a54d0bdfa8..34e00fb49bec 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -8,6 +8,7 @@
8#include <linux/perf_event.h> 8#include <linux/perf_event.h>
9 9
10struct trace_array; 10struct trace_array;
11struct trace_buffer;
11struct tracer; 12struct tracer;
12struct dentry; 13struct dentry;
13 14
@@ -38,6 +39,12 @@ const char *ftrace_print_symbols_seq_u64(struct trace_seq *p,
38const char *ftrace_print_hex_seq(struct trace_seq *p, 39const char *ftrace_print_hex_seq(struct trace_seq *p,
39 const unsigned char *buf, int len); 40 const unsigned char *buf, int len);
40 41
42struct trace_iterator;
43struct trace_event;
44
45int ftrace_raw_output_prep(struct trace_iterator *iter,
46 struct trace_event *event);
47
41/* 48/*
42 * The trace entry - the most basic unit of tracing. This is what 49 * The trace entry - the most basic unit of tracing. This is what
43 * is printed in the end as a single line in the trace output, such as: 50 * is printed in the end as a single line in the trace output, such as:
@@ -61,6 +68,7 @@ struct trace_entry {
61struct trace_iterator { 68struct trace_iterator {
62 struct trace_array *tr; 69 struct trace_array *tr;
63 struct tracer *trace; 70 struct tracer *trace;
71 struct trace_buffer *trace_buffer;
64 void *private; 72 void *private;
65 int cpu_file; 73 int cpu_file;
66 struct mutex mutex; 74 struct mutex mutex;
@@ -95,8 +103,6 @@ enum trace_iter_flags {
95}; 103};
96 104
97 105
98struct trace_event;
99
100typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, 106typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
101 int flags, struct trace_event *event); 107 int flags, struct trace_event *event);
102 108
@@ -128,6 +134,13 @@ enum print_line_t {
128void tracing_generic_entry_update(struct trace_entry *entry, 134void tracing_generic_entry_update(struct trace_entry *entry,
129 unsigned long flags, 135 unsigned long flags,
130 int pc); 136 int pc);
137struct ftrace_event_file;
138
139struct ring_buffer_event *
140trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer,
141 struct ftrace_event_file *ftrace_file,
142 int type, unsigned long len,
143 unsigned long flags, int pc);
131struct ring_buffer_event * 144struct ring_buffer_event *
132trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, 145trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
133 int type, unsigned long len, 146 int type, unsigned long len,
@@ -182,53 +195,49 @@ extern int ftrace_event_reg(struct ftrace_event_call *event,
182 enum trace_reg type, void *data); 195 enum trace_reg type, void *data);
183 196
184enum { 197enum {
185 TRACE_EVENT_FL_ENABLED_BIT,
186 TRACE_EVENT_FL_FILTERED_BIT, 198 TRACE_EVENT_FL_FILTERED_BIT,
187 TRACE_EVENT_FL_RECORDED_CMD_BIT,
188 TRACE_EVENT_FL_CAP_ANY_BIT, 199 TRACE_EVENT_FL_CAP_ANY_BIT,
189 TRACE_EVENT_FL_NO_SET_FILTER_BIT, 200 TRACE_EVENT_FL_NO_SET_FILTER_BIT,
190 TRACE_EVENT_FL_IGNORE_ENABLE_BIT, 201 TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
202 TRACE_EVENT_FL_WAS_ENABLED_BIT,
191}; 203};
192 204
205/*
206 * Event flags:
207 * FILTERED - The event has a filter attached
208 * CAP_ANY - Any user can enable for perf
209 * NO_SET_FILTER - Set when filter has error and is to be ignored
210 * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file
211 * WAS_ENABLED - Set and stays set when an event was ever enabled
212 * (used for module unloading, if a module event is enabled,
213 * it is best to clear the buffers that used it).
214 */
193enum { 215enum {
194 TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT),
195 TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), 216 TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
196 TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
197 TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), 217 TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
198 TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), 218 TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
199 TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), 219 TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
220 TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
200}; 221};
201 222
202struct ftrace_event_call { 223struct ftrace_event_call {
203 struct list_head list; 224 struct list_head list;
204 struct ftrace_event_class *class; 225 struct ftrace_event_class *class;
205 char *name; 226 char *name;
206 struct dentry *dir;
207 struct trace_event event; 227 struct trace_event event;
208 const char *print_fmt; 228 const char *print_fmt;
209 struct event_filter *filter; 229 struct event_filter *filter;
230 struct list_head *files;
210 void *mod; 231 void *mod;
211 void *data; 232 void *data;
212
213 /* 233 /*
214 * 32 bit flags: 234 * bit 0: filter_active
215 * bit 1: enabled 235 * bit 1: allow trace by non root (cap any)
216 * bit 2: filter_active 236 * bit 2: failed to apply filter
217 * bit 3: enabled cmd record 237 * bit 3: ftrace internal event (do not enable)
218 * bit 4: allow trace by non root (cap any) 238 * bit 4: Event was enabled by module
219 * bit 5: failed to apply filter
220 * bit 6: ftrace internal event (do not enable)
221 *
222 * Changes to flags must hold the event_mutex.
223 *
224 * Note: Reads of flags do not hold the event_mutex since
225 * they occur in critical sections. But the way flags
226 * is currently used, these changes do no affect the code
227 * except that when a change is made, it may have a slight
228 * delay in propagating the changes to other CPUs due to
229 * caching and such.
230 */ 239 */
231 unsigned int flags; 240 int flags; /* static flags of different events */
232 241
233#ifdef CONFIG_PERF_EVENTS 242#ifdef CONFIG_PERF_EVENTS
234 int perf_refcount; 243 int perf_refcount;
@@ -236,6 +245,56 @@ struct ftrace_event_call {
236#endif 245#endif
237}; 246};
238 247
248struct trace_array;
249struct ftrace_subsystem_dir;
250
251enum {
252 FTRACE_EVENT_FL_ENABLED_BIT,
253 FTRACE_EVENT_FL_RECORDED_CMD_BIT,
254 FTRACE_EVENT_FL_SOFT_MODE_BIT,
255 FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
256};
257
258/*
259 * Ftrace event file flags:
260 * ENABLED - The event is enabled
261 * RECORDED_CMD - The comms should be recorded at sched_switch
262 * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED
263 * SOFT_DISABLED - When set, do not trace the event (even though its
264 * tracepoint may be enabled)
265 */
266enum {
267 FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT),
268 FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT),
269 FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT),
270 FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT),
271};
272
273struct ftrace_event_file {
274 struct list_head list;
275 struct ftrace_event_call *event_call;
276 struct dentry *dir;
277 struct trace_array *tr;
278 struct ftrace_subsystem_dir *system;
279
280 /*
281 * 32 bit flags:
282 * bit 0: enabled
283 * bit 1: enabled cmd record
284 * bit 2: enable/disable with the soft disable bit
285 * bit 3: soft disabled
286 *
287 * Note: The bits must be set atomically to prevent races
288 * from other writers. Reads of flags do not need to be in
289 * sync as they occur in critical sections. But the way flags
290 * is currently used, these changes do not affect the code
291 * except that when a change is made, it may have a slight
292 * delay in propagating the changes to other CPUs due to
293 * caching and such. Which is mostly OK ;-)
294 */
295 unsigned long flags;
296};
297
239#define __TRACE_EVENT_FLAGS(name, value) \ 298#define __TRACE_EVENT_FLAGS(name, value) \
240 static int __init trace_init_flags_##name(void) \ 299 static int __init trace_init_flags_##name(void) \
241 { \ 300 { \
@@ -274,7 +333,7 @@ extern int trace_define_field(struct ftrace_event_call *call, const char *type,
274extern int trace_add_event_call(struct ftrace_event_call *call); 333extern int trace_add_event_call(struct ftrace_event_call *call);
275extern void trace_remove_event_call(struct ftrace_event_call *call); 334extern void trace_remove_event_call(struct ftrace_event_call *call);
276 335
277#define is_signed_type(type) (((type)(-1)) < (type)0) 336#define is_signed_type(type) (((type)(-1)) < (type)1)
278 337
279int trace_set_clr_event(const char *system, const char *event, int set); 338int trace_set_clr_event(const char *system, const char *event, int set);
280 339
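A small but easy-to-miss change above is is_signed_type() comparing against (type)1 instead of (type)0, presumably to avoid comparing an unsigned value against zero (which compilers like to flag as an always-false comparison). The classification it produces is the same; a quick user-space check (plain C, not kernel code) shows what the macro reports:

  #include <stdbool.h>
  #include <stdio.h>

  /* Same shape as the new form of the macro in the hunk above. */
  #define is_signed_type(type)    (((type)(-1)) < (type)1)

  int main(void)
  {
          /* -1 cast to a signed type stays negative and compares below 1;
           * cast to an unsigned type it wraps to the maximum value and
           * does not. bool saturates to 1, so it also reports unsigned. */
          printf("int          -> %d\n", is_signed_type(int));          /* 1 */
          printf("unsigned int -> %d\n", is_signed_type(unsigned int)); /* 0 */
          printf("bool         -> %d\n", is_signed_type(bool));         /* 0 */
          return 0;
  }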
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 79fdd80a42d4..2dac79c39199 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -486,6 +486,8 @@ enum ftrace_dump_mode {
486void tracing_on(void); 486void tracing_on(void);
487void tracing_off(void); 487void tracing_off(void);
488int tracing_is_on(void); 488int tracing_is_on(void);
489void tracing_snapshot(void);
490void tracing_snapshot_alloc(void);
489 491
490extern void tracing_start(void); 492extern void tracing_start(void);
491extern void tracing_stop(void); 493extern void tracing_stop(void);
@@ -515,10 +517,32 @@ do { \
515 * 517 *
516 * This is intended as a debugging tool for the developer only. 518 * This is intended as a debugging tool for the developer only.
517 * Please refrain from leaving trace_printks scattered around in 519 * Please refrain from leaving trace_printks scattered around in
518 * your code. 520 * your code. (Extra memory is used for special buffers that are
521 * allocated when trace_printk() is used)
522 *
523 * A little optimization trick is done here. If there's only one
524 * argument, there's no need to scan the string for printf formats.
525 * The trace_puts() will suffice. But how can we take advantage of
526 * using trace_puts() when trace_printk() has only one argument?
527 * By stringifying the args and checking the size we can tell
528 * whether or not there are args. __stringify((__VA_ARGS__)) will
529 * turn into "()\0" with a size of 3 when there are no args, anything
530 * else will be bigger. All we need to do is define a string to this,
531 * and then take its size and compare to 3. If it's bigger, use
532 * do_trace_printk() otherwise, optimize it to trace_puts(). Then just
533 * let gcc optimize the rest.
519 */ 534 */
520 535
521#define trace_printk(fmt, args...) \ 536#define trace_printk(fmt, ...) \
537do { \
538 char _______STR[] = __stringify((__VA_ARGS__)); \
539 if (sizeof(_______STR) > 3) \
540 do_trace_printk(fmt, ##__VA_ARGS__); \
541 else \
542 trace_puts(fmt); \
543} while (0)
544
545#define do_trace_printk(fmt, args...) \
522do { \ 546do { \
523 static const char *trace_printk_fmt \ 547 static const char *trace_printk_fmt \
524 __attribute__((section("__trace_printk_fmt"))) = \ 548 __attribute__((section("__trace_printk_fmt"))) = \
@@ -538,7 +562,45 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...);
538extern __printf(2, 3) 562extern __printf(2, 3)
539int __trace_printk(unsigned long ip, const char *fmt, ...); 563int __trace_printk(unsigned long ip, const char *fmt, ...);
540 564
541extern void trace_dump_stack(void); 565/**
566 * trace_puts - write a string into the ftrace buffer
567 * @str: the string to record
568 *
569 * Note: __trace_bputs is an internal function for trace_puts and
570 * the @ip is passed in via the trace_puts macro.
571 *
572 * This is similar to trace_printk() but is made for those really fast
573 * paths that a developer wants the least amount of "Heisenbug" affects,
574 * where the processing of the print format is still too much.
575 *
576 * This function allows a kernel developer to debug fast path sections
577 * that printk is not appropriate for. By scattering in various
578 * printk like tracing in the code, a developer can quickly see
579 * where problems are occurring.
580 *
581 * This is intended as a debugging tool for the developer only.
582 * Please refrain from leaving trace_puts scattered around in
583 * your code. (Extra memory is used for special buffers that are
584 * allocated when trace_puts() is used)
585 *
586 * Returns: 0 if nothing was written, positive # if string was.
587 * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used)
588 */
589
590extern int __trace_bputs(unsigned long ip, const char *str);
591extern int __trace_puts(unsigned long ip, const char *str, int size);
592#define trace_puts(str) ({ \
593 static const char *trace_printk_fmt \
594 __attribute__((section("__trace_printk_fmt"))) = \
595 __builtin_constant_p(str) ? str : NULL; \
596 \
597 if (__builtin_constant_p(str)) \
598 __trace_bputs(_THIS_IP_, trace_printk_fmt); \
599 else \
600 __trace_puts(_THIS_IP_, str, strlen(str)); \
601})
602
603extern void trace_dump_stack(int skip);
542 604
543/* 605/*
544 * The double __builtin_constant_p is because gcc will give us an error 606 * The double __builtin_constant_p is because gcc will give us an error
@@ -573,6 +635,8 @@ static inline void trace_dump_stack(void) { }
573static inline void tracing_on(void) { } 635static inline void tracing_on(void) { }
574static inline void tracing_off(void) { } 636static inline void tracing_off(void) { }
575static inline int tracing_is_on(void) { return 0; } 637static inline int tracing_is_on(void) { return 0; }
638static inline void tracing_snapshot(void) { }
639static inline void tracing_snapshot_alloc(void) { }
576 640
577static inline __printf(1, 2) 641static inline __printf(1, 2)
578int trace_printk(const char *fmt, ...) 642int trace_printk(const char *fmt, ...)
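The long comment above describes the __stringify() trick that lets trace_printk() degrade to trace_puts() when it is handed nothing but a constant string. Roughly, the effect on call sites is as follows (the surrounding function and its arguments are made up for illustration):

  #include <linux/kernel.h>

  static void my_poll_loop(int budget, int handled)       /* hypothetical caller */
  {
          /*
           * No extra arguments: __stringify((__VA_ARGS__)) is "()", size 3,
           * so the macro turns this into trace_puts(), which records the
           * constant string via __trace_bputs() with no format parsing at
           * trace time.
           */
          trace_printk("entering poll loop\n");

          /*
           * With arguments the stringified list is longer than 3 bytes, so
           * this expands to do_trace_printk() and behaves as trace_printk()
           * always has.
           */
          trace_printk("budget=%d handled=%d\n", budget, handled);
  }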
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 1342e69542f3..d69cf637a15a 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -4,6 +4,7 @@
4#include <linux/kmemcheck.h> 4#include <linux/kmemcheck.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/seq_file.h> 6#include <linux/seq_file.h>
7#include <linux/poll.h>
7 8
8struct ring_buffer; 9struct ring_buffer;
9struct ring_buffer_iter; 10struct ring_buffer_iter;
@@ -96,6 +97,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
96 __ring_buffer_alloc((size), (flags), &__key); \ 97 __ring_buffer_alloc((size), (flags), &__key); \
97}) 98})
98 99
100void ring_buffer_wait(struct ring_buffer *buffer, int cpu);
101int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
102 struct file *filp, poll_table *poll_table);
103
104
99#define RING_BUFFER_ALL_CPUS -1 105#define RING_BUFFER_ALL_CPUS -1
100 106
101void ring_buffer_free(struct ring_buffer *buffer); 107void ring_buffer_free(struct ring_buffer *buffer);
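ring_buffer_wait() and ring_buffer_poll_wait() are the new consumer-side hooks; their implementations appear in the kernel/trace/ring_buffer.c hunks below. A sketch of how a reader might sit on them, with a made-up my_iter container standing in for the real iterator in kernel/trace/trace.c:

  #include <linux/fs.h>
  #include <linux/poll.h>
  #include <linux/ring_buffer.h>

  struct my_iter {                        /* hypothetical reader state */
          struct ring_buffer      *buffer;
          int                     cpu;    /* or RING_BUFFER_ALL_CPUS */
  };

  static unsigned int my_pipe_poll(struct file *filp, poll_table *pt)
  {
          struct my_iter *iter = filp->private_data;

          /* Registers the wait queue with poll/select and returns
           * POLLIN | POLLRDNORM if data is already available, 0 otherwise. */
          return ring_buffer_poll_wait(iter->buffer, iter->cpu, filp, pt);
  }

  static void my_pipe_wait_for_data(struct my_iter *iter)
  {
          /* Blocks until a writer commits to the chosen cpu buffer (or to
           * any cpu buffer when cpu == RING_BUFFER_ALL_CPUS); the writer
           * side wakes us through the irq_work added below. */
          ring_buffer_wait(iter->buffer, iter->cpu);
  }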
diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h
index d563f37e1a1d..1d7ca2739272 100644
--- a/include/linux/trace_clock.h
+++ b/include/linux/trace_clock.h
@@ -16,6 +16,7 @@
16 16
17extern u64 notrace trace_clock_local(void); 17extern u64 notrace trace_clock_local(void);
18extern u64 notrace trace_clock(void); 18extern u64 notrace trace_clock(void);
19extern u64 notrace trace_clock_jiffies(void);
19extern u64 notrace trace_clock_global(void); 20extern u64 notrace trace_clock_global(void);
20extern u64 notrace trace_clock_counter(void); 21extern u64 notrace trace_clock_counter(void);
21 22
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 40dc5e8fe340..19edd7facaa1 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -227,29 +227,18 @@ static notrace enum print_line_t \
227ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ 227ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \
228 struct trace_event *trace_event) \ 228 struct trace_event *trace_event) \
229{ \ 229{ \
230 struct ftrace_event_call *event; \
231 struct trace_seq *s = &iter->seq; \ 230 struct trace_seq *s = &iter->seq; \
231 struct trace_seq __maybe_unused *p = &iter->tmp_seq; \
232 struct ftrace_raw_##call *field; \ 232 struct ftrace_raw_##call *field; \
233 struct trace_entry *entry; \
234 struct trace_seq *p = &iter->tmp_seq; \
235 int ret; \ 233 int ret; \
236 \ 234 \
237 event = container_of(trace_event, struct ftrace_event_call, \ 235 field = (typeof(field))iter->ent; \
238 event); \
239 \
240 entry = iter->ent; \
241 \
242 if (entry->type != event->event.type) { \
243 WARN_ON_ONCE(1); \
244 return TRACE_TYPE_UNHANDLED; \
245 } \
246 \
247 field = (typeof(field))entry; \
248 \ 236 \
249 trace_seq_init(p); \ 237 ret = ftrace_raw_output_prep(iter, trace_event); \
250 ret = trace_seq_printf(s, "%s: ", event->name); \
251 if (ret) \ 238 if (ret) \
252 ret = trace_seq_printf(s, print); \ 239 return ret; \
240 \
241 ret = trace_seq_printf(s, print); \
253 if (!ret) \ 242 if (!ret) \
254 return TRACE_TYPE_PARTIAL_LINE; \ 243 return TRACE_TYPE_PARTIAL_LINE; \
255 \ 244 \
@@ -335,7 +324,7 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \
335 324
336#undef DECLARE_EVENT_CLASS 325#undef DECLARE_EVENT_CLASS
337#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ 326#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \
338static int notrace \ 327static int notrace __init \
339ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ 328ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
340{ \ 329{ \
341 struct ftrace_raw_##call field; \ 330 struct ftrace_raw_##call field; \
@@ -414,7 +403,8 @@ static inline notrace int ftrace_get_offsets_##call( \
414 * 403 *
415 * static void ftrace_raw_event_<call>(void *__data, proto) 404 * static void ftrace_raw_event_<call>(void *__data, proto)
416 * { 405 * {
417 * struct ftrace_event_call *event_call = __data; 406 * struct ftrace_event_file *ftrace_file = __data;
407 * struct ftrace_event_call *event_call = ftrace_file->event_call;
418 * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets; 408 * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
419 * struct ring_buffer_event *event; 409 * struct ring_buffer_event *event;
420 * struct ftrace_raw_<call> *entry; <-- defined in stage 1 410 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
@@ -423,12 +413,16 @@ static inline notrace int ftrace_get_offsets_##call( \
423 * int __data_size; 413 * int __data_size;
424 * int pc; 414 * int pc;
425 * 415 *
416 * if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
417 * &ftrace_file->flags))
418 * return;
419 *
426 * local_save_flags(irq_flags); 420 * local_save_flags(irq_flags);
427 * pc = preempt_count(); 421 * pc = preempt_count();
428 * 422 *
429 * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args); 423 * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
430 * 424 *
431 * event = trace_current_buffer_lock_reserve(&buffer, 425 * event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
432 * event_<call>->event.type, 426 * event_<call>->event.type,
433 * sizeof(*entry) + __data_size, 427 * sizeof(*entry) + __data_size,
434 * irq_flags, pc); 428 * irq_flags, pc);
@@ -440,7 +434,7 @@ static inline notrace int ftrace_get_offsets_##call( \
440 * __array macros. 434 * __array macros.
441 * 435 *
442 * if (!filter_current_check_discard(buffer, event_call, entry, event)) 436 * if (!filter_current_check_discard(buffer, event_call, entry, event))
443 * trace_current_buffer_unlock_commit(buffer, 437 * trace_nowake_buffer_unlock_commit(buffer,
444 * event, irq_flags, pc); 438 * event, irq_flags, pc);
445 * } 439 * }
446 * 440 *
@@ -518,7 +512,8 @@ static inline notrace int ftrace_get_offsets_##call( \
518static notrace void \ 512static notrace void \
519ftrace_raw_event_##call(void *__data, proto) \ 513ftrace_raw_event_##call(void *__data, proto) \
520{ \ 514{ \
521 struct ftrace_event_call *event_call = __data; \ 515 struct ftrace_event_file *ftrace_file = __data; \
516 struct ftrace_event_call *event_call = ftrace_file->event_call; \
522 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 517 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
523 struct ring_buffer_event *event; \ 518 struct ring_buffer_event *event; \
524 struct ftrace_raw_##call *entry; \ 519 struct ftrace_raw_##call *entry; \
@@ -527,12 +522,16 @@ ftrace_raw_event_##call(void *__data, proto) \
527 int __data_size; \ 522 int __data_size; \
528 int pc; \ 523 int pc; \
529 \ 524 \
525 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, \
526 &ftrace_file->flags)) \
527 return; \
528 \
530 local_save_flags(irq_flags); \ 529 local_save_flags(irq_flags); \
531 pc = preempt_count(); \ 530 pc = preempt_count(); \
532 \ 531 \
533 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 532 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
534 \ 533 \
535 event = trace_current_buffer_lock_reserve(&buffer, \ 534 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, \
536 event_call->event.type, \ 535 event_call->event.type, \
537 sizeof(*entry) + __data_size, \ 536 sizeof(*entry) + __data_size, \
538 irq_flags, pc); \ 537 irq_flags, pc); \
@@ -581,7 +580,7 @@ static inline void ftrace_test_probe_##call(void) \
581#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 580#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
582_TRACE_PERF_PROTO(call, PARAMS(proto)); \ 581_TRACE_PERF_PROTO(call, PARAMS(proto)); \
583static const char print_fmt_##call[] = print; \ 582static const char print_fmt_##call[] = print; \
584static struct ftrace_event_class __used event_class_##call = { \ 583static struct ftrace_event_class __used __refdata event_class_##call = { \
585 .system = __stringify(TRACE_SYSTEM), \ 584 .system = __stringify(TRACE_SYSTEM), \
586 .define_fields = ftrace_define_fields_##call, \ 585 .define_fields = ftrace_define_fields_##call, \
587 .fields = LIST_HEAD_INIT(event_class_##call.fields),\ 586 .fields = LIST_HEAD_INIT(event_class_##call.fields),\
@@ -705,5 +704,3 @@ static inline void perf_test_probe_##call(void) \
705#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 704#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
706#endif /* CONFIG_PERF_EVENTS */ 705#endif /* CONFIG_PERF_EVENTS */
707 706
708#undef _TRACE_PROFILE_INIT
709
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fc382d6e2765..5e9efd4b83a4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -176,6 +176,8 @@ config IRQSOFF_TRACER
176 select GENERIC_TRACER 176 select GENERIC_TRACER
177 select TRACER_MAX_TRACE 177 select TRACER_MAX_TRACE
178 select RING_BUFFER_ALLOW_SWAP 178 select RING_BUFFER_ALLOW_SWAP
179 select TRACER_SNAPSHOT
180 select TRACER_SNAPSHOT_PER_CPU_SWAP
179 help 181 help
180 This option measures the time spent in irqs-off critical 182 This option measures the time spent in irqs-off critical
181 sections, with microsecond accuracy. 183 sections, with microsecond accuracy.
@@ -198,6 +200,8 @@ config PREEMPT_TRACER
198 select GENERIC_TRACER 200 select GENERIC_TRACER
199 select TRACER_MAX_TRACE 201 select TRACER_MAX_TRACE
200 select RING_BUFFER_ALLOW_SWAP 202 select RING_BUFFER_ALLOW_SWAP
203 select TRACER_SNAPSHOT
204 select TRACER_SNAPSHOT_PER_CPU_SWAP
201 help 205 help
202 This option measures the time spent in preemption-off critical 206 This option measures the time spent in preemption-off critical
203 sections, with microsecond accuracy. 207 sections, with microsecond accuracy.
@@ -217,6 +221,7 @@ config SCHED_TRACER
217 select GENERIC_TRACER 221 select GENERIC_TRACER
218 select CONTEXT_SWITCH_TRACER 222 select CONTEXT_SWITCH_TRACER
219 select TRACER_MAX_TRACE 223 select TRACER_MAX_TRACE
224 select TRACER_SNAPSHOT
220 help 225 help
221 This tracer tracks the latency of the highest priority task 226 This tracer tracks the latency of the highest priority task
222 to be scheduled in, starting from the point it has woken up. 227 to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
248 echo 1 > /sys/kernel/debug/tracing/snapshot 253 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot 254 cat snapshot
250 255
256config TRACER_SNAPSHOT_PER_CPU_SWAP
257 bool "Allow snapshot to swap per CPU"
258 depends on TRACER_SNAPSHOT
259 select RING_BUFFER_ALLOW_SWAP
260 help
261 Allow doing a snapshot of a single CPU buffer instead of a
262 full swap (all buffers). If this is set, then the following is
263 allowed:
264
265 echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
266
267 After which, only the tracing buffer for CPU 2 is swapped with
268 the main tracing buffer, and the other CPU buffers remain the same.
269
270 When this is enabled, it adds a little more overhead to the
271 trace recording, as it needs to add some checks to synchronize
272 recording with swaps. But this does not affect the performance
273 of the overall system. This is enabled by default when the preempt
274 or irq latency tracers are enabled, as those need to swap as well
275 and already add the overhead (plus a lot more).
276
251config TRACE_BRANCH_PROFILING 277config TRACE_BRANCH_PROFILING
252 bool 278 bool
253 select GENERIC_TRACER 279 select GENERIC_TRACER
@@ -524,6 +550,29 @@ config RING_BUFFER_BENCHMARK
524 550
525 If unsure, say N. 551 If unsure, say N.
526 552
553config RING_BUFFER_STARTUP_TEST
554 bool "Ring buffer startup self test"
555 depends on RING_BUFFER
556 help
557 Run a simple self test on the ring buffer on boot up. Late in the
558 kernel boot sequence, the test will start and kick off
559 a thread per cpu. Each thread will write various size events
560 into the ring buffer. Another thread is created to send IPIs
561 to each of the threads, where the IPI handler will also write
562 to the ring buffer, to test/stress the nesting ability.
563 If any anomalies are discovered, a warning will be displayed
564 and all ring buffers will be disabled.
565
566 The test runs for 10 seconds. This will slow your boot time
567 by at least 10 more seconds.
568
569 At the end of the test, statistics and more checks are done.
570 It will output the stats of each per cpu buffer: what
571 was written, the sizes, what was read, what was lost, and
572 other similar details.
573
574 If unsure, say N
575
527endif # FTRACE 576endif # FTRACE
528 577
529endif # TRACING_SUPPORT 578endif # TRACING_SUPPORT
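Since the irqsoff, preempt and sched/wakeup tracer configs above now select TRACER_SNAPSHOT, the snapshot hooks declared in include/linux/kernel.h earlier in this patch are the intended in-kernel interface to it. A minimal sketch of how a developer might use them while debugging (the my_* names are hypothetical):

  #include <linux/init.h>
  #include <linux/kernel.h>

  static int __init my_debug_init(void)
  {
          /* Allocate the snapshot buffer up front, from a context where
           * GFP_KERNEL allocations are allowed. */
          tracing_snapshot_alloc();
          return 0;
  }

  static void my_watchdog_fired(void)
  {
          /* Freeze the current trace into the snapshot buffer (readable
           * later from the 'snapshot' file) while normal tracing keeps
           * going in the main buffer. */
          tracing_snapshot();
  }

The same snapshot can also be triggered by hand, as the TRACER_SNAPSHOT help text above notes, by echoing 1 into the snapshot file.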
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5a0f781cd729..ed58a3216a6d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
72 bool blk_tracer = blk_tracer_enabled; 72 bool blk_tracer = blk_tracer_enabled;
73 73
74 if (blk_tracer) { 74 if (blk_tracer) {
75 buffer = blk_tr->buffer; 75 buffer = blk_tr->trace_buffer.buffer;
76 pc = preempt_count(); 76 pc = preempt_count();
77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
78 sizeof(*t) + len, 78 sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
218 if (blk_tracer) { 218 if (blk_tracer) {
219 tracing_record_cmdline(current); 219 tracing_record_cmdline(current);
220 220
221 buffer = blk_tr->buffer; 221 buffer = blk_tr->trace_buffer.buffer;
222 pc = preempt_count(); 222 pc = preempt_count();
223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
224 sizeof(*t) + pdu_len, 224 sizeof(*t) + pdu_len,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b3fde6d7b7fc..8a5c017bb50c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -486,7 +486,6 @@ struct ftrace_profile_stat {
486#define PROFILES_PER_PAGE \ 486#define PROFILES_PER_PAGE \
487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) 487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
488 488
489static int ftrace_profile_bits __read_mostly;
490static int ftrace_profile_enabled __read_mostly; 489static int ftrace_profile_enabled __read_mostly;
491 490
492/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ 491/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
494 493
495static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); 494static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
496 495
497#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ 496#define FTRACE_PROFILE_HASH_BITS 10
497#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
498 498
499static void * 499static void *
500function_stat_next(void *v, int idx) 500function_stat_next(void *v, int idx)
@@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
676 676
677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); 677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
678 678
679 for (i = 0; i < pages; i++) { 679 for (i = 1; i < pages; i++) {
680 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 680 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
681 if (!pg->next) 681 if (!pg->next)
682 goto out_free; 682 goto out_free;
@@ -724,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)
724 if (!stat->hash) 724 if (!stat->hash)
725 return -ENOMEM; 725 return -ENOMEM;
726 726
727 if (!ftrace_profile_bits) {
728 size--;
729
730 for (; size; size >>= 1)
731 ftrace_profile_bits++;
732 }
733
734 /* Preallocate the function profiling pages */ 727 /* Preallocate the function profiling pages */
735 if (ftrace_profile_pages_init(stat) < 0) { 728 if (ftrace_profile_pages_init(stat) < 0) {
736 kfree(stat->hash); 729 kfree(stat->hash);
@@ -763,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
763 struct hlist_head *hhd; 756 struct hlist_head *hhd;
764 unsigned long key; 757 unsigned long key;
765 758
766 key = hash_long(ip, ftrace_profile_bits); 759 key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
767 hhd = &stat->hash[key]; 760 hhd = &stat->hash[key];
768 761
769 if (hlist_empty(hhd)) 762 if (hlist_empty(hhd))
@@ -782,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
782{ 775{
783 unsigned long key; 776 unsigned long key;
784 777
785 key = hash_long(rec->ip, ftrace_profile_bits); 778 key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
786 hlist_add_head_rcu(&rec->node, &stat->hash[key]); 779 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
787} 780}
788 781
@@ -1079,7 +1072,7 @@ struct ftrace_func_probe {
1079 unsigned long flags; 1072 unsigned long flags;
1080 unsigned long ip; 1073 unsigned long ip;
1081 void *data; 1074 void *data;
1082 struct rcu_head rcu; 1075 struct list_head free_list;
1083}; 1076};
1084 1077
1085struct ftrace_func_entry { 1078struct ftrace_func_entry {
@@ -1329,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1329 struct hlist_head *hhd; 1322 struct hlist_head *hhd;
1330 struct ftrace_hash *old_hash; 1323 struct ftrace_hash *old_hash;
1331 struct ftrace_hash *new_hash; 1324 struct ftrace_hash *new_hash;
1332 unsigned long key;
1333 int size = src->count; 1325 int size = src->count;
1334 int bits = 0; 1326 int bits = 0;
1335 int ret; 1327 int ret;
@@ -1372,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1372 for (i = 0; i < size; i++) { 1364 for (i = 0; i < size; i++) {
1373 hhd = &src->buckets[i]; 1365 hhd = &src->buckets[i];
1374 hlist_for_each_entry_safe(entry, tn, hhd, hlist) { 1366 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1375 if (bits > 0)
1376 key = hash_long(entry->ip, bits);
1377 else
1378 key = 0;
1379 remove_hash_entry(src, entry); 1367 remove_hash_entry(src, entry);
1380 __add_hash_entry(new_hash, entry); 1368 __add_hash_entry(new_hash, entry);
1381 } 1369 }
@@ -2973,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)
2973} 2961}
2974 2962
2975 2963
2976static void ftrace_free_entry_rcu(struct rcu_head *rhp) 2964static void ftrace_free_entry(struct ftrace_func_probe *entry)
2977{ 2965{
2978 struct ftrace_func_probe *entry =
2979 container_of(rhp, struct ftrace_func_probe, rcu);
2980
2981 if (entry->ops->free) 2966 if (entry->ops->free)
2982 entry->ops->free(&entry->data); 2967 entry->ops->free(entry->ops, entry->ip, &entry->data);
2983 kfree(entry); 2968 kfree(entry);
2984} 2969}
2985 2970
2986
2987int 2971int
2988register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 2972register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2989 void *data) 2973 void *data)
2990{ 2974{
2991 struct ftrace_func_probe *entry; 2975 struct ftrace_func_probe *entry;
2976 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
2977 struct ftrace_hash *hash;
2992 struct ftrace_page *pg; 2978 struct ftrace_page *pg;
2993 struct dyn_ftrace *rec; 2979 struct dyn_ftrace *rec;
2994 int type, len, not; 2980 int type, len, not;
2995 unsigned long key; 2981 unsigned long key;
2996 int count = 0; 2982 int count = 0;
2997 char *search; 2983 char *search;
2984 int ret;
2998 2985
2999 type = filter_parse_regex(glob, strlen(glob), &search, &not); 2986 type = filter_parse_regex(glob, strlen(glob), &search, &not);
3000 len = strlen(search); 2987 len = strlen(search);
@@ -3005,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3005 2992
3006 mutex_lock(&ftrace_lock); 2993 mutex_lock(&ftrace_lock);
3007 2994
3008 if (unlikely(ftrace_disabled)) 2995 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2996 if (!hash) {
2997 count = -ENOMEM;
3009 goto out_unlock; 2998 goto out_unlock;
2999 }
3000
3001 if (unlikely(ftrace_disabled)) {
3002 count = -ENODEV;
3003 goto out_unlock;
3004 }
3010 3005
3011 do_for_each_ftrace_rec(pg, rec) { 3006 do_for_each_ftrace_rec(pg, rec) {
3012 3007
@@ -3030,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3030 * for each function we find. We call the callback 3025 * for each function we find. We call the callback
3031 * to give the caller an opportunity to do so. 3026 * to give the caller an opportunity to do so.
3032 */ 3027 */
3033 if (ops->callback) { 3028 if (ops->init) {
3034 if (ops->callback(rec->ip, &entry->data) < 0) { 3029 if (ops->init(ops, rec->ip, &entry->data) < 0) {
3035 /* caller does not like this func */ 3030 /* caller does not like this func */
3036 kfree(entry); 3031 kfree(entry);
3037 continue; 3032 continue;
3038 } 3033 }
3039 } 3034 }
3040 3035
3036 ret = enter_record(hash, rec, 0);
3037 if (ret < 0) {
3038 kfree(entry);
3039 count = ret;
3040 goto out_unlock;
3041 }
3042
3041 entry->ops = ops; 3043 entry->ops = ops;
3042 entry->ip = rec->ip; 3044 entry->ip = rec->ip;
3043 3045
@@ -3045,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3045 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); 3047 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3046 3048
3047 } while_for_each_ftrace_rec(); 3049 } while_for_each_ftrace_rec();
3050
3051 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3052 if (ret < 0)
3053 count = ret;
3054
3048 __enable_ftrace_function_probe(); 3055 __enable_ftrace_function_probe();
3049 3056
3050 out_unlock: 3057 out_unlock:
3051 mutex_unlock(&ftrace_lock); 3058 mutex_unlock(&ftrace_lock);
3059 free_ftrace_hash(hash);
3052 3060
3053 return count; 3061 return count;
3054} 3062}
@@ -3062,7 +3070,12 @@ static void
3062__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3070__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3063 void *data, int flags) 3071 void *data, int flags)
3064{ 3072{
3073 struct ftrace_func_entry *rec_entry;
3065 struct ftrace_func_probe *entry; 3074 struct ftrace_func_probe *entry;
3075 struct ftrace_func_probe *p;
3076 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
3077 struct list_head free_list;
3078 struct ftrace_hash *hash;
3066 struct hlist_node *tmp; 3079 struct hlist_node *tmp;
3067 char str[KSYM_SYMBOL_LEN]; 3080 char str[KSYM_SYMBOL_LEN];
3068 int type = MATCH_FULL; 3081 int type = MATCH_FULL;
@@ -3083,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3083 } 3096 }
3084 3097
3085 mutex_lock(&ftrace_lock); 3098 mutex_lock(&ftrace_lock);
3099
3100 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3101 if (!hash)
3102 /* Hmm, should report this somehow */
3103 goto out_unlock;
3104
3105 INIT_LIST_HEAD(&free_list);
3106
3086 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3107 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3087 struct hlist_head *hhd = &ftrace_func_hash[i]; 3108 struct hlist_head *hhd = &ftrace_func_hash[i];
3088 3109
@@ -3103,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3103 continue; 3124 continue;
3104 } 3125 }
3105 3126
3127 rec_entry = ftrace_lookup_ip(hash, entry->ip);
3128 /* It is possible more than one entry had this ip */
3129 if (rec_entry)
3130 free_hash_entry(hash, rec_entry);
3131
3106 hlist_del_rcu(&entry->node); 3132 hlist_del_rcu(&entry->node);
3107 call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); 3133 list_add(&entry->free_list, &free_list);
3108 } 3134 }
3109 } 3135 }
3110 __disable_ftrace_function_probe(); 3136 __disable_ftrace_function_probe();
3137 /*
3138 * Remove after the disable is called. Otherwise, if the last
3139 * probe is removed, a null hash means *all enabled*.
3140 */
3141 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3142 synchronize_sched();
3143 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3144 list_del(&entry->free_list);
3145 ftrace_free_entry(entry);
3146 }
3147
3148 out_unlock:
3111 mutex_unlock(&ftrace_lock); 3149 mutex_unlock(&ftrace_lock);
3150 free_ftrace_hash(hash);
3112} 3151}
3113 3152
3114void 3153void
@@ -3736,7 +3775,8 @@ out:
3736 if (fail) 3775 if (fail)
3737 return -EINVAL; 3776 return -EINVAL;
3738 3777
3739 ftrace_graph_filter_enabled = 1; 3778 ftrace_graph_filter_enabled = !!(*idx);
3779
3740 return 0; 3780 return 0;
3741} 3781}
3742 3782
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6989df2ba194..b59aea2c48c2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,13 +8,16 @@
8#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h>
11#include <linux/debugfs.h> 12#include <linux/debugfs.h>
12#include <linux/uaccess.h> 13#include <linux/uaccess.h>
13#include <linux/hardirq.h> 14#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */
14#include <linux/kmemcheck.h> 16#include <linux/kmemcheck.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/percpu.h> 18#include <linux/percpu.h>
17#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/delay.h>
18#include <linux/slab.h> 21#include <linux/slab.h>
19#include <linux/init.h> 22#include <linux/init.h>
20#include <linux/hash.h> 23#include <linux/hash.h>
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
444 return ret; 447 return ret;
445} 448}
446 449
450struct rb_irq_work {
451 struct irq_work work;
452 wait_queue_head_t waiters;
453 bool waiters_pending;
454};
455
447/* 456/*
448 * head_page == tail_page && head == tail then buffer is empty. 457 * head_page == tail_page && head == tail then buffer is empty.
449 */ 458 */
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {
478 struct list_head new_pages; /* new pages to add */ 487 struct list_head new_pages; /* new pages to add */
479 struct work_struct update_pages_work; 488 struct work_struct update_pages_work;
480 struct completion update_done; 489 struct completion update_done;
490
491 struct rb_irq_work irq_work;
481}; 492};
482 493
483struct ring_buffer { 494struct ring_buffer {
@@ -497,6 +508,8 @@ struct ring_buffer {
497 struct notifier_block cpu_notify; 508 struct notifier_block cpu_notify;
498#endif 509#endif
499 u64 (*clock)(void); 510 u64 (*clock)(void);
511
512 struct rb_irq_work irq_work;
500}; 513};
501 514
502struct ring_buffer_iter { 515struct ring_buffer_iter {
@@ -508,6 +521,118 @@ struct ring_buffer_iter {
508 u64 read_stamp; 521 u64 read_stamp;
509}; 522};
510 523
524/*
525 * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
526 *
527 * Schedules a delayed work to wake up any task that is blocked on the
528 * ring buffer waiters queue.
529 */
530static void rb_wake_up_waiters(struct irq_work *work)
531{
532 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
533
534 wake_up_all(&rbwork->waiters);
535}
536
537/**
538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on
541 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer.
545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{
548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait);
550 struct rb_irq_work *work;
551
552 /*
553 * Depending on what the caller is waiting for, either any
554 * data in any cpu buffer, or a specific buffer, put the
555 * caller on the appropriate wait queue.
556 */
557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work;
559 else {
560 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work;
562 }
563
564
565 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
566
567 /*
568 * The events can happen in critical sections where
569 * checking a work queue can cause deadlocks.
570 * After adding a task to the queue, this flag is set
571 * only to notify events to try to wake up the queue
572 * using irq_work.
573 *
574 * We don't clear it even if the buffer is no longer
575 * empty. The flag only causes the next event to run
576 * irq_work to do the work queue wake up. The worst
577 * that can happen if we race with !trace_empty() is that
578 * an event will cause an irq_work to try to wake up
579 * an empty queue.
580 *
581 * There's no reason to protect this flag either, as
582 * the work queue and irq_work logic will do the necessary
583 * synchronization for the wake ups. The only thing
584 * that is necessary is that the wake up happens after
585 * a task has been queued. It's OK for spurious wake ups.
586 */
587 work->waiters_pending = true;
588
589 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
590 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
591 schedule();
592
593 finish_wait(&work->waiters, &wait);
594}
595
596/**
597 * ring_buffer_poll_wait - poll on buffer input
598 * @buffer: buffer to wait on
599 * @cpu: the cpu buffer to wait on
600 * @filp: the file descriptor
601 * @poll_table: The poll descriptor
602 *
603 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
604 * as data is added to any of the @buffer's cpu buffers. Otherwise
605 * it will wait for data to be added to a specific cpu buffer.
606 *
607 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
608 * zero otherwise.
609 */
610int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
611 struct file *filp, poll_table *poll_table)
612{
613 struct ring_buffer_per_cpu *cpu_buffer;
614 struct rb_irq_work *work;
615
616 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
617 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
618 return POLLIN | POLLRDNORM;
619
620 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work;
622 else {
623 cpu_buffer = buffer->buffers[cpu];
624 work = &cpu_buffer->irq_work;
625 }
626
627 work->waiters_pending = true;
628 poll_wait(filp, &work->waiters, poll_table);
629
630 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
631 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
632 return POLLIN | POLLRDNORM;
633 return 0;
634}
635
511/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 636/* buffer may be either ring_buffer or ring_buffer_per_cpu */
512#define RB_WARN_ON(b, cond) \ 637#define RB_WARN_ON(b, cond) \
513 ({ \ 638 ({ \
@@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1063 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1188 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1064 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); 1189 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1065 init_completion(&cpu_buffer->update_done); 1190 init_completion(&cpu_buffer->update_done);
1191 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1192 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1066 1193
1067 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1194 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1068 GFP_KERNEL, cpu_to_node(cpu)); 1195 GFP_KERNEL, cpu_to_node(cpu));
@@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1158 buffer->clock = trace_clock_local; 1285 buffer->clock = trace_clock_local;
1159 buffer->reader_lock_key = key; 1286 buffer->reader_lock_key = key;
1160 1287
1288 init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
1289 init_waitqueue_head(&buffer->irq_work.waiters);
1290
1161 /* need at least two pages */ 1291 /* need at least two pages */
1162 if (nr_pages < 2) 1292 if (nr_pages < 2)
1163 nr_pages = 2; 1293 nr_pages = 2;
@@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1553 if (!cpu_buffer->nr_pages_to_update) 1683 if (!cpu_buffer->nr_pages_to_update)
1554 continue; 1684 continue;
1555 1685
1556 if (cpu_online(cpu)) 1686 /* The update must run on the CPU that is being updated. */
1687 preempt_disable();
1688 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1689 rb_update_pages(cpu_buffer);
1690 cpu_buffer->nr_pages_to_update = 0;
1691 } else {
1692 /*
1693 * Can not disable preemption for schedule_work_on()
1694 * on PREEMPT_RT.
1695 */
1696 preempt_enable();
1557 schedule_work_on(cpu, 1697 schedule_work_on(cpu,
1558 &cpu_buffer->update_pages_work); 1698 &cpu_buffer->update_pages_work);
1559 else 1699 preempt_disable();
1560 rb_update_pages(cpu_buffer); 1700 }
1701 preempt_enable();
1561 } 1702 }
1562 1703
1563 /* wait for all the updates to complete */ 1704 /* wait for all the updates to complete */
@@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1595 1736
1596 get_online_cpus(); 1737 get_online_cpus();
1597 1738
1598 if (cpu_online(cpu_id)) { 1739 preempt_disable();
1740 /* The update must run on the CPU that is being updated. */
1741 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1742 rb_update_pages(cpu_buffer);
1743 else {
1744 /*
1745 * Can not disable preemption for schedule_work_on()
1746 * on PREEMPT_RT.
1747 */
1748 preempt_enable();
1599 schedule_work_on(cpu_id, 1749 schedule_work_on(cpu_id,
1600 &cpu_buffer->update_pages_work); 1750 &cpu_buffer->update_pages_work);
1601 wait_for_completion(&cpu_buffer->update_done); 1751 wait_for_completion(&cpu_buffer->update_done);
1602 } else 1752 preempt_disable();
1603 rb_update_pages(cpu_buffer); 1753 }
1754 preempt_enable();
1604 1755
1605 cpu_buffer->nr_pages_to_update = 0; 1756 cpu_buffer->nr_pages_to_update = 0;
1606 put_online_cpus(); 1757 put_online_cpus();
@@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2612 rb_end_commit(cpu_buffer); 2763 rb_end_commit(cpu_buffer);
2613} 2764}
2614 2765
2766static __always_inline void
2767rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2768{
2769 if (buffer->irq_work.waiters_pending) {
2770 buffer->irq_work.waiters_pending = false;
2771 /* irq_work_queue() supplies its own memory barriers */
2772 irq_work_queue(&buffer->irq_work.work);
2773 }
2774
2775 if (cpu_buffer->irq_work.waiters_pending) {
2776 cpu_buffer->irq_work.waiters_pending = false;
2777 /* irq_work_queue() supplies its own memory barriers */
2778 irq_work_queue(&cpu_buffer->irq_work.work);
2779 }
2780}
2781
2615/** 2782/**
2616 * ring_buffer_unlock_commit - commit a reserved 2783 * ring_buffer_unlock_commit - commit a reserved
2617 * @buffer: The buffer to commit to 2784 * @buffer: The buffer to commit to
@@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2631 2798
2632 rb_commit(cpu_buffer, event); 2799 rb_commit(cpu_buffer, event);
2633 2800
2801 rb_wakeups(buffer, cpu_buffer);
2802
2634 trace_recursive_unlock(); 2803 trace_recursive_unlock();
2635 2804
2636 preempt_enable_notrace(); 2805 preempt_enable_notrace();
@@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
2803 2972
2804 rb_commit(cpu_buffer, event); 2973 rb_commit(cpu_buffer, event);
2805 2974
2975 rb_wakeups(buffer, cpu_buffer);
2976
2806 ret = 0; 2977 ret = 0;
2807 out: 2978 out:
2808 preempt_enable_notrace(); 2979 preempt_enable_notrace();
@@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,
4467 return NOTIFY_OK; 4638 return NOTIFY_OK;
4468} 4639}
4469#endif 4640#endif
4641
4642#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
4643/*
4644 * This is a basic integrity check of the ring buffer.
4645 * Late in the boot cycle this test will run when configured in.
4646 * It will kick off a thread per CPU that will go into a loop
4647 * writing to the per cpu ring buffer various sizes of data.
4648 * Some of the data will be large items, some small.
4649 *
4650 * Another thread is created that goes into a spin, sending out
4651 * IPIs to the other CPUs to also write into the ring buffer.
4652 * This is to test the nesting ability of the buffer.
4653 *
4654 * Basic stats are recorded and reported. If something in the
4655 * ring buffer should happen that's not expected, a big warning
4656 * is displayed and all ring buffers are disabled.
4657 */
4658static struct task_struct *rb_threads[NR_CPUS] __initdata;
4659
4660struct rb_test_data {
4661 struct ring_buffer *buffer;
4662 unsigned long events;
4663 unsigned long bytes_written;
4664 unsigned long bytes_alloc;
4665 unsigned long bytes_dropped;
4666 unsigned long events_nested;
4667 unsigned long bytes_written_nested;
4668 unsigned long bytes_alloc_nested;
4669 unsigned long bytes_dropped_nested;
4670 int min_size_nested;
4671 int max_size_nested;
4672 int max_size;
4673 int min_size;
4674 int cpu;
4675 int cnt;
4676};
4677
4678static struct rb_test_data rb_data[NR_CPUS] __initdata;
4679
4680/* 1 meg per cpu */
4681#define RB_TEST_BUFFER_SIZE 1048576
4682
4683static char rb_string[] __initdata =
4684 "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
4685 "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
4686 "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
4687
4688static bool rb_test_started __initdata;
4689
4690struct rb_item {
4691 int size;
4692 char str[];
4693};
4694
4695static __init int rb_write_something(struct rb_test_data *data, bool nested)
4696{
4697 struct ring_buffer_event *event;
4698 struct rb_item *item;
4699 bool started;
4700 int event_len;
4701 int size;
4702 int len;
4703 int cnt;
4704
4705 /* Have nested writes different than what is written */
4706 cnt = data->cnt + (nested ? 27 : 0);
4707
4708 /* Multiply cnt by ~e, to make some unique increment */
4709 size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1);
4710
4711 len = size + sizeof(struct rb_item);
4712
4713 started = rb_test_started;
4714 /* read rb_test_started before checking buffer enabled */
4715 smp_rmb();
4716
4717 event = ring_buffer_lock_reserve(data->buffer, len);
4718 if (!event) {
4719 /* Ignore dropped events before test starts. */
4720 if (started) {
4721 if (nested)
4722 data->bytes_dropped_nested += len;
4723 else
4724 data->bytes_dropped += len;
4725 }
4726 return len;
4727 }
4728
4729 event_len = ring_buffer_event_length(event);
4730
4731 if (RB_WARN_ON(data->buffer, event_len < len))
4732 goto out;
4733
4734 item = ring_buffer_event_data(event);
4735 item->size = size;
4736 memcpy(item->str, rb_string, size);
4737
4738 if (nested) {
4739 data->bytes_alloc_nested += event_len;
4740 data->bytes_written_nested += len;
4741 data->events_nested++;
4742 if (!data->min_size_nested || len < data->min_size_nested)
4743 data->min_size_nested = len;
4744 if (len > data->max_size_nested)
4745 data->max_size_nested = len;
4746 } else {
4747 data->bytes_alloc += event_len;
4748 data->bytes_written += len;
4749 data->events++;
4750 if (!data->min_size || len < data->min_size)
 4751 data->min_size = len;
4752 if (len > data->max_size)
4753 data->max_size = len;
4754 }
4755
4756 out:
4757 ring_buffer_unlock_commit(data->buffer, event);
4758
4759 return 0;
4760}
4761
4762static __init int rb_test(void *arg)
4763{
4764 struct rb_test_data *data = arg;
4765
4766 while (!kthread_should_stop()) {
4767 rb_write_something(data, false);
4768 data->cnt++;
4769
4770 set_current_state(TASK_INTERRUPTIBLE);
4771 /* Now sleep between a min of 100-300us and a max of 1ms */
4772 usleep_range(((data->cnt % 3) + 1) * 100, 1000);
4773 }
4774
4775 return 0;
4776}
4777
4778static __init void rb_ipi(void *ignore)
4779{
4780 struct rb_test_data *data;
4781 int cpu = smp_processor_id();
4782
4783 data = &rb_data[cpu];
4784 rb_write_something(data, true);
4785}
4786
4787static __init int rb_hammer_test(void *arg)
4788{
4789 while (!kthread_should_stop()) {
4790
4791 /* Send an IPI to all cpus to write data! */
4792 smp_call_function(rb_ipi, NULL, 1);
 4793 /* No sleep, but for non-preempt, let others run */
4794 schedule();
4795 }
4796
4797 return 0;
4798}
4799
4800static __init int test_ringbuffer(void)
4801{
4802 struct task_struct *rb_hammer;
4803 struct ring_buffer *buffer;
4804 int cpu;
4805 int ret = 0;
4806
4807 pr_info("Running ring buffer tests...\n");
4808
4809 buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
4810 if (WARN_ON(!buffer))
4811 return 0;
4812
4813 /* Disable buffer so that threads can't write to it yet */
4814 ring_buffer_record_off(buffer);
4815
4816 for_each_online_cpu(cpu) {
4817 rb_data[cpu].buffer = buffer;
4818 rb_data[cpu].cpu = cpu;
4819 rb_data[cpu].cnt = cpu;
4820 rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
4821 "rbtester/%d", cpu);
4822 if (WARN_ON(!rb_threads[cpu])) {
4823 pr_cont("FAILED\n");
4824 ret = -1;
4825 goto out_free;
4826 }
4827
4828 kthread_bind(rb_threads[cpu], cpu);
4829 wake_up_process(rb_threads[cpu]);
4830 }
4831
4832 /* Now create the rb hammer! */
4833 rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
4834 if (WARN_ON(!rb_hammer)) {
4835 pr_cont("FAILED\n");
4836 ret = -1;
4837 goto out_free;
4838 }
4839
4840 ring_buffer_record_on(buffer);
4841 /*
4842 * Show buffer is enabled before setting rb_test_started.
 4843 * Yes, there's a small race window where events could be
 4844 * dropped and the thread won't catch it. But when a ring
4845 * buffer gets enabled, there will always be some kind of
4846 * delay before other CPUs see it. Thus, we don't care about
4847 * those dropped events. We care about events dropped after
4848 * the threads see that the buffer is active.
4849 */
4850 smp_wmb();
4851 rb_test_started = true;
4852
4853 set_current_state(TASK_INTERRUPTIBLE);
 4854 /* Just run for 10 seconds */
4855 schedule_timeout(10 * HZ);
4856
4857 kthread_stop(rb_hammer);
4858
4859 out_free:
4860 for_each_online_cpu(cpu) {
4861 if (!rb_threads[cpu])
4862 break;
4863 kthread_stop(rb_threads[cpu]);
4864 }
4865 if (ret) {
4866 ring_buffer_free(buffer);
4867 return ret;
4868 }
4869
4870 /* Report! */
4871 pr_info("finished\n");
4872 for_each_online_cpu(cpu) {
4873 struct ring_buffer_event *event;
4874 struct rb_test_data *data = &rb_data[cpu];
4875 struct rb_item *item;
4876 unsigned long total_events;
4877 unsigned long total_dropped;
4878 unsigned long total_written;
4879 unsigned long total_alloc;
4880 unsigned long total_read = 0;
4881 unsigned long total_size = 0;
4882 unsigned long total_len = 0;
4883 unsigned long total_lost = 0;
4884 unsigned long lost;
4885 int big_event_size;
4886 int small_event_size;
4887
4888 ret = -1;
4889
4890 total_events = data->events + data->events_nested;
4891 total_written = data->bytes_written + data->bytes_written_nested;
4892 total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
4893 total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
4894
4895 big_event_size = data->max_size + data->max_size_nested;
4896 small_event_size = data->min_size + data->min_size_nested;
4897
4898 pr_info("CPU %d:\n", cpu);
4899 pr_info(" events: %ld\n", total_events);
4900 pr_info(" dropped bytes: %ld\n", total_dropped);
4901 pr_info(" alloced bytes: %ld\n", total_alloc);
4902 pr_info(" written bytes: %ld\n", total_written);
4903 pr_info(" biggest event: %d\n", big_event_size);
4904 pr_info(" smallest event: %d\n", small_event_size);
4905
4906 if (RB_WARN_ON(buffer, total_dropped))
4907 break;
4908
4909 ret = 0;
4910
4911 while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
4912 total_lost += lost;
4913 item = ring_buffer_event_data(event);
4914 total_len += ring_buffer_event_length(event);
4915 total_size += item->size + sizeof(struct rb_item);
4916 if (memcmp(&item->str[0], rb_string, item->size) != 0) {
4917 pr_info("FAILED!\n");
4918 pr_info("buffer had: %.*s\n", item->size, item->str);
4919 pr_info("expected: %.*s\n", item->size, rb_string);
4920 RB_WARN_ON(buffer, 1);
4921 ret = -1;
4922 break;
4923 }
4924 total_read++;
4925 }
4926 if (ret)
4927 break;
4928
4929 ret = -1;
4930
4931 pr_info(" read events: %ld\n", total_read);
4932 pr_info(" lost events: %ld\n", total_lost);
4933 pr_info(" total events: %ld\n", total_lost + total_read);
4934 pr_info(" recorded len bytes: %ld\n", total_len);
4935 pr_info(" recorded size bytes: %ld\n", total_size);
4936 if (total_lost)
4937 pr_info(" With dropped events, record len and size may not match\n"
4938 " alloced and written from above\n");
4939 if (!total_lost) {
4940 if (RB_WARN_ON(buffer, total_len != total_alloc ||
4941 total_size != total_written))
4942 break;
4943 }
4944 if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
4945 break;
4946
4947 ret = 0;
4948 }
4949 if (!ret)
4950 pr_info("Ring buffer PASSED!\n");
4951
4952 ring_buffer_free(buffer);
4953 return 0;
4954}
4955
4956late_initcall(test_ringbuffer);
4957#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
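
The self test above only exercises APIs the ring buffer already exports, and which appear elsewhere in this patch: reserve, fill, commit, then consume per CPU. For orientation, here is a minimal sketch of that same cycle. It is illustrative only and not part of the patch; the initcall name rb_sketch_init and the payload struct my_record are invented, and error handling is kept to the bare minimum.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/ring_buffer.h>

struct my_record {                      /* hypothetical payload */
        int value;
};

static int __init rb_sketch_init(void)
{
        struct ring_buffer *buffer;
        struct ring_buffer_event *event;
        struct my_record *rec;
        unsigned long lost;
        int cpu;

        /* One small per-cpu buffer, overwriting the oldest data when full */
        buffer = ring_buffer_alloc(16 * 1024, RB_FL_OVERWRITE);
        if (!buffer)
                return -ENOMEM;

        /* Reserve space, fill it, then commit it (as rb_write_something() does) */
        event = ring_buffer_lock_reserve(buffer, sizeof(*rec));
        if (event) {
                rec = ring_buffer_event_data(event);
                rec->value = 42;
                ring_buffer_unlock_commit(buffer, event);
        }

        /* Drain each cpu, the same way the test verifies its writes */
        for_each_online_cpu(cpu) {
                while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
                        rec = ring_buffer_event_data(event);
                        pr_info("rb_sketch: cpu %d wrote %d\n", cpu, rec->value);
                }
        }

        ring_buffer_free(buffer);
        return 0;
}
late_initcall(rb_sketch_init);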
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 66338c4f7f4b..581630a6387d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * ring buffer based function tracer 2 * ring buffer based function tracer
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 * 6 *
7 * Originally taken from the RT patch by: 7 * Originally taken from the RT patch by:
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/hardirq.h> 24#include <linux/hardirq.h>
@@ -48,7 +47,7 @@
48 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
50 */ 49 */
51int ring_buffer_expanded; 50bool ring_buffer_expanded;
52 51
53/* 52/*
54 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
87static DEFINE_PER_CPU(bool, trace_cmdline_save); 86static DEFINE_PER_CPU(bool, trace_cmdline_save);
88 87
89/* 88/*
90 * When a reader is waiting for data, then this variable is
91 * set to true.
92 */
93static bool trace_wakeup_needed;
94
95static struct irq_work trace_work_wakeup;
96
97/*
98 * Kill all tracing for good (never come back). 89 * Kill all tracing for good (never come back).
99 * It is initialized to 1 but will turn to zero if the initialization 90 * It is initialized to 1 but will turn to zero if the initialization
100 * of the tracer is successful. But that is the only place that sets 91 * of the tracer is successful. But that is the only place that sets
@@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);
130static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 121static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
131static char *default_bootup_tracer; 122static char *default_bootup_tracer;
132 123
124static bool allocate_snapshot;
125
133static int __init set_cmdline_ftrace(char *str) 126static int __init set_cmdline_ftrace(char *str)
134{ 127{
135 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 128 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
136 default_bootup_tracer = bootup_tracer_buf; 129 default_bootup_tracer = bootup_tracer_buf;
137 /* We are using ftrace early, expand it */ 130 /* We are using ftrace early, expand it */
138 ring_buffer_expanded = 1; 131 ring_buffer_expanded = true;
139 return 1; 132 return 1;
140} 133}
141__setup("ftrace=", set_cmdline_ftrace); 134__setup("ftrace=", set_cmdline_ftrace);
@@ -156,6 +149,15 @@ static int __init set_ftrace_dump_on_oops(char *str)
156} 149}
157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
158 151
152static int __init boot_alloc_snapshot(char *str)
153{
154 allocate_snapshot = true;
155 /* We also need the main ring buffer expanded */
156 ring_buffer_expanded = true;
157 return 1;
158}
159__setup("alloc_snapshot", boot_alloc_snapshot);
160
159 161
160static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; 162static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
161static char *trace_boot_options __initdata; 163static char *trace_boot_options __initdata;
@@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)
189 */ 191 */
190static struct trace_array global_trace; 192static struct trace_array global_trace;
191 193
192static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 194LIST_HEAD(ftrace_trace_arrays);
193 195
194int filter_current_check_discard(struct ring_buffer *buffer, 196int filter_current_check_discard(struct ring_buffer *buffer,
195 struct ftrace_event_call *call, void *rec, 197 struct ftrace_event_call *call, void *rec,
@@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)
204 u64 ts; 206 u64 ts;
205 207
206 /* Early boot up does not have a buffer yet */ 208 /* Early boot up does not have a buffer yet */
207 if (!global_trace.buffer) 209 if (!global_trace.trace_buffer.buffer)
208 return trace_clock_local(); 210 return trace_clock_local();
209 211
210 ts = ring_buffer_time_stamp(global_trace.buffer, cpu); 212 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
211 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); 213 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
212 214
213 return ts; 215 return ts;
214} 216}
215 217
216/*
217 * The max_tr is used to snapshot the global_trace when a maximum
218 * latency is reached. Some tracers will use this to store a maximum
219 * trace while it continues examining live traces.
220 *
221 * The buffers for the max_tr are set up the same as the global_trace.
222 * When a snapshot is taken, the link list of the max_tr is swapped
223 * with the link list of the global_trace and the buffers are reset for
224 * the global_trace so the tracing can continue.
225 */
226static struct trace_array max_tr;
227
228static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
229
230int tracing_is_enabled(void) 218int tracing_is_enabled(void)
231{ 219{
232 return tracing_is_on(); 220 return tracing_is_on();
@@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249/* trace_types holds a link list of available tracers. */ 237/* trace_types holds a link list of available tracers. */
250static struct tracer *trace_types __read_mostly; 238static struct tracer *trace_types __read_mostly;
251 239
252/* current_trace points to the tracer that is currently active */
253static struct tracer *current_trace __read_mostly = &nop_trace;
254
255/* 240/*
256 * trace_types_lock is used to protect the trace_types list. 241 * trace_types_lock is used to protect the trace_types list.
257 */ 242 */
@@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
285 270
286static inline void trace_access_lock(int cpu) 271static inline void trace_access_lock(int cpu)
287{ 272{
288 if (cpu == TRACE_PIPE_ALL_CPU) { 273 if (cpu == RING_BUFFER_ALL_CPUS) {
289 /* gain it for accessing the whole ring buffer. */ 274 /* gain it for accessing the whole ring buffer. */
290 down_write(&all_cpu_access_lock); 275 down_write(&all_cpu_access_lock);
291 } else { 276 } else {
292 /* gain it for accessing a cpu ring buffer. */ 277 /* gain it for accessing a cpu ring buffer. */
293 278
294 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ 279 /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
295 down_read(&all_cpu_access_lock); 280 down_read(&all_cpu_access_lock);
296 281
297 /* Secondly block other access to this @cpu ring buffer. */ 282 /* Secondly block other access to this @cpu ring buffer. */
@@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)
301 286
302static inline void trace_access_unlock(int cpu) 287static inline void trace_access_unlock(int cpu)
303{ 288{
304 if (cpu == TRACE_PIPE_ALL_CPU) { 289 if (cpu == RING_BUFFER_ALL_CPUS) {
305 up_write(&all_cpu_access_lock); 290 up_write(&all_cpu_access_lock);
306 } else { 291 } else {
307 mutex_unlock(&per_cpu(cpu_access_lock, cpu)); 292 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)
339 324
340#endif 325#endif
341 326
342/* trace_wait is a waitqueue for tasks blocked on trace_poll */
343static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344
345/* trace_flags holds trace_options default values */ 327/* trace_flags holds trace_options default values */
346unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 328unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
347 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 329 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
348 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
349 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; 331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
350
351static int trace_stop_count;
352static DEFINE_RAW_SPINLOCK(tracing_start_lock);
353
354/**
355 * trace_wake_up - wake up tasks waiting for trace input
356 *
357 * Schedules a delayed work to wake up any task that is blocked on the
358 * trace_wait queue. These is used with trace_poll for tasks polling the
359 * trace.
360 */
361static void trace_wake_up(struct irq_work *work)
362{
363 wake_up_all(&trace_wait);
364
365}
366 332
367/** 333/**
368 * tracing_on - enable tracing buffers 334 * tracing_on - enable tracing buffers
@@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)
372 */ 338 */
373void tracing_on(void) 339void tracing_on(void)
374{ 340{
375 if (global_trace.buffer) 341 if (global_trace.trace_buffer.buffer)
376 ring_buffer_record_on(global_trace.buffer); 342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
377 /* 343 /*
378 * This flag is only looked at when buffers haven't been 344 * This flag is only looked at when buffers haven't been
379 * allocated yet. We don't really care about the race 345 * allocated yet. We don't really care about the race
@@ -385,6 +351,196 @@ void tracing_on(void)
385EXPORT_SYMBOL_GPL(tracing_on); 351EXPORT_SYMBOL_GPL(tracing_on);
386 352
387/** 353/**
354 * __trace_puts - write a constant string into the trace buffer.
355 * @ip: The address of the caller
356 * @str: The constant string to write
357 * @size: The size of the string.
358 */
359int __trace_puts(unsigned long ip, const char *str, int size)
360{
361 struct ring_buffer_event *event;
362 struct ring_buffer *buffer;
363 struct print_entry *entry;
364 unsigned long irq_flags;
365 int alloc;
366
367 alloc = sizeof(*entry) + size + 2; /* possible \n added */
368
369 local_save_flags(irq_flags);
370 buffer = global_trace.trace_buffer.buffer;
371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
372 irq_flags, preempt_count());
373 if (!event)
374 return 0;
375
376 entry = ring_buffer_event_data(event);
377 entry->ip = ip;
378
379 memcpy(&entry->buf, str, size);
380
381 /* Add a newline if necessary */
382 if (entry->buf[size - 1] != '\n') {
383 entry->buf[size] = '\n';
384 entry->buf[size + 1] = '\0';
385 } else
386 entry->buf[size] = '\0';
387
388 __buffer_unlock_commit(buffer, event);
389
390 return size;
391}
392EXPORT_SYMBOL_GPL(__trace_puts);
393
394/**
395 * __trace_bputs - write the pointer to a constant string into trace buffer
396 * @ip: The address of the caller
 397 * @str: The constant string to write to the buffer
398 */
399int __trace_bputs(unsigned long ip, const char *str)
400{
401 struct ring_buffer_event *event;
402 struct ring_buffer *buffer;
403 struct bputs_entry *entry;
404 unsigned long irq_flags;
405 int size = sizeof(struct bputs_entry);
406
407 local_save_flags(irq_flags);
408 buffer = global_trace.trace_buffer.buffer;
409 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
410 irq_flags, preempt_count());
411 if (!event)
412 return 0;
413
414 entry = ring_buffer_event_data(event);
415 entry->ip = ip;
416 entry->str = str;
417
418 __buffer_unlock_commit(buffer, event);
419
420 return 1;
421}
422EXPORT_SYMBOL_GPL(__trace_bputs);
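
Both helpers are meant to sit behind the trace_puts() wrapper that this series adds to linux/kernel.h: for a string literal it can record only the pointer via __trace_bputs(), otherwise the text is copied via __trace_puts(). A rough usage sketch, with my_hot_path() as a made-up caller (the exact wrapper definition lives in the kernel.h hunk of this series, not reproduced here):

#include <linux/kernel.h>

static void my_hot_path(void)           /* hypothetical function */
{
        /* literal string: only its address needs to reach the ring buffer */
        trace_puts("my_hot_path: entered\n");
}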
423
424#ifdef CONFIG_TRACER_SNAPSHOT
425/**
 426 * tracing_snapshot - take a snapshot of the current buffer.
427 *
428 * This causes a swap between the snapshot buffer and the current live
429 * tracing buffer. You can use this to take snapshots of the live
430 * trace when some condition is triggered, but continue to trace.
431 *
432 * Note, make sure to allocate the snapshot with either
433 * a tracing_snapshot_alloc(), or by doing it manually
434 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
435 *
 436 * If the snapshot buffer is not allocated, it will stop tracing,
 437 * basically making a permanent snapshot.
438 */
439void tracing_snapshot(void)
440{
441 struct trace_array *tr = &global_trace;
442 struct tracer *tracer = tr->current_trace;
443 unsigned long flags;
444
445 if (in_nmi()) {
446 internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
447 internal_trace_puts("*** snapshot is being ignored ***\n");
448 return;
449 }
450
451 if (!tr->allocated_snapshot) {
452 internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
453 internal_trace_puts("*** stopping trace here! ***\n");
454 tracing_off();
455 return;
456 }
457
458 /* Note, snapshot can not be used when the tracer uses it */
459 if (tracer->use_max_tr) {
460 internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
461 internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
462 return;
463 }
464
465 local_irq_save(flags);
466 update_max_tr(tr, current, smp_processor_id());
467 local_irq_restore(flags);
468}
469EXPORT_SYMBOL_GPL(tracing_snapshot);
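
A hedged usage sketch: the intended pattern is to arm the snapshot buffer ahead of time and then call tracing_snapshot() from the point of interest, even from atomic context (NMI being the exception handled above). my_condition_hit() below is an invented predicate, not part of the patch.

#include <linux/kernel.h>

static bool my_condition_hit(void)      /* placeholder predicate */
{
        return false;
}

static void my_check(void)
{
        static bool once;

        if (my_condition_hit() && !once) {
                once = true;
                /* swap the live buffer with the spare and keep tracing */
                tracing_snapshot();
        }
}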
470
471static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
472 struct trace_buffer *size_buf, int cpu_id);
473static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
474
475static int alloc_snapshot(struct trace_array *tr)
476{
477 int ret;
478
479 if (!tr->allocated_snapshot) {
480
481 /* allocate spare buffer */
482 ret = resize_buffer_duplicate_size(&tr->max_buffer,
483 &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
484 if (ret < 0)
485 return ret;
486
487 tr->allocated_snapshot = true;
488 }
489
490 return 0;
491}
492
493void free_snapshot(struct trace_array *tr)
494{
495 /*
 496 * We don't free the ring buffer; instead, we resize it because
 497 * the max_tr ring buffer has some state (e.g. ring->clock) and
 498 * we want to preserve it.
499 */
500 ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
501 set_buffer_entries(&tr->max_buffer, 1);
502 tracing_reset_online_cpus(&tr->max_buffer);
503 tr->allocated_snapshot = false;
504}
505
506/**
 507 * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
508 *
 509 * This is similar to tracing_snapshot(), but it will allocate the
510 * snapshot buffer if it isn't already allocated. Use this only
511 * where it is safe to sleep, as the allocation may sleep.
512 *
513 * This causes a swap between the snapshot buffer and the current live
514 * tracing buffer. You can use this to take snapshots of the live
515 * trace when some condition is triggered, but continue to trace.
516 */
517void tracing_snapshot_alloc(void)
518{
519 struct trace_array *tr = &global_trace;
520 int ret;
521
522 ret = alloc_snapshot(tr);
523 if (WARN_ON(ret < 0))
524 return;
525
526 tracing_snapshot();
527}
528EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
529#else
530void tracing_snapshot(void)
531{
532 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
533}
534EXPORT_SYMBOL_GPL(tracing_snapshot);
535void tracing_snapshot_alloc(void)
536{
537 /* Give warning */
538 tracing_snapshot();
539}
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */
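
Since tracing_snapshot_alloc() may sleep, a natural pattern is to call it once from process context (an initcall or module init, for example) so that later tracing_snapshot() calls from atomic paths find the spare buffer already allocated. A minimal sketch, with my_driver_init() as an invented example:

#include <linux/init.h>
#include <linux/kernel.h>

static int __init my_driver_init(void)
{
        /* allocates the spare buffer (if needed) and takes a first snapshot */
        tracing_snapshot_alloc();
        return 0;
}
late_initcall(my_driver_init);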
542
543/**
388 * tracing_off - turn off tracing buffers 544 * tracing_off - turn off tracing buffers
389 * 545 *
390 * This function stops the tracing buffers from recording data. 546 * This function stops the tracing buffers from recording data.
@@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);
394 */ 550 */
395void tracing_off(void) 551void tracing_off(void)
396{ 552{
397 if (global_trace.buffer) 553 if (global_trace.trace_buffer.buffer)
398 ring_buffer_record_off(global_trace.buffer); 554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
399 /* 555 /*
400 * This flag is only looked at when buffers haven't been 556 * This flag is only looked at when buffers haven't been
401 * allocated yet. We don't really care about the race 557 * allocated yet. We don't really care about the race
@@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);
411 */ 567 */
412int tracing_is_on(void) 568int tracing_is_on(void)
413{ 569{
414 if (global_trace.buffer) 570 if (global_trace.trace_buffer.buffer)
415 return ring_buffer_record_is_on(global_trace.buffer); 571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
416 return !global_trace.buffer_disabled; 572 return !global_trace.buffer_disabled;
417} 573}
418EXPORT_SYMBOL_GPL(tracing_is_on); 574EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -479,6 +635,7 @@ static const char *trace_options[] = {
479 "disable_on_free", 635 "disable_on_free",
480 "irq-info", 636 "irq-info",
481 "markers", 637 "markers",
638 "function-trace",
482 NULL 639 NULL
483}; 640};
484 641
@@ -490,6 +647,8 @@ static struct {
490 { trace_clock_local, "local", 1 }, 647 { trace_clock_local, "local", 1 },
491 { trace_clock_global, "global", 1 }, 648 { trace_clock_global, "global", 1 },
492 { trace_clock_counter, "counter", 0 }, 649 { trace_clock_counter, "counter", 0 },
650 { trace_clock_jiffies, "uptime", 1 },
651 { trace_clock, "perf", 1 },
493 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
494}; 653};
495 654
@@ -670,13 +829,14 @@ unsigned long __read_mostly tracing_max_latency;
670static void 829static void
671__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 830__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
672{ 831{
673 struct trace_array_cpu *data = tr->data[cpu]; 832 struct trace_buffer *trace_buf = &tr->trace_buffer;
674 struct trace_array_cpu *max_data; 833 struct trace_buffer *max_buf = &tr->max_buffer;
834 struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
835 struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
675 836
676 max_tr.cpu = cpu; 837 max_buf->cpu = cpu;
677 max_tr.time_start = data->preempt_timestamp; 838 max_buf->time_start = data->preempt_timestamp;
678 839
679 max_data = max_tr.data[cpu];
680 max_data->saved_latency = tracing_max_latency; 840 max_data->saved_latency = tracing_max_latency;
681 max_data->critical_start = data->critical_start; 841 max_data->critical_start = data->critical_start;
682 max_data->critical_end = data->critical_end; 842 max_data->critical_end = data->critical_end;
@@ -706,22 +866,22 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
706{ 866{
707 struct ring_buffer *buf; 867 struct ring_buffer *buf;
708 868
709 if (trace_stop_count) 869 if (tr->stop_count)
710 return; 870 return;
711 871
712 WARN_ON_ONCE(!irqs_disabled()); 872 WARN_ON_ONCE(!irqs_disabled());
713 873
714 if (!current_trace->allocated_snapshot) { 874 if (!tr->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */ 875 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace); 876 WARN_ON_ONCE(tr->current_trace != &nop_trace);
717 return; 877 return;
718 } 878 }
719 879
720 arch_spin_lock(&ftrace_max_lock); 880 arch_spin_lock(&ftrace_max_lock);
721 881
722 buf = tr->buffer; 882 buf = tr->trace_buffer.buffer;
723 tr->buffer = max_tr.buffer; 883 tr->trace_buffer.buffer = tr->max_buffer.buffer;
724 max_tr.buffer = buf; 884 tr->max_buffer.buffer = buf;
725 885
726 __update_max_tr(tr, tsk, cpu); 886 __update_max_tr(tr, tsk, cpu);
727 arch_spin_unlock(&ftrace_max_lock); 887 arch_spin_unlock(&ftrace_max_lock);
@@ -740,19 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
740{ 900{
741 int ret; 901 int ret;
742 902
743 if (trace_stop_count) 903 if (tr->stop_count)
744 return; 904 return;
745 905
746 WARN_ON_ONCE(!irqs_disabled()); 906 WARN_ON_ONCE(!irqs_disabled());
 747 if (!current_trace->allocated_snapshot) { 907 if (!tr->allocated_snapshot) {
748 /* Only the nop tracer should hit this when disabling */ 908 /* Only the nop tracer should hit this when disabling */
749 WARN_ON_ONCE(current_trace != &nop_trace); 909 WARN_ON_ONCE(tr->current_trace != &nop_trace);
750 return; 910 return;
751 } 911 }
752 912
753 arch_spin_lock(&ftrace_max_lock); 913 arch_spin_lock(&ftrace_max_lock);
754 914
755 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 915 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
756 916
757 if (ret == -EBUSY) { 917 if (ret == -EBUSY) {
758 /* 918 /*
@@ -761,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
761 * the max trace buffer (no one writes directly to it) 921 * the max trace buffer (no one writes directly to it)
762 * and flag that it failed. 922 * and flag that it failed.
763 */ 923 */
764 trace_array_printk(&max_tr, _THIS_IP_, 924 trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
765 "Failed to swap buffers due to commit in progress\n"); 925 "Failed to swap buffers due to commit in progress\n");
766 } 926 }
767 927
@@ -774,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
774 934
775static void default_wait_pipe(struct trace_iterator *iter) 935static void default_wait_pipe(struct trace_iterator *iter)
776{ 936{
777 DEFINE_WAIT(wait); 937 /* Iterators are static, they should be filled or empty */
938 if (trace_buffer_iter(iter, iter->cpu_file))
939 return;
940
941 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
942}
943
944#ifdef CONFIG_FTRACE_STARTUP_TEST
945static int run_tracer_selftest(struct tracer *type)
946{
947 struct trace_array *tr = &global_trace;
948 struct tracer *saved_tracer = tr->current_trace;
949 int ret;
778 950
779 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); 951 if (!type->selftest || tracing_selftest_disabled)
952 return 0;
780 953
781 /* 954 /*
782 * The events can happen in critical sections where 955 * Run a selftest on this tracer.
783 * checking a work queue can cause deadlocks. 956 * Here we reset the trace buffer, and set the current
784 * After adding a task to the queue, this flag is set 957 * tracer to be this tracer. The tracer can then run some
785 * only to notify events to try to wake up the queue 958 * internal tracing to verify that everything is in order.
786 * using irq_work. 959 * If we fail, we do not register this tracer.
787 *
788 * We don't clear it even if the buffer is no longer
789 * empty. The flag only causes the next event to run
790 * irq_work to do the work queue wake up. The worse
791 * that can happen if we race with !trace_empty() is that
792 * an event will cause an irq_work to try to wake up
793 * an empty queue.
794 *
795 * There's no reason to protect this flag either, as
796 * the work queue and irq_work logic will do the necessary
797 * synchronization for the wake ups. The only thing
798 * that is necessary is that the wake up happens after
799 * a task has been queued. It's OK for spurious wake ups.
800 */ 960 */
801 trace_wakeup_needed = true; 961 tracing_reset_online_cpus(&tr->trace_buffer);
802 962
803 if (trace_empty(iter)) 963 tr->current_trace = type;
804 schedule();
805 964
806 finish_wait(&trace_wait, &wait); 965#ifdef CONFIG_TRACER_MAX_TRACE
966 if (type->use_max_tr) {
967 /* If we expanded the buffers, make sure the max is expanded too */
968 if (ring_buffer_expanded)
969 ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
970 RING_BUFFER_ALL_CPUS);
971 tr->allocated_snapshot = true;
972 }
973#endif
974
975 /* the test is responsible for initializing and enabling */
976 pr_info("Testing tracer %s: ", type->name);
977 ret = type->selftest(type, tr);
978 /* the test is responsible for resetting too */
979 tr->current_trace = saved_tracer;
980 if (ret) {
981 printk(KERN_CONT "FAILED!\n");
982 /* Add the warning after printing 'FAILED' */
983 WARN_ON(1);
984 return -1;
985 }
986 /* Only reset on passing, to avoid touching corrupted buffers */
987 tracing_reset_online_cpus(&tr->trace_buffer);
988
989#ifdef CONFIG_TRACER_MAX_TRACE
990 if (type->use_max_tr) {
991 tr->allocated_snapshot = false;
992
993 /* Shrink the max buffer again */
994 if (ring_buffer_expanded)
995 ring_buffer_resize(tr->max_buffer.buffer, 1,
996 RING_BUFFER_ALL_CPUS);
997 }
998#endif
999
1000 printk(KERN_CONT "PASSED\n");
1001 return 0;
1002}
1003#else
1004static inline int run_tracer_selftest(struct tracer *type)
1005{
1006 return 0;
807} 1007}
1008#endif /* CONFIG_FTRACE_STARTUP_TEST */
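
For reference, a tracer opts into this path simply by filling in ->selftest; register_tracer() then hands it to run_tracer_selftest(), which installs the tracer on the global trace_array for the duration of the test. The sketch below is illustrative only (my_tracer and its callbacks are invented), assumes the usual struct tracer callback signatures, and would live next to the other tracers in kernel/trace so it can include the internal trace.h header.

#include "trace.h"                      /* kernel/trace internal header */

static int my_init(struct trace_array *tr)
{
        return 0;                       /* start whatever the tracer hooks */
}

static void my_reset(struct trace_array *tr)
{
}

#ifdef CONFIG_FTRACE_STARTUP_TEST
static int my_selftest(struct tracer *trace, struct trace_array *tr)
{
        int ret;

        ret = tracer_init(trace, tr);   /* trace.c helper: resets and calls trace->init(tr) */
        if (ret)
                return ret;
        /* ... generate some events and sanity check the buffer here ... */
        trace->reset(tr);
        return 0;
}
#endif

static struct tracer my_tracer = {
        .name           = "my_tracer",
        .init           = my_init,
        .reset          = my_reset,
#ifdef CONFIG_FTRACE_STARTUP_TEST
        .selftest       = my_selftest,
#endif
};

An __init function would then pass &my_tracer to register_tracer() to trigger the selftest at boot.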
808 1009
809/** 1010/**
810 * register_tracer - register a tracer with the ftrace system. 1011 * register_tracer - register a tracer with the ftrace system.
@@ -851,57 +1052,9 @@ int register_tracer(struct tracer *type)
851 if (!type->wait_pipe) 1052 if (!type->wait_pipe)
852 type->wait_pipe = default_wait_pipe; 1053 type->wait_pipe = default_wait_pipe;
853 1054
854 1055 ret = run_tracer_selftest(type);
855#ifdef CONFIG_FTRACE_STARTUP_TEST 1056 if (ret < 0)
856 if (type->selftest && !tracing_selftest_disabled) { 1057 goto out;
857 struct tracer *saved_tracer = current_trace;
858 struct trace_array *tr = &global_trace;
859
860 /*
861 * Run a selftest on this tracer.
862 * Here we reset the trace buffer, and set the current
863 * tracer to be this tracer. The tracer can then run some
864 * internal tracing to verify that everything is in order.
865 * If we fail, we do not register this tracer.
866 */
867 tracing_reset_online_cpus(tr);
868
869 current_trace = type;
870
871 if (type->use_max_tr) {
872 /* If we expanded the buffers, make sure the max is expanded too */
873 if (ring_buffer_expanded)
874 ring_buffer_resize(max_tr.buffer, trace_buf_size,
875 RING_BUFFER_ALL_CPUS);
876 type->allocated_snapshot = true;
877 }
878
879 /* the test is responsible for initializing and enabling */
880 pr_info("Testing tracer %s: ", type->name);
881 ret = type->selftest(type, tr);
882 /* the test is responsible for resetting too */
883 current_trace = saved_tracer;
884 if (ret) {
885 printk(KERN_CONT "FAILED!\n");
886 /* Add the warning after printing 'FAILED' */
887 WARN_ON(1);
888 goto out;
889 }
890 /* Only reset on passing, to avoid touching corrupted buffers */
891 tracing_reset_online_cpus(tr);
892
893 if (type->use_max_tr) {
894 type->allocated_snapshot = false;
895
896 /* Shrink the max buffer again */
897 if (ring_buffer_expanded)
898 ring_buffer_resize(max_tr.buffer, 1,
899 RING_BUFFER_ALL_CPUS);
900 }
901
902 printk(KERN_CONT "PASSED\n");
903 }
904#endif
905 1058
906 type->next = trace_types; 1059 type->next = trace_types;
907 trace_types = type; 1060 trace_types = type;
@@ -921,7 +1074,7 @@ int register_tracer(struct tracer *type)
921 tracing_set_tracer(type->name); 1074 tracing_set_tracer(type->name);
922 default_bootup_tracer = NULL; 1075 default_bootup_tracer = NULL;
923 /* disable other selftests, since this will break it. */ 1076 /* disable other selftests, since this will break it. */
924 tracing_selftest_disabled = 1; 1077 tracing_selftest_disabled = true;
925#ifdef CONFIG_FTRACE_STARTUP_TEST 1078#ifdef CONFIG_FTRACE_STARTUP_TEST
926 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", 1079 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
927 type->name); 1080 type->name);
@@ -931,9 +1084,9 @@ int register_tracer(struct tracer *type)
931 return ret; 1084 return ret;
932} 1085}
933 1086
934void tracing_reset(struct trace_array *tr, int cpu) 1087void tracing_reset(struct trace_buffer *buf, int cpu)
935{ 1088{
936 struct ring_buffer *buffer = tr->buffer; 1089 struct ring_buffer *buffer = buf->buffer;
937 1090
938 if (!buffer) 1091 if (!buffer)
939 return; 1092 return;
@@ -947,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
947 ring_buffer_record_enable(buffer); 1100 ring_buffer_record_enable(buffer);
948} 1101}
949 1102
950void tracing_reset_online_cpus(struct trace_array *tr) 1103void tracing_reset_online_cpus(struct trace_buffer *buf)
951{ 1104{
952 struct ring_buffer *buffer = tr->buffer; 1105 struct ring_buffer *buffer = buf->buffer;
953 int cpu; 1106 int cpu;
954 1107
955 if (!buffer) 1108 if (!buffer)
@@ -960,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
960 /* Make sure all commits have finished */ 1113 /* Make sure all commits have finished */
961 synchronize_sched(); 1114 synchronize_sched();
962 1115
963 tr->time_start = ftrace_now(tr->cpu); 1116 buf->time_start = ftrace_now(buf->cpu);
964 1117
965 for_each_online_cpu(cpu) 1118 for_each_online_cpu(cpu)
966 ring_buffer_reset_cpu(buffer, cpu); 1119 ring_buffer_reset_cpu(buffer, cpu);
@@ -970,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)
970 1123
971void tracing_reset_current(int cpu) 1124void tracing_reset_current(int cpu)
972{ 1125{
973 tracing_reset(&global_trace, cpu); 1126 tracing_reset(&global_trace.trace_buffer, cpu);
974} 1127}
975 1128
976void tracing_reset_current_online_cpus(void) 1129void tracing_reset_all_online_cpus(void)
977{ 1130{
978 tracing_reset_online_cpus(&global_trace); 1131 struct trace_array *tr;
1132
1133 mutex_lock(&trace_types_lock);
1134 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1135 tracing_reset_online_cpus(&tr->trace_buffer);
1136#ifdef CONFIG_TRACER_MAX_TRACE
1137 tracing_reset_online_cpus(&tr->max_buffer);
1138#endif
1139 }
1140 mutex_unlock(&trace_types_lock);
979} 1141}
980 1142
981#define SAVED_CMDLINES 128 1143#define SAVED_CMDLINES 128
@@ -998,7 +1160,7 @@ static void trace_init_cmdlines(void)
998 1160
999int is_tracing_stopped(void) 1161int is_tracing_stopped(void)
1000{ 1162{
1001 return trace_stop_count; 1163 return global_trace.stop_count;
1002} 1164}
1003 1165
1004/** 1166/**
@@ -1030,12 +1192,12 @@ void tracing_start(void)
1030 if (tracing_disabled) 1192 if (tracing_disabled)
1031 return; 1193 return;
1032 1194
1033 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1195 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1034 if (--trace_stop_count) { 1196 if (--global_trace.stop_count) {
1035 if (trace_stop_count < 0) { 1197 if (global_trace.stop_count < 0) {
1036 /* Someone screwed up their debugging */ 1198 /* Someone screwed up their debugging */
1037 WARN_ON_ONCE(1); 1199 WARN_ON_ONCE(1);
1038 trace_stop_count = 0; 1200 global_trace.stop_count = 0;
1039 } 1201 }
1040 goto out; 1202 goto out;
1041 } 1203 }
@@ -1043,19 +1205,52 @@ void tracing_start(void)
1043 /* Prevent the buffers from switching */ 1205 /* Prevent the buffers from switching */
1044 arch_spin_lock(&ftrace_max_lock); 1206 arch_spin_lock(&ftrace_max_lock);
1045 1207
1046 buffer = global_trace.buffer; 1208 buffer = global_trace.trace_buffer.buffer;
1047 if (buffer) 1209 if (buffer)
1048 ring_buffer_record_enable(buffer); 1210 ring_buffer_record_enable(buffer);
1049 1211
1050 buffer = max_tr.buffer; 1212#ifdef CONFIG_TRACER_MAX_TRACE
1213 buffer = global_trace.max_buffer.buffer;
1051 if (buffer) 1214 if (buffer)
1052 ring_buffer_record_enable(buffer); 1215 ring_buffer_record_enable(buffer);
1216#endif
1053 1217
1054 arch_spin_unlock(&ftrace_max_lock); 1218 arch_spin_unlock(&ftrace_max_lock);
1055 1219
1056 ftrace_start(); 1220 ftrace_start();
1057 out: 1221 out:
1058 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1222 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1223}
1224
1225static void tracing_start_tr(struct trace_array *tr)
1226{
1227 struct ring_buffer *buffer;
1228 unsigned long flags;
1229
1230 if (tracing_disabled)
1231 return;
1232
1233 /* If global, we need to also start the max tracer */
1234 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1235 return tracing_start();
1236
1237 raw_spin_lock_irqsave(&tr->start_lock, flags);
1238
1239 if (--tr->stop_count) {
1240 if (tr->stop_count < 0) {
1241 /* Someone screwed up their debugging */
1242 WARN_ON_ONCE(1);
1243 tr->stop_count = 0;
1244 }
1245 goto out;
1246 }
1247
1248 buffer = tr->trace_buffer.buffer;
1249 if (buffer)
1250 ring_buffer_record_enable(buffer);
1251
1252 out:
1253 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1059} 1254}
1060 1255
1061/** 1256/**
@@ -1070,25 +1265,48 @@ void tracing_stop(void)
1070 unsigned long flags; 1265 unsigned long flags;
1071 1266
1072 ftrace_stop(); 1267 ftrace_stop();
1073 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1268 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1074 if (trace_stop_count++) 1269 if (global_trace.stop_count++)
1075 goto out; 1270 goto out;
1076 1271
1077 /* Prevent the buffers from switching */ 1272 /* Prevent the buffers from switching */
1078 arch_spin_lock(&ftrace_max_lock); 1273 arch_spin_lock(&ftrace_max_lock);
1079 1274
1080 buffer = global_trace.buffer; 1275 buffer = global_trace.trace_buffer.buffer;
1081 if (buffer) 1276 if (buffer)
1082 ring_buffer_record_disable(buffer); 1277 ring_buffer_record_disable(buffer);
1083 1278
1084 buffer = max_tr.buffer; 1279#ifdef CONFIG_TRACER_MAX_TRACE
1280 buffer = global_trace.max_buffer.buffer;
1085 if (buffer) 1281 if (buffer)
1086 ring_buffer_record_disable(buffer); 1282 ring_buffer_record_disable(buffer);
1283#endif
1087 1284
1088 arch_spin_unlock(&ftrace_max_lock); 1285 arch_spin_unlock(&ftrace_max_lock);
1089 1286
1090 out: 1287 out:
1091 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1288 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1289}
1290
1291static void tracing_stop_tr(struct trace_array *tr)
1292{
1293 struct ring_buffer *buffer;
1294 unsigned long flags;
1295
1296 /* If global, we need to also stop the max tracer */
1297 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1298 return tracing_stop();
1299
1300 raw_spin_lock_irqsave(&tr->start_lock, flags);
1301 if (tr->stop_count++)
1302 goto out;
1303
1304 buffer = tr->trace_buffer.buffer;
1305 if (buffer)
1306 ring_buffer_record_disable(buffer);
1307
1308 out:
1309 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1092} 1310}
1093 1311
1094void trace_stop_cmdline_recording(void); 1312void trace_stop_cmdline_recording(void);
@@ -1221,11 +1439,6 @@ void
1221__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) 1439__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1222{ 1440{
1223 __this_cpu_write(trace_cmdline_save, true); 1441 __this_cpu_write(trace_cmdline_save, true);
1224 if (trace_wakeup_needed) {
1225 trace_wakeup_needed = false;
1226 /* irq_work_queue() supplies it's own memory barriers */
1227 irq_work_queue(&trace_work_wakeup);
1228 }
1229 ring_buffer_unlock_commit(buffer, event); 1442 ring_buffer_unlock_commit(buffer, event);
1230} 1443}
1231 1444
@@ -1249,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1249EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1462EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1250 1463
1251struct ring_buffer_event * 1464struct ring_buffer_event *
1465trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1466 struct ftrace_event_file *ftrace_file,
1467 int type, unsigned long len,
1468 unsigned long flags, int pc)
1469{
1470 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1471 return trace_buffer_lock_reserve(*current_rb,
1472 type, len, flags, pc);
1473}
1474EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1475
1476struct ring_buffer_event *
1252trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1477trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1253 int type, unsigned long len, 1478 int type, unsigned long len,
1254 unsigned long flags, int pc) 1479 unsigned long flags, int pc)
1255{ 1480{
1256 *current_rb = global_trace.buffer; 1481 *current_rb = global_trace.trace_buffer.buffer;
1257 return trace_buffer_lock_reserve(*current_rb, 1482 return trace_buffer_lock_reserve(*current_rb,
1258 type, len, flags, pc); 1483 type, len, flags, pc);
1259} 1484}
@@ -1292,7 +1517,7 @@ trace_function(struct trace_array *tr,
1292 int pc) 1517 int pc)
1293{ 1518{
1294 struct ftrace_event_call *call = &event_function; 1519 struct ftrace_event_call *call = &event_function;
1295 struct ring_buffer *buffer = tr->buffer; 1520 struct ring_buffer *buffer = tr->trace_buffer.buffer;
1296 struct ring_buffer_event *event; 1521 struct ring_buffer_event *event;
1297 struct ftrace_entry *entry; 1522 struct ftrace_entry *entry;
1298 1523
@@ -1433,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1433void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1658void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1434 int pc) 1659 int pc)
1435{ 1660{
1436 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); 1661 __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
1437} 1662}
1438 1663
1439/** 1664/**
1440 * trace_dump_stack - record a stack back trace in the trace buffer 1665 * trace_dump_stack - record a stack back trace in the trace buffer
1666 * @skip: Number of functions to skip (helper handlers)
1441 */ 1667 */
1442void trace_dump_stack(void) 1668void trace_dump_stack(int skip)
1443{ 1669{
1444 unsigned long flags; 1670 unsigned long flags;
1445 1671
@@ -1448,8 +1674,13 @@ void trace_dump_stack(void)
1448 1674
1449 local_save_flags(flags); 1675 local_save_flags(flags);
1450 1676
1451 /* skipping 3 traces, seems to get us at the caller of this function */ 1677 /*
1452 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); 1678 * Skip 3 more, seems to get us at the caller of
1679 * this function.
1680 */
1681 skip += 3;
1682 __ftrace_trace_stack(global_trace.trace_buffer.buffer,
1683 flags, skip, preempt_count(), NULL);
1453} 1684}
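
With the new @skip argument, a caller that wants the dump to start at itself passes 0, while a wrapper skips its own frame. A small illustrative sketch (my_debug_helper() is not part of the patch):

static noinline void my_debug_helper(void)
{
        /* skip my_debug_helper() itself so the dump starts at its caller */
        trace_dump_stack(1);
}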
1454 1685
1455static DEFINE_PER_CPU(int, user_stack_count); 1686static DEFINE_PER_CPU(int, user_stack_count);
@@ -1619,7 +1850,7 @@ void trace_printk_init_buffers(void)
1619 * directly here. If the global_trace.buffer is already 1850 * directly here. If the global_trace.buffer is already
1620 * allocated here, then this was called by module code. 1851 * allocated here, then this was called by module code.
1621 */ 1852 */
1622 if (global_trace.buffer) 1853 if (global_trace.trace_buffer.buffer)
1623 tracing_start_cmdline_record(); 1854 tracing_start_cmdline_record();
1624} 1855}
1625 1856
@@ -1679,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1679 1910
1680 local_save_flags(flags); 1911 local_save_flags(flags);
1681 size = sizeof(*entry) + sizeof(u32) * len; 1912 size = sizeof(*entry) + sizeof(u32) * len;
1682 buffer = tr->buffer; 1913 buffer = tr->trace_buffer.buffer;
1683 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1914 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1684 flags, pc); 1915 flags, pc);
1685 if (!event) 1916 if (!event)
@@ -1702,27 +1933,12 @@ out:
1702} 1933}
1703EXPORT_SYMBOL_GPL(trace_vbprintk); 1934EXPORT_SYMBOL_GPL(trace_vbprintk);
1704 1935
1705int trace_array_printk(struct trace_array *tr, 1936static int
1706 unsigned long ip, const char *fmt, ...) 1937__trace_array_vprintk(struct ring_buffer *buffer,
1707{ 1938 unsigned long ip, const char *fmt, va_list args)
1708 int ret;
1709 va_list ap;
1710
1711 if (!(trace_flags & TRACE_ITER_PRINTK))
1712 return 0;
1713
1714 va_start(ap, fmt);
1715 ret = trace_array_vprintk(tr, ip, fmt, ap);
1716 va_end(ap);
1717 return ret;
1718}
1719
1720int trace_array_vprintk(struct trace_array *tr,
1721 unsigned long ip, const char *fmt, va_list args)
1722{ 1939{
1723 struct ftrace_event_call *call = &event_print; 1940 struct ftrace_event_call *call = &event_print;
1724 struct ring_buffer_event *event; 1941 struct ring_buffer_event *event;
1725 struct ring_buffer *buffer;
1726 int len = 0, size, pc; 1942 int len = 0, size, pc;
1727 struct print_entry *entry; 1943 struct print_entry *entry;
1728 unsigned long flags; 1944 unsigned long flags;
@@ -1750,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,
1750 1966
1751 local_save_flags(flags); 1967 local_save_flags(flags);
1752 size = sizeof(*entry) + len + 1; 1968 size = sizeof(*entry) + len + 1;
1753 buffer = tr->buffer;
1754 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1969 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1755 flags, pc); 1970 flags, pc);
1756 if (!event) 1971 if (!event)
@@ -1771,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,
1771 return len; 1986 return len;
1772} 1987}
1773 1988
1989int trace_array_vprintk(struct trace_array *tr,
1990 unsigned long ip, const char *fmt, va_list args)
1991{
1992 return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
1993}
1994
1995int trace_array_printk(struct trace_array *tr,
1996 unsigned long ip, const char *fmt, ...)
1997{
1998 int ret;
1999 va_list ap;
2000
2001 if (!(trace_flags & TRACE_ITER_PRINTK))
2002 return 0;
2003
2004 va_start(ap, fmt);
2005 ret = trace_array_vprintk(tr, ip, fmt, ap);
2006 va_end(ap);
2007 return ret;
2008}
2009
2010int trace_array_printk_buf(struct ring_buffer *buffer,
2011 unsigned long ip, const char *fmt, ...)
2012{
2013 int ret;
2014 va_list ap;
2015
2016 if (!(trace_flags & TRACE_ITER_PRINTK))
2017 return 0;
2018
2019 va_start(ap, fmt);
2020 ret = __trace_array_vprintk(buffer, ip, fmt, ap);
2021 va_end(ap);
2022 return ret;
2023}
2024
1774int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 2025int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1775{ 2026{
1776 return trace_array_vprintk(&global_trace, ip, fmt, args); 2027 return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -1796,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1796 if (buf_iter) 2047 if (buf_iter)
1797 event = ring_buffer_iter_peek(buf_iter, ts); 2048 event = ring_buffer_iter_peek(buf_iter, ts);
1798 else 2049 else
1799 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 2050 event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
1800 lost_events); 2051 lost_events);
1801 2052
1802 if (event) { 2053 if (event) {
@@ -1811,7 +2062,7 @@ static struct trace_entry *
1811__find_next_entry(struct trace_iterator *iter, int *ent_cpu, 2062__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1812 unsigned long *missing_events, u64 *ent_ts) 2063 unsigned long *missing_events, u64 *ent_ts)
1813{ 2064{
1814 struct ring_buffer *buffer = iter->tr->buffer; 2065 struct ring_buffer *buffer = iter->trace_buffer->buffer;
1815 struct trace_entry *ent, *next = NULL; 2066 struct trace_entry *ent, *next = NULL;
1816 unsigned long lost_events = 0, next_lost = 0; 2067 unsigned long lost_events = 0, next_lost = 0;
1817 int cpu_file = iter->cpu_file; 2068 int cpu_file = iter->cpu_file;
@@ -1824,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1824 * If we are in a per_cpu trace file, don't bother by iterating over 2075 * If we are in a per_cpu trace file, don't bother by iterating over
1825 * all cpu and peek directly. 2076 * all cpu and peek directly.
1826 */ 2077 */
1827 if (cpu_file > TRACE_PIPE_ALL_CPU) { 2078 if (cpu_file > RING_BUFFER_ALL_CPUS) {
1828 if (ring_buffer_empty_cpu(buffer, cpu_file)) 2079 if (ring_buffer_empty_cpu(buffer, cpu_file))
1829 return NULL; 2080 return NULL;
1830 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); 2081 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1888,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1888 2139
1889static void trace_consume(struct trace_iterator *iter) 2140static void trace_consume(struct trace_iterator *iter)
1890{ 2141{
1891 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 2142 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
1892 &iter->lost_events); 2143 &iter->lost_events);
1893} 2144}
1894 2145
@@ -1921,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1921 2172
1922void tracing_iter_reset(struct trace_iterator *iter, int cpu) 2173void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1923{ 2174{
1924 struct trace_array *tr = iter->tr;
1925 struct ring_buffer_event *event; 2175 struct ring_buffer_event *event;
1926 struct ring_buffer_iter *buf_iter; 2176 struct ring_buffer_iter *buf_iter;
1927 unsigned long entries = 0; 2177 unsigned long entries = 0;
1928 u64 ts; 2178 u64 ts;
1929 2179
1930 tr->data[cpu]->skipped_entries = 0; 2180 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
1931 2181
1932 buf_iter = trace_buffer_iter(iter, cpu); 2182 buf_iter = trace_buffer_iter(iter, cpu);
1933 if (!buf_iter) 2183 if (!buf_iter)
@@ -1941,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1941 * by the timestamp being before the start of the buffer. 2191 * by the timestamp being before the start of the buffer.
1942 */ 2192 */
1943 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { 2193 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1944 if (ts >= iter->tr->time_start) 2194 if (ts >= iter->trace_buffer->time_start)
1945 break; 2195 break;
1946 entries++; 2196 entries++;
1947 ring_buffer_read(buf_iter, NULL); 2197 ring_buffer_read(buf_iter, NULL);
1948 } 2198 }
1949 2199
1950 tr->data[cpu]->skipped_entries = entries; 2200 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
1951} 2201}
1952 2202
1953/* 2203/*
@@ -1957,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1957static void *s_start(struct seq_file *m, loff_t *pos) 2207static void *s_start(struct seq_file *m, loff_t *pos)
1958{ 2208{
1959 struct trace_iterator *iter = m->private; 2209 struct trace_iterator *iter = m->private;
2210 struct trace_array *tr = iter->tr;
1960 int cpu_file = iter->cpu_file; 2211 int cpu_file = iter->cpu_file;
1961 void *p = NULL; 2212 void *p = NULL;
1962 loff_t l = 0; 2213 loff_t l = 0;
@@ -1969,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1969 * will point to the same string as current_trace->name. 2220 * will point to the same string as current_trace->name.
1970 */ 2221 */
1971 mutex_lock(&trace_types_lock); 2222 mutex_lock(&trace_types_lock);
1972 if (unlikely(current_trace && iter->trace->name != current_trace->name)) 2223 if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
1973 *iter->trace = *current_trace; 2224 *iter->trace = *tr->current_trace;
1974 mutex_unlock(&trace_types_lock); 2225 mutex_unlock(&trace_types_lock);
1975 2226
2227#ifdef CONFIG_TRACER_MAX_TRACE
1976 if (iter->snapshot && iter->trace->use_max_tr) 2228 if (iter->snapshot && iter->trace->use_max_tr)
1977 return ERR_PTR(-EBUSY); 2229 return ERR_PTR(-EBUSY);
2230#endif
1978 2231
1979 if (!iter->snapshot) 2232 if (!iter->snapshot)
1980 atomic_inc(&trace_record_cmdline_disabled); 2233 atomic_inc(&trace_record_cmdline_disabled);
@@ -1984,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1984 iter->cpu = 0; 2237 iter->cpu = 0;
1985 iter->idx = -1; 2238 iter->idx = -1;
1986 2239
1987 if (cpu_file == TRACE_PIPE_ALL_CPU) { 2240 if (cpu_file == RING_BUFFER_ALL_CPUS) {
1988 for_each_tracing_cpu(cpu) 2241 for_each_tracing_cpu(cpu)
1989 tracing_iter_reset(iter, cpu); 2242 tracing_iter_reset(iter, cpu);
1990 } else 2243 } else
@@ -2016,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)
2016{ 2269{
2017 struct trace_iterator *iter = m->private; 2270 struct trace_iterator *iter = m->private;
2018 2271
2272#ifdef CONFIG_TRACER_MAX_TRACE
2019 if (iter->snapshot && iter->trace->use_max_tr) 2273 if (iter->snapshot && iter->trace->use_max_tr)
2020 return; 2274 return;
2275#endif
2021 2276
2022 if (!iter->snapshot) 2277 if (!iter->snapshot)
2023 atomic_dec(&trace_record_cmdline_disabled); 2278 atomic_dec(&trace_record_cmdline_disabled);
2279
2024 trace_access_unlock(iter->cpu_file); 2280 trace_access_unlock(iter->cpu_file);
2025 trace_event_read_unlock(); 2281 trace_event_read_unlock();
2026} 2282}
2027 2283
2028static void 2284static void
2029get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) 2285get_total_entries(struct trace_buffer *buf,
2286 unsigned long *total, unsigned long *entries)
2030{ 2287{
2031 unsigned long count; 2288 unsigned long count;
2032 int cpu; 2289 int cpu;
@@ -2035,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
2035 *entries = 0; 2292 *entries = 0;
2036 2293
2037 for_each_tracing_cpu(cpu) { 2294 for_each_tracing_cpu(cpu) {
2038 count = ring_buffer_entries_cpu(tr->buffer, cpu); 2295 count = ring_buffer_entries_cpu(buf->buffer, cpu);
2039 /* 2296 /*
2040 * If this buffer has skipped entries, then we hold all 2297 * If this buffer has skipped entries, then we hold all
2041 * entries for the trace and we need to ignore the 2298 * entries for the trace and we need to ignore the
2042 * ones before the time stamp. 2299 * ones before the time stamp.
2043 */ 2300 */
2044 if (tr->data[cpu]->skipped_entries) { 2301 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
2045 count -= tr->data[cpu]->skipped_entries; 2302 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
2046 /* total is the same as the entries */ 2303 /* total is the same as the entries */
2047 *total += count; 2304 *total += count;
2048 } else 2305 } else
2049 *total += count + 2306 *total += count +
2050 ring_buffer_overrun_cpu(tr->buffer, cpu); 2307 ring_buffer_overrun_cpu(buf->buffer, cpu);
2051 *entries += count; 2308 *entries += count;
2052 } 2309 }
2053} 2310}
@@ -2064,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)
2064 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2321 seq_puts(m, "# \\ / ||||| \\ | / \n");
2065} 2322}
2066 2323
2067static void print_event_info(struct trace_array *tr, struct seq_file *m) 2324static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2068{ 2325{
2069 unsigned long total; 2326 unsigned long total;
2070 unsigned long entries; 2327 unsigned long entries;
2071 2328
2072 get_total_entries(tr, &total, &entries); 2329 get_total_entries(buf, &total, &entries);
2073 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", 2330 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
2074 entries, total, num_online_cpus()); 2331 entries, total, num_online_cpus());
2075 seq_puts(m, "#\n"); 2332 seq_puts(m, "#\n");
2076} 2333}
2077 2334
2078static void print_func_help_header(struct trace_array *tr, struct seq_file *m) 2335static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2079{ 2336{
2080 print_event_info(tr, m); 2337 print_event_info(buf, m);
2081 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2338 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2082 seq_puts(m, "# | | | | |\n"); 2339 seq_puts(m, "# | | | | |\n");
2083} 2340}
2084 2341
2085static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) 2342static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2086{ 2343{
2087 print_event_info(tr, m); 2344 print_event_info(buf, m);
2088 seq_puts(m, "# _-----=> irqs-off\n"); 2345 seq_puts(m, "# _-----=> irqs-off\n");
2089 seq_puts(m, "# / _----=> need-resched\n"); 2346 seq_puts(m, "# / _----=> need-resched\n");
2090 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2347 seq_puts(m, "# | / _---=> hardirq/softirq\n");
@@ -2098,16 +2355,16 @@ void
2098print_trace_header(struct seq_file *m, struct trace_iterator *iter) 2355print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2099{ 2356{
2100 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 2357 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
2101 struct trace_array *tr = iter->tr; 2358 struct trace_buffer *buf = iter->trace_buffer;
2102 struct trace_array_cpu *data = tr->data[tr->cpu]; 2359 struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
2103 struct tracer *type = current_trace; 2360 struct tracer *type = iter->trace;
2104 unsigned long entries; 2361 unsigned long entries;
2105 unsigned long total; 2362 unsigned long total;
2106 const char *name = "preemption"; 2363 const char *name = "preemption";
2107 2364
2108 name = type->name; 2365 name = type->name;
2109 2366
2110 get_total_entries(tr, &total, &entries); 2367 get_total_entries(buf, &total, &entries);
2111 2368
2112 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 2369 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
2113 name, UTS_RELEASE); 2370 name, UTS_RELEASE);
@@ -2118,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2118 nsecs_to_usecs(data->saved_latency), 2375 nsecs_to_usecs(data->saved_latency),
2119 entries, 2376 entries,
2120 total, 2377 total,
2121 tr->cpu, 2378 buf->cpu,
2122#if defined(CONFIG_PREEMPT_NONE) 2379#if defined(CONFIG_PREEMPT_NONE)
2123 "server", 2380 "server",
2124#elif defined(CONFIG_PREEMPT_VOLUNTARY) 2381#elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -2169,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
2169 if (cpumask_test_cpu(iter->cpu, iter->started)) 2426 if (cpumask_test_cpu(iter->cpu, iter->started))
2170 return; 2427 return;
2171 2428
2172 if (iter->tr->data[iter->cpu]->skipped_entries) 2429 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
2173 return; 2430 return;
2174 2431
2175 cpumask_set_cpu(iter->cpu, iter->started); 2432 cpumask_set_cpu(iter->cpu, iter->started);
@@ -2292,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)
2292 int cpu; 2549 int cpu;
2293 2550
2294 /* If we are looking at one CPU buffer, only check that one */ 2551 /* If we are looking at one CPU buffer, only check that one */
2295 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2552 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
2296 cpu = iter->cpu_file; 2553 cpu = iter->cpu_file;
2297 buf_iter = trace_buffer_iter(iter, cpu); 2554 buf_iter = trace_buffer_iter(iter, cpu);
2298 if (buf_iter) { 2555 if (buf_iter) {
2299 if (!ring_buffer_iter_empty(buf_iter)) 2556 if (!ring_buffer_iter_empty(buf_iter))
2300 return 0; 2557 return 0;
2301 } else { 2558 } else {
2302 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2559 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2303 return 0; 2560 return 0;
2304 } 2561 }
2305 return 1; 2562 return 1;
@@ -2311,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)
2311 if (!ring_buffer_iter_empty(buf_iter)) 2568 if (!ring_buffer_iter_empty(buf_iter))
2312 return 0; 2569 return 0;
2313 } else { 2570 } else {
2314 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2571 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2315 return 0; 2572 return 0;
2316 } 2573 }
2317 } 2574 }
@@ -2335,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2335 return ret; 2592 return ret;
2336 } 2593 }
2337 2594
2595 if (iter->ent->type == TRACE_BPUTS &&
2596 trace_flags & TRACE_ITER_PRINTK &&
2597 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2598 return trace_print_bputs_msg_only(iter);
2599
2338 if (iter->ent->type == TRACE_BPRINT && 2600 if (iter->ent->type == TRACE_BPRINT &&
2339 trace_flags & TRACE_ITER_PRINTK && 2601 trace_flags & TRACE_ITER_PRINTK &&
2340 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 2602 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2389,9 +2651,9 @@ void trace_default_header(struct seq_file *m)
2389 } else { 2651 } else {
2390 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 2652 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2391 if (trace_flags & TRACE_ITER_IRQ_INFO) 2653 if (trace_flags & TRACE_ITER_IRQ_INFO)
2392 print_func_help_header_irq(iter->tr, m); 2654 print_func_help_header_irq(iter->trace_buffer, m);
2393 else 2655 else
2394 print_func_help_header(iter->tr, m); 2656 print_func_help_header(iter->trace_buffer, m);
2395 } 2657 }
2396 } 2658 }
2397} 2659}
@@ -2405,14 +2667,8 @@ static void test_ftrace_alive(struct seq_file *m)
2405} 2667}
2406 2668
2407#ifdef CONFIG_TRACER_MAX_TRACE 2669#ifdef CONFIG_TRACER_MAX_TRACE
2408static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2670static void show_snapshot_main_help(struct seq_file *m)
2409{ 2671{
2410 if (iter->trace->allocated_snapshot)
2411 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2412 else
2413 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2414
2415 seq_printf(m, "# Snapshot commands:\n");
2416 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2672 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2417 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2673 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2418 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2674 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
@@ -2420,6 +2676,35 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2420 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2676 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2421 seq_printf(m, "# is not a '0' or '1')\n"); 2677 seq_printf(m, "# is not a '0' or '1')\n");
2422} 2678}
2679
2680static void show_snapshot_percpu_help(struct seq_file *m)
2681{
2682 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2683#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2684 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2685 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
2686#else
2687 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
2688 seq_printf(m, "# Must use main snapshot file to allocate.\n");
2689#endif
2690 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
2691 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2692 seq_printf(m, "# is not a '0' or '1')\n");
2693}
2694
2695static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2696{
2697 if (iter->tr->allocated_snapshot)
2698 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2699 else
2700 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2701
2702 seq_printf(m, "# Snapshot commands:\n");
2703 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2704 show_snapshot_main_help(m);
2705 else
2706 show_snapshot_percpu_help(m);
2707}
2423#else 2708#else
2424/* Should never be called */ 2709/* Should never be called */
2425static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } 2710static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
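The two help texts above cover the main snapshot file and the per-CPU ones. Assuming debugfs is mounted at /sys/kernel/debug and the kernel has CONFIG_TRACER_SNAPSHOT plus CONFIG_RING_BUFFER_ALLOW_SWAP, taking and dumping a snapshot of a single CPU from user space looks roughly like this sketch (the per_cpu/cpu0 path follows the layout the help text implies):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *snap = "/sys/kernel/debug/tracing/per_cpu/cpu0/snapshot";
	char buf[4096];
	ssize_t n;
	int fd;

	/* "1" allocates the snapshot buffer if needed and swaps in cpu0 */
	fd = open(snap, O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror(snap);
		return 1;
	}
	close(fd);

	/* Re-open read-only to dump the frozen per-CPU snapshot. */
	fd = open(snap, O_RDONLY);
	if (fd < 0) {
		perror(snap);
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}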
@@ -2479,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {
2479static struct trace_iterator * 2764static struct trace_iterator *
2480__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2765__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2481{ 2766{
2482 long cpu_file = (long) inode->i_private; 2767 struct trace_cpu *tc = inode->i_private;
2768 struct trace_array *tr = tc->tr;
2483 struct trace_iterator *iter; 2769 struct trace_iterator *iter;
2484 int cpu; 2770 int cpu;
2485 2771
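From here on, files no longer stash a bare CPU number in inode->i_private; they store a small descriptor naming both the owning trace_array (instance) and the CPU. A sketch of the shape this hunk relies on; the real struct trace_cpu may carry additional members, and the helper below is only illustrative:

/*
 * Sketch: the per-file descriptor used by the per-instance debugfs
 * entries.  'cpu' is a CPU id or RING_BUFFER_ALL_CPUS.
 */
struct trace_cpu {
	struct trace_array	*tr;
	int			cpu;
};

static struct trace_array *instance_of(struct inode *inode)
{
	struct trace_cpu *tc = inode->i_private;

	return tc->tr;		/* any file can now find its instance */
}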
@@ -2504,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2504 if (!iter->trace) 2790 if (!iter->trace)
2505 goto fail; 2791 goto fail;
2506 2792
2507 *iter->trace = *current_trace; 2793 *iter->trace = *tr->current_trace;
2508 2794
2509 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2795 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2510 goto fail; 2796 goto fail;
2511 2797
2512 if (current_trace->print_max || snapshot) 2798 iter->tr = tr;
2513 iter->tr = &max_tr; 2799
2800#ifdef CONFIG_TRACER_MAX_TRACE
2801 /* Currently only the top directory has a snapshot */
2802 if (tr->current_trace->print_max || snapshot)
2803 iter->trace_buffer = &tr->max_buffer;
2514 else 2804 else
2515 iter->tr = &global_trace; 2805#endif
2806 iter->trace_buffer = &tr->trace_buffer;
2516 iter->snapshot = snapshot; 2807 iter->snapshot = snapshot;
2517 iter->pos = -1; 2808 iter->pos = -1;
2518 mutex_init(&iter->mutex); 2809 mutex_init(&iter->mutex);
2519 iter->cpu_file = cpu_file; 2810 iter->cpu_file = tc->cpu;
2520 2811
2521 /* Notify the tracer early; before we stop tracing. */ 2812 /* Notify the tracer early; before we stop tracing. */
2522 if (iter->trace && iter->trace->open) 2813 if (iter->trace && iter->trace->open)
2523 iter->trace->open(iter); 2814 iter->trace->open(iter);
2524 2815
2525 /* Annotate start of buffers if we had overruns */ 2816 /* Annotate start of buffers if we had overruns */
2526 if (ring_buffer_overruns(iter->tr->buffer)) 2817 if (ring_buffer_overruns(iter->trace_buffer->buffer))
2527 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2818 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2528 2819
2529 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2820 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -2532,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2532 2823
2533 /* stop the trace while dumping if we are not opening "snapshot" */ 2824 /* stop the trace while dumping if we are not opening "snapshot" */
2534 if (!iter->snapshot) 2825 if (!iter->snapshot)
2535 tracing_stop(); 2826 tracing_stop_tr(tr);
2536 2827
2537 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2828 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
2538 for_each_tracing_cpu(cpu) { 2829 for_each_tracing_cpu(cpu) {
2539 iter->buffer_iter[cpu] = 2830 iter->buffer_iter[cpu] =
2540 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2831 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2541 } 2832 }
2542 ring_buffer_read_prepare_sync(); 2833 ring_buffer_read_prepare_sync();
2543 for_each_tracing_cpu(cpu) { 2834 for_each_tracing_cpu(cpu) {
@@ -2547,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2547 } else { 2838 } else {
2548 cpu = iter->cpu_file; 2839 cpu = iter->cpu_file;
2549 iter->buffer_iter[cpu] = 2840 iter->buffer_iter[cpu] =
2550 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2841 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2551 ring_buffer_read_prepare_sync(); 2842 ring_buffer_read_prepare_sync();
2552 ring_buffer_read_start(iter->buffer_iter[cpu]); 2843 ring_buffer_read_start(iter->buffer_iter[cpu]);
2553 tracing_iter_reset(iter, cpu); 2844 tracing_iter_reset(iter, cpu);
2554 } 2845 }
2555 2846
2847 tr->ref++;
2848
2556 mutex_unlock(&trace_types_lock); 2849 mutex_unlock(&trace_types_lock);
2557 2850
2558 return iter; 2851 return iter;
@@ -2579,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)
2579{ 2872{
2580 struct seq_file *m = file->private_data; 2873 struct seq_file *m = file->private_data;
2581 struct trace_iterator *iter; 2874 struct trace_iterator *iter;
2875 struct trace_array *tr;
2582 int cpu; 2876 int cpu;
2583 2877
2584 if (!(file->f_mode & FMODE_READ)) 2878 if (!(file->f_mode & FMODE_READ))
2585 return 0; 2879 return 0;
2586 2880
2587 iter = m->private; 2881 iter = m->private;
2882 tr = iter->tr;
2588 2883
2589 mutex_lock(&trace_types_lock); 2884 mutex_lock(&trace_types_lock);
2885
2886 WARN_ON(!tr->ref);
2887 tr->ref--;
2888
2590 for_each_tracing_cpu(cpu) { 2889 for_each_tracing_cpu(cpu) {
2591 if (iter->buffer_iter[cpu]) 2890 if (iter->buffer_iter[cpu])
2592 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2891 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2597,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2597 2896
2598 if (!iter->snapshot) 2897 if (!iter->snapshot)
2599 /* reenable tracing if it was previously enabled */ 2898 /* reenable tracing if it was previously enabled */
2600 tracing_start(); 2899 tracing_start_tr(tr);
2601 mutex_unlock(&trace_types_lock); 2900 mutex_unlock(&trace_types_lock);
2602 2901
2603 mutex_destroy(&iter->mutex); 2902 mutex_destroy(&iter->mutex);
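Open now pins the instance (tr->ref++) and release drops the pin under the same trace_types_lock, which is what later allows an instance directory to refuse removal while files are open. A minimal sketch of the pairing; the helper names are invented here, only the reference manipulation comes from the hunks above:

/* Sketch: pin an instance for the lifetime of an open file. */
static void trace_array_pin(struct trace_array *tr)
{
	/* caller holds trace_types_lock */
	tr->ref++;
}

static void trace_array_unpin(struct trace_array *tr)
{
	/* caller holds trace_types_lock */
	WARN_ON(!tr->ref);
	tr->ref--;
}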
@@ -2616,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)
2616 /* If this file was open for write, then erase contents */ 2915 /* If this file was open for write, then erase contents */
2617 if ((file->f_mode & FMODE_WRITE) && 2916 if ((file->f_mode & FMODE_WRITE) &&
2618 (file->f_flags & O_TRUNC)) { 2917 (file->f_flags & O_TRUNC)) {
2619 long cpu = (long) inode->i_private; 2918 struct trace_cpu *tc = inode->i_private;
2919 struct trace_array *tr = tc->tr;
2620 2920
2621 if (cpu == TRACE_PIPE_ALL_CPU) 2921 if (tc->cpu == RING_BUFFER_ALL_CPUS)
2622 tracing_reset_online_cpus(&global_trace); 2922 tracing_reset_online_cpus(&tr->trace_buffer);
2623 else 2923 else
2624 tracing_reset(&global_trace, cpu); 2924 tracing_reset(&tr->trace_buffer, tc->cpu);
2625 } 2925 }
2626 2926
2627 if (file->f_mode & FMODE_READ) { 2927 if (file->f_mode & FMODE_READ) {
@@ -2768,8 +3068,9 @@ static ssize_t
2768tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3068tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2769 size_t count, loff_t *ppos) 3069 size_t count, loff_t *ppos)
2770{ 3070{
2771 int err, cpu; 3071 struct trace_array *tr = filp->private_data;
2772 cpumask_var_t tracing_cpumask_new; 3072 cpumask_var_t tracing_cpumask_new;
3073 int err, cpu;
2773 3074
2774 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 3075 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2775 return -ENOMEM; 3076 return -ENOMEM;
@@ -2789,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2789 */ 3090 */
2790 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3091 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2791 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3092 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2792 atomic_inc(&global_trace.data[cpu]->disabled); 3093 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2793 ring_buffer_record_disable_cpu(global_trace.buffer, cpu); 3094 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
2794 } 3095 }
2795 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3096 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2796 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3097 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2797 atomic_dec(&global_trace.data[cpu]->disabled); 3098 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2798 ring_buffer_record_enable_cpu(global_trace.buffer, cpu); 3099 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
2799 } 3100 }
2800 } 3101 }
2801 arch_spin_unlock(&ftrace_max_lock); 3102 arch_spin_unlock(&ftrace_max_lock);
@@ -2824,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {
2824static int tracing_trace_options_show(struct seq_file *m, void *v) 3125static int tracing_trace_options_show(struct seq_file *m, void *v)
2825{ 3126{
2826 struct tracer_opt *trace_opts; 3127 struct tracer_opt *trace_opts;
3128 struct trace_array *tr = m->private;
2827 u32 tracer_flags; 3129 u32 tracer_flags;
2828 int i; 3130 int i;
2829 3131
2830 mutex_lock(&trace_types_lock); 3132 mutex_lock(&trace_types_lock);
2831 tracer_flags = current_trace->flags->val; 3133 tracer_flags = tr->current_trace->flags->val;
2832 trace_opts = current_trace->flags->opts; 3134 trace_opts = tr->current_trace->flags->opts;
2833 3135
2834 for (i = 0; trace_options[i]; i++) { 3136 for (i = 0; trace_options[i]; i++) {
2835 if (trace_flags & (1 << i)) 3137 if (trace_flags & (1 << i))
@@ -2893,15 +3195,15 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
2893 return 0; 3195 return 0;
2894} 3196}
2895 3197
2896int set_tracer_flag(unsigned int mask, int enabled) 3198int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
2897{ 3199{
2898 /* do nothing if flag is already set */ 3200 /* do nothing if flag is already set */
2899 if (!!(trace_flags & mask) == !!enabled) 3201 if (!!(trace_flags & mask) == !!enabled)
2900 return 0; 3202 return 0;
2901 3203
2902 /* Give the tracer a chance to approve the change */ 3204 /* Give the tracer a chance to approve the change */
2903 if (current_trace->flag_changed) 3205 if (tr->current_trace->flag_changed)
2904 if (current_trace->flag_changed(current_trace, mask, !!enabled)) 3206 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
2905 return -EINVAL; 3207 return -EINVAL;
2906 3208
2907 if (enabled) 3209 if (enabled)
@@ -2913,9 +3215,9 @@ int set_tracer_flag(unsigned int mask, int enabled)
2913 trace_event_enable_cmd_record(enabled); 3215 trace_event_enable_cmd_record(enabled);
2914 3216
2915 if (mask == TRACE_ITER_OVERWRITE) { 3217 if (mask == TRACE_ITER_OVERWRITE) {
2916 ring_buffer_change_overwrite(global_trace.buffer, enabled); 3218 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
2917#ifdef CONFIG_TRACER_MAX_TRACE 3219#ifdef CONFIG_TRACER_MAX_TRACE
2918 ring_buffer_change_overwrite(max_tr.buffer, enabled); 3220 ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
2919#endif 3221#endif
2920 } 3222 }
2921 3223
@@ -2925,7 +3227,7 @@ int set_tracer_flag(unsigned int mask, int enabled)
2925 return 0; 3227 return 0;
2926} 3228}
2927 3229
2928static int trace_set_options(char *option) 3230static int trace_set_options(struct trace_array *tr, char *option)
2929{ 3231{
2930 char *cmp; 3232 char *cmp;
2931 int neg = 0; 3233 int neg = 0;
@@ -2943,14 +3245,14 @@ static int trace_set_options(char *option)
2943 3245
2944 for (i = 0; trace_options[i]; i++) { 3246 for (i = 0; trace_options[i]; i++) {
2945 if (strcmp(cmp, trace_options[i]) == 0) { 3247 if (strcmp(cmp, trace_options[i]) == 0) {
2946 ret = set_tracer_flag(1 << i, !neg); 3248 ret = set_tracer_flag(tr, 1 << i, !neg);
2947 break; 3249 break;
2948 } 3250 }
2949 } 3251 }
2950 3252
2951 /* If no option could be set, test the specific tracer options */ 3253 /* If no option could be set, test the specific tracer options */
2952 if (!trace_options[i]) 3254 if (!trace_options[i])
2953 ret = set_tracer_option(current_trace, cmp, neg); 3255 ret = set_tracer_option(tr->current_trace, cmp, neg);
2954 3256
2955 mutex_unlock(&trace_types_lock); 3257 mutex_unlock(&trace_types_lock);
2956 3258
@@ -2961,6 +3263,8 @@ static ssize_t
2961tracing_trace_options_write(struct file *filp, const char __user *ubuf, 3263tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2962 size_t cnt, loff_t *ppos) 3264 size_t cnt, loff_t *ppos)
2963{ 3265{
3266 struct seq_file *m = filp->private_data;
3267 struct trace_array *tr = m->private;
2964 char buf[64]; 3268 char buf[64];
2965 int ret; 3269 int ret;
2966 3270
@@ -2972,7 +3276,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2972 3276
2973 buf[cnt] = 0; 3277 buf[cnt] = 0;
2974 3278
2975 ret = trace_set_options(buf); 3279 ret = trace_set_options(tr, buf);
2976 if (ret < 0) 3280 if (ret < 0)
2977 return ret; 3281 return ret;
2978 3282
@@ -2985,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
2985{ 3289{
2986 if (tracing_disabled) 3290 if (tracing_disabled)
2987 return -ENODEV; 3291 return -ENODEV;
2988 return single_open(file, tracing_trace_options_show, NULL); 3292
3293 return single_open(file, tracing_trace_options_show, inode->i_private);
2989} 3294}
2990 3295
2991static const struct file_operations tracing_iter_fops = { 3296static const struct file_operations tracing_iter_fops = {
@@ -2998,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {
2998 3303
2999static const char readme_msg[] = 3304static const char readme_msg[] =
3000 "tracing mini-HOWTO:\n\n" 3305 "tracing mini-HOWTO:\n\n"
3001 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 3306 "# echo 0 > tracing_on : quick way to disable tracing\n"
3002 "# cat /sys/kernel/debug/tracing/available_tracers\n" 3307 "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
3003 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" 3308 " Important files:\n"
3004 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3309 " trace\t\t\t- The static contents of the buffer\n"
3005 "nop\n" 3310 "\t\t\t To clear the buffer write into this file: echo > trace\n"
3006 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" 3311 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
3007 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3312 " current_tracer\t- function and latency tracers\n"
3008 "wakeup\n" 3313 " available_tracers\t- list of configured tracers for current_tracer\n"
3009 "# cat /sys/kernel/debug/tracing/trace_options\n" 3314 " buffer_size_kb\t- view and modify size of per cpu buffer\n"
3010 "noprint-parent nosym-offset nosym-addr noverbose\n" 3315 " buffer_total_size_kb - view total size of all cpu buffers\n\n"
3011 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 3316 " trace_clock\t\t-change the clock used to order events\n"
3012 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" 3317 " local: Per cpu clock but may not be synced across CPUs\n"
3013 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 3318 " global: Synced across CPUs but slows tracing down.\n"
3014 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" 3319 " counter: Not a clock, but just an increment\n"
3320 " uptime: Jiffy counter from time of boot\n"
3321 " perf: Same clock that perf events use\n"
3322#ifdef CONFIG_X86_64
3323 " x86-tsc: TSC cycle counter\n"
3324#endif
3325 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
3326 " tracing_cpumask\t- Limit which CPUs to trace\n"
3327 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
3328 "\t\t\t Remove sub-buffer with rmdir\n"
3329 " trace_options\t\t- Set format or modify how tracing happens\n"
3330 "\t\t\t Disable an option by adding a suffix 'no' to the option name\n"
3331#ifdef CONFIG_DYNAMIC_FTRACE
3332 "\n available_filter_functions - list of functions that can be filtered on\n"
3333 " set_ftrace_filter\t- echo function name in here to only trace these functions\n"
3334 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3335 " modules: Can select a group via module\n"
3336 " Format: :mod:<module-name>\n"
3337 " example: echo :mod:ext3 > set_ftrace_filter\n"
3338 " triggers: a command to perform when function is hit\n"
3339 " Format: <function>:<trigger>[:count]\n"
3340 " trigger: traceon, traceoff\n"
3341 " enable_event:<system>:<event>\n"
3342 " disable_event:<system>:<event>\n"
3343#ifdef CONFIG_STACKTRACE
3344 " stacktrace\n"
3345#endif
3346#ifdef CONFIG_TRACER_SNAPSHOT
3347 " snapshot\n"
3348#endif
3349 " example: echo do_fault:traceoff > set_ftrace_filter\n"
3350 " echo do_trap:traceoff:3 > set_ftrace_filter\n"
3351 " The first one will disable tracing every time do_fault is hit\n"
3352 " The second will disable tracing at most 3 times when do_trap is hit\n"
3353 " The first time do trap is hit and it disables tracing, the counter\n"
3354 " will decrement to 2. If tracing is already disabled, the counter\n"
3355 " will not decrement. It only decrements when the trigger did work\n"
3356 " To remove trigger without count:\n"
3357 " echo '!<function>:<trigger> > set_ftrace_filter\n"
3358 " To remove trigger with a count:\n"
3359 " echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
3360 " set_ftrace_notrace\t- echo function name in here to never trace.\n"
3361 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3362 " modules: Can select a group via module command :mod:\n"
3363 " Does not accept triggers\n"
3364#endif /* CONFIG_DYNAMIC_FTRACE */
3365#ifdef CONFIG_FUNCTION_TRACER
3366 " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
3367#endif
3368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3369 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3370 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3371#endif
3372#ifdef CONFIG_TRACER_SNAPSHOT
3373 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3374 "\t\t\t Read the contents for more information\n"
3375#endif
3376#ifdef CONFIG_STACKTRACE
3377 " stack_trace\t\t- Shows the max stack trace when active\n"
3378 " stack_max_size\t- Shows current max stack size that was traced\n"
3379 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3380#ifdef CONFIG_DYNAMIC_FTRACE
3381 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3382#endif
3383#endif /* CONFIG_STACKTRACE */
3015; 3384;
3016 3385
3017static ssize_t 3386static ssize_t
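The rewritten mini-HOWTO is organized around the control files rather than a shell transcript. As one concrete use of the trigger syntax it documents, the sketch below arms a traceoff trigger on do_fault and removes it again; the debugfs mount point and the choice of do_fault are assumptions, not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, s, strlen(s)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *filter = "/sys/kernel/debug/tracing/set_ftrace_filter";

	/* <function>:<trigger>[:count] - stop tracing when do_fault is hit */
	if (write_str(filter, "do_fault:traceoff") < 0)
		return 1;
	/* ... run the workload being debugged ... */
	/* '!' removes a trigger that carries no count */
	return write_str(filter, "!do_fault:traceoff") < 0;
}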
@@ -3083,11 +3452,12 @@ static ssize_t
3083tracing_set_trace_read(struct file *filp, char __user *ubuf, 3452tracing_set_trace_read(struct file *filp, char __user *ubuf,
3084 size_t cnt, loff_t *ppos) 3453 size_t cnt, loff_t *ppos)
3085{ 3454{
3455 struct trace_array *tr = filp->private_data;
3086 char buf[MAX_TRACER_SIZE+2]; 3456 char buf[MAX_TRACER_SIZE+2];
3087 int r; 3457 int r;
3088 3458
3089 mutex_lock(&trace_types_lock); 3459 mutex_lock(&trace_types_lock);
3090 r = sprintf(buf, "%s\n", current_trace->name); 3460 r = sprintf(buf, "%s\n", tr->current_trace->name);
3091 mutex_unlock(&trace_types_lock); 3461 mutex_unlock(&trace_types_lock);
3092 3462
3093 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3463 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3095,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3095 3465
3096int tracer_init(struct tracer *t, struct trace_array *tr) 3466int tracer_init(struct tracer *t, struct trace_array *tr)
3097{ 3467{
3098 tracing_reset_online_cpus(tr); 3468 tracing_reset_online_cpus(&tr->trace_buffer);
3099 return t->init(tr); 3469 return t->init(tr);
3100} 3470}
3101 3471
3102static void set_buffer_entries(struct trace_array *tr, unsigned long val) 3472static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
3103{ 3473{
3104 int cpu; 3474 int cpu;
3475
3105 for_each_tracing_cpu(cpu) 3476 for_each_tracing_cpu(cpu)
3106 tr->data[cpu]->entries = val; 3477 per_cpu_ptr(buf->data, cpu)->entries = val;
3107} 3478}
3108 3479
3480#ifdef CONFIG_TRACER_MAX_TRACE
3109/* resize @tr's buffer to the size of @size_tr's entries */ 3481/* resize @tr's buffer to the size of @size_tr's entries */
3110static int resize_buffer_duplicate_size(struct trace_array *tr, 3482static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
3111 struct trace_array *size_tr, int cpu_id) 3483 struct trace_buffer *size_buf, int cpu_id)
3112{ 3484{
3113 int cpu, ret = 0; 3485 int cpu, ret = 0;
3114 3486
3115 if (cpu_id == RING_BUFFER_ALL_CPUS) { 3487 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3116 for_each_tracing_cpu(cpu) { 3488 for_each_tracing_cpu(cpu) {
3117 ret = ring_buffer_resize(tr->buffer, 3489 ret = ring_buffer_resize(trace_buf->buffer,
3118 size_tr->data[cpu]->entries, cpu); 3490 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
3119 if (ret < 0) 3491 if (ret < 0)
3120 break; 3492 break;
3121 tr->data[cpu]->entries = size_tr->data[cpu]->entries; 3493 per_cpu_ptr(trace_buf->data, cpu)->entries =
3494 per_cpu_ptr(size_buf->data, cpu)->entries;
3122 } 3495 }
3123 } else { 3496 } else {
3124 ret = ring_buffer_resize(tr->buffer, 3497 ret = ring_buffer_resize(trace_buf->buffer,
3125 size_tr->data[cpu_id]->entries, cpu_id); 3498 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
3126 if (ret == 0) 3499 if (ret == 0)
3127 tr->data[cpu_id]->entries = 3500 per_cpu_ptr(trace_buf->data, cpu_id)->entries =
3128 size_tr->data[cpu_id]->entries; 3501 per_cpu_ptr(size_buf->data, cpu_id)->entries;
3129 } 3502 }
3130 3503
3131 return ret; 3504 return ret;
3132} 3505}
3506#endif /* CONFIG_TRACER_MAX_TRACE */
3133 3507
3134static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3508static int __tracing_resize_ring_buffer(struct trace_array *tr,
3509 unsigned long size, int cpu)
3135{ 3510{
3136 int ret; 3511 int ret;
3137 3512
@@ -3140,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3140 * we use the size that was given, and we can forget about 3515 * we use the size that was given, and we can forget about
3141 * expanding it later. 3516 * expanding it later.
3142 */ 3517 */
3143 ring_buffer_expanded = 1; 3518 ring_buffer_expanded = true;
3144 3519
3145 /* May be called before buffers are initialized */ 3520 /* May be called before buffers are initialized */
3146 if (!global_trace.buffer) 3521 if (!tr->trace_buffer.buffer)
3147 return 0; 3522 return 0;
3148 3523
3149 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3524 ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
3150 if (ret < 0) 3525 if (ret < 0)
3151 return ret; 3526 return ret;
3152 3527
3153 if (!current_trace->use_max_tr) 3528#ifdef CONFIG_TRACER_MAX_TRACE
3529 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
3530 !tr->current_trace->use_max_tr)
3154 goto out; 3531 goto out;
3155 3532
3156 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3533 ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
3157 if (ret < 0) { 3534 if (ret < 0) {
3158 int r = resize_buffer_duplicate_size(&global_trace, 3535 int r = resize_buffer_duplicate_size(&tr->trace_buffer,
3159 &global_trace, cpu); 3536 &tr->trace_buffer, cpu);
3160 if (r < 0) { 3537 if (r < 0) {
3161 /* 3538 /*
3162 * AARGH! We are left with different 3539 * AARGH! We are left with different
@@ -3179,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3179 } 3556 }
3180 3557
3181 if (cpu == RING_BUFFER_ALL_CPUS) 3558 if (cpu == RING_BUFFER_ALL_CPUS)
3182 set_buffer_entries(&max_tr, size); 3559 set_buffer_entries(&tr->max_buffer, size);
3183 else 3560 else
3184 max_tr.data[cpu]->entries = size; 3561 per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
3185 3562
3186 out: 3563 out:
3564#endif /* CONFIG_TRACER_MAX_TRACE */
3565
3187 if (cpu == RING_BUFFER_ALL_CPUS) 3566 if (cpu == RING_BUFFER_ALL_CPUS)
3188 set_buffer_entries(&global_trace, size); 3567 set_buffer_entries(&tr->trace_buffer, size);
3189 else 3568 else
3190 global_trace.data[cpu]->entries = size; 3569 per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
3191 3570
3192 return ret; 3571 return ret;
3193} 3572}
3194 3573
3195static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 3574static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
3575 unsigned long size, int cpu_id)
3196{ 3576{
3197 int ret = size; 3577 int ret = size;
3198 3578
@@ -3206,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3206 } 3586 }
3207 } 3587 }
3208 3588
3209 ret = __tracing_resize_ring_buffer(size, cpu_id); 3589 ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
3210 if (ret < 0) 3590 if (ret < 0)
3211 ret = -ENOMEM; 3591 ret = -ENOMEM;
3212 3592
@@ -3233,7 +3613,7 @@ int tracing_update_buffers(void)
3233 3613
3234 mutex_lock(&trace_types_lock); 3614 mutex_lock(&trace_types_lock);
3235 if (!ring_buffer_expanded) 3615 if (!ring_buffer_expanded)
3236 ret = __tracing_resize_ring_buffer(trace_buf_size, 3616 ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
3237 RING_BUFFER_ALL_CPUS); 3617 RING_BUFFER_ALL_CPUS);
3238 mutex_unlock(&trace_types_lock); 3618 mutex_unlock(&trace_types_lock);
3239 3619
@@ -3243,7 +3623,7 @@ int tracing_update_buffers(void)
3243struct trace_option_dentry; 3623struct trace_option_dentry;
3244 3624
3245static struct trace_option_dentry * 3625static struct trace_option_dentry *
3246create_trace_option_files(struct tracer *tracer); 3626create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3247 3627
3248static void 3628static void
3249destroy_trace_option_files(struct trace_option_dentry *topts); 3629destroy_trace_option_files(struct trace_option_dentry *topts);
@@ -3253,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)
3253 static struct trace_option_dentry *topts; 3633 static struct trace_option_dentry *topts;
3254 struct trace_array *tr = &global_trace; 3634 struct trace_array *tr = &global_trace;
3255 struct tracer *t; 3635 struct tracer *t;
3636#ifdef CONFIG_TRACER_MAX_TRACE
3256 bool had_max_tr; 3637 bool had_max_tr;
3638#endif
3257 int ret = 0; 3639 int ret = 0;
3258 3640
3259 mutex_lock(&trace_types_lock); 3641 mutex_lock(&trace_types_lock);
3260 3642
3261 if (!ring_buffer_expanded) { 3643 if (!ring_buffer_expanded) {
3262 ret = __tracing_resize_ring_buffer(trace_buf_size, 3644 ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
3263 RING_BUFFER_ALL_CPUS); 3645 RING_BUFFER_ALL_CPUS);
3264 if (ret < 0) 3646 if (ret < 0)
3265 goto out; 3647 goto out;
@@ -3274,18 +3656,21 @@ static int tracing_set_tracer(const char *buf)
3274 ret = -EINVAL; 3656 ret = -EINVAL;
3275 goto out; 3657 goto out;
3276 } 3658 }
3277 if (t == current_trace) 3659 if (t == tr->current_trace)
3278 goto out; 3660 goto out;
3279 3661
3280 trace_branch_disable(); 3662 trace_branch_disable();
3281 3663
3282 current_trace->enabled = false; 3664 tr->current_trace->enabled = false;
3283 3665
3284 if (current_trace->reset) 3666 if (tr->current_trace->reset)
3285 current_trace->reset(tr); 3667 tr->current_trace->reset(tr);
3286 3668
3287 had_max_tr = current_trace->allocated_snapshot; 3669 /* Current trace needs to be nop_trace before synchronize_sched */
3288 current_trace = &nop_trace; 3670 tr->current_trace = &nop_trace;
3671
3672#ifdef CONFIG_TRACER_MAX_TRACE
3673 had_max_tr = tr->allocated_snapshot;
3289 3674
3290 if (had_max_tr && !t->use_max_tr) { 3675 if (had_max_tr && !t->use_max_tr) {
3291 /* 3676 /*
@@ -3296,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)
3296 * so a synchronized_sched() is sufficient. 3681 * so a synchronized_sched() is sufficient.
3297 */ 3682 */
3298 synchronize_sched(); 3683 synchronize_sched();
3299 /* 3684 free_snapshot(tr);
3300 * We don't free the ring buffer. instead, resize it because
3301 * The max_tr ring buffer has some state (e.g. ring->clock) and
3302 * we want preserve it.
3303 */
3304 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3305 set_buffer_entries(&max_tr, 1);
3306 tracing_reset_online_cpus(&max_tr);
3307 current_trace->allocated_snapshot = false;
3308 } 3685 }
3686#endif
3309 destroy_trace_option_files(topts); 3687 destroy_trace_option_files(topts);
3310 3688
3311 topts = create_trace_option_files(t); 3689 topts = create_trace_option_files(tr, t);
3690
3691#ifdef CONFIG_TRACER_MAX_TRACE
3312 if (t->use_max_tr && !had_max_tr) { 3692 if (t->use_max_tr && !had_max_tr) {
3313 /* we need to make per cpu buffer sizes equivalent */ 3693 ret = alloc_snapshot(tr);
3314 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3315 RING_BUFFER_ALL_CPUS);
3316 if (ret < 0) 3694 if (ret < 0)
3317 goto out; 3695 goto out;
3318 t->allocated_snapshot = true;
3319 } 3696 }
3697#endif
3320 3698
3321 if (t->init) { 3699 if (t->init) {
3322 ret = tracer_init(t, tr); 3700 ret = tracer_init(t, tr);
@@ -3324,8 +3702,8 @@ static int tracing_set_tracer(const char *buf)
3324 goto out; 3702 goto out;
3325 } 3703 }
3326 3704
3327 current_trace = t; 3705 tr->current_trace = t;
3328 current_trace->enabled = true; 3706 tr->current_trace->enabled = true;
3329 trace_branch_enable(tr); 3707 trace_branch_enable(tr);
3330 out: 3708 out:
3331 mutex_unlock(&trace_types_lock); 3709 mutex_unlock(&trace_types_lock);
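tracing_set_tracer() now delegates the snapshot buffer lifecycle to the alloc_snapshot()/free_snapshot() helpers referenced above instead of open-coding the resize dance. A compressed sketch of the ordering, with locking and error handling elided:

/* Sketch: switch the tracer of one instance (locking/errors elided). */
static void switch_tracer(struct trace_array *tr, struct tracer *t)
{
	tr->current_trace->enabled = false;
	if (tr->current_trace->reset)
		tr->current_trace->reset(tr);

	/* must be nop_trace before synchronize_sched(), as noted above */
	tr->current_trace = &nop_trace;

#ifdef CONFIG_TRACER_MAX_TRACE
	if (tr->allocated_snapshot && !t->use_max_tr) {
		synchronize_sched();
		free_snapshot(tr);	/* new tracer has no use for it */
	}
	if (t->use_max_tr && !tr->allocated_snapshot)
		alloc_snapshot(tr);	/* latency tracers need the max buffer */
#endif
	if (t->init)
		t->init(tr);

	tr->current_trace = t;
	tr->current_trace->enabled = true;
}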
@@ -3399,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3399 3777
3400static int tracing_open_pipe(struct inode *inode, struct file *filp) 3778static int tracing_open_pipe(struct inode *inode, struct file *filp)
3401{ 3779{
3402 long cpu_file = (long) inode->i_private; 3780 struct trace_cpu *tc = inode->i_private;
3781 struct trace_array *tr = tc->tr;
3403 struct trace_iterator *iter; 3782 struct trace_iterator *iter;
3404 int ret = 0; 3783 int ret = 0;
3405 3784
@@ -3424,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3424 ret = -ENOMEM; 3803 ret = -ENOMEM;
3425 goto fail; 3804 goto fail;
3426 } 3805 }
3427 *iter->trace = *current_trace; 3806 *iter->trace = *tr->current_trace;
3428 3807
3429 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3808 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3430 ret = -ENOMEM; 3809 ret = -ENOMEM;
@@ -3441,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3441 if (trace_clocks[trace_clock_id].in_ns) 3820 if (trace_clocks[trace_clock_id].in_ns)
3442 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3821 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3443 3822
3444 iter->cpu_file = cpu_file; 3823 iter->cpu_file = tc->cpu;
3445 iter->tr = &global_trace; 3824 iter->tr = tc->tr;
3825 iter->trace_buffer = &tc->tr->trace_buffer;
3446 mutex_init(&iter->mutex); 3826 mutex_init(&iter->mutex);
3447 filp->private_data = iter; 3827 filp->private_data = iter;
3448 3828
@@ -3481,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3481} 3861}
3482 3862
3483static unsigned int 3863static unsigned int
3484tracing_poll_pipe(struct file *filp, poll_table *poll_table) 3864trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
3485{ 3865{
3486 struct trace_iterator *iter = filp->private_data; 3866 /* Iterators are static, they should be filled or empty */
3867 if (trace_buffer_iter(iter, iter->cpu_file))
3868 return POLLIN | POLLRDNORM;
3487 3869
3488 if (trace_flags & TRACE_ITER_BLOCK) { 3870 if (trace_flags & TRACE_ITER_BLOCK)
3489 /* 3871 /*
3490 * Always select as readable when in blocking mode 3872 * Always select as readable when in blocking mode
3491 */ 3873 */
3492 return POLLIN | POLLRDNORM; 3874 return POLLIN | POLLRDNORM;
3493 } else { 3875 else
3494 if (!trace_empty(iter)) 3876 return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
3495 return POLLIN | POLLRDNORM; 3877 filp, poll_table);
3496 poll_wait(filp, &trace_wait, poll_table); 3878}
3497 if (!trace_empty(iter))
3498 return POLLIN | POLLRDNORM;
3499 3879
3500 return 0; 3880static unsigned int
3501 } 3881tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3882{
3883 struct trace_iterator *iter = filp->private_data;
3884
3885 return trace_poll(iter, filp, poll_table);
3502} 3886}
3503 3887
3504/* 3888/*
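Polling is now routed through ring_buffer_poll_wait() instead of the global trace_wait queue, so poll() on trace_pipe wakes only when the watched buffer has data (or returns readable immediately for static iterators and in blocking mode). The user-space side stays the usual poll loop; a small sketch assuming the default debugfs mount point:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);
	if (pfd.fd < 0) {
		perror("trace_pipe");
		return 1;
	}
	pfd.events = POLLIN;

	/* Block until the ring buffer has events, then drain them. */
	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		n = read(pfd.fd, buf, sizeof(buf));
		if (n <= 0)
			break;
		fwrite(buf, 1, n, stdout);
	}
	close(pfd.fd);
	return 0;
}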
@@ -3564,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3564 size_t cnt, loff_t *ppos) 3948 size_t cnt, loff_t *ppos)
3565{ 3949{
3566 struct trace_iterator *iter = filp->private_data; 3950 struct trace_iterator *iter = filp->private_data;
3951 struct trace_array *tr = iter->tr;
3567 ssize_t sret; 3952 ssize_t sret;
3568 3953
3569 /* return any leftover data */ 3954 /* return any leftover data */
@@ -3575,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3575 3960
3576 /* copy the tracer to avoid using a global lock all around */ 3961 /* copy the tracer to avoid using a global lock all around */
3577 mutex_lock(&trace_types_lock); 3962 mutex_lock(&trace_types_lock);
3578 if (unlikely(iter->trace->name != current_trace->name)) 3963 if (unlikely(iter->trace->name != tr->current_trace->name))
3579 *iter->trace = *current_trace; 3964 *iter->trace = *tr->current_trace;
3580 mutex_unlock(&trace_types_lock); 3965 mutex_unlock(&trace_types_lock);
3581 3966
3582 /* 3967 /*
@@ -3732,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3732 .ops = &tracing_pipe_buf_ops, 4117 .ops = &tracing_pipe_buf_ops,
3733 .spd_release = tracing_spd_release_pipe, 4118 .spd_release = tracing_spd_release_pipe,
3734 }; 4119 };
4120 struct trace_array *tr = iter->tr;
3735 ssize_t ret; 4121 ssize_t ret;
3736 size_t rem; 4122 size_t rem;
3737 unsigned int i; 4123 unsigned int i;
@@ -3741,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3741 4127
3742 /* copy the tracer to avoid using a global lock all around */ 4128 /* copy the tracer to avoid using a global lock all around */
3743 mutex_lock(&trace_types_lock); 4129 mutex_lock(&trace_types_lock);
3744 if (unlikely(iter->trace->name != current_trace->name)) 4130 if (unlikely(iter->trace->name != tr->current_trace->name))
3745 *iter->trace = *current_trace; 4131 *iter->trace = *tr->current_trace;
3746 mutex_unlock(&trace_types_lock); 4132 mutex_unlock(&trace_types_lock);
3747 4133
3748 mutex_lock(&iter->mutex); 4134 mutex_lock(&iter->mutex);
@@ -3804,43 +4190,19 @@ out_err:
3804 goto out; 4190 goto out;
3805} 4191}
3806 4192
3807struct ftrace_entries_info {
3808 struct trace_array *tr;
3809 int cpu;
3810};
3811
3812static int tracing_entries_open(struct inode *inode, struct file *filp)
3813{
3814 struct ftrace_entries_info *info;
3815
3816 if (tracing_disabled)
3817 return -ENODEV;
3818
3819 info = kzalloc(sizeof(*info), GFP_KERNEL);
3820 if (!info)
3821 return -ENOMEM;
3822
3823 info->tr = &global_trace;
3824 info->cpu = (unsigned long)inode->i_private;
3825
3826 filp->private_data = info;
3827
3828 return 0;
3829}
3830
3831static ssize_t 4193static ssize_t
3832tracing_entries_read(struct file *filp, char __user *ubuf, 4194tracing_entries_read(struct file *filp, char __user *ubuf,
3833 size_t cnt, loff_t *ppos) 4195 size_t cnt, loff_t *ppos)
3834{ 4196{
3835 struct ftrace_entries_info *info = filp->private_data; 4197 struct trace_cpu *tc = filp->private_data;
3836 struct trace_array *tr = info->tr; 4198 struct trace_array *tr = tc->tr;
3837 char buf[64]; 4199 char buf[64];
3838 int r = 0; 4200 int r = 0;
3839 ssize_t ret; 4201 ssize_t ret;
3840 4202
3841 mutex_lock(&trace_types_lock); 4203 mutex_lock(&trace_types_lock);
3842 4204
3843 if (info->cpu == RING_BUFFER_ALL_CPUS) { 4205 if (tc->cpu == RING_BUFFER_ALL_CPUS) {
3844 int cpu, buf_size_same; 4206 int cpu, buf_size_same;
3845 unsigned long size; 4207 unsigned long size;
3846 4208
@@ -3850,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3850 for_each_tracing_cpu(cpu) { 4212 for_each_tracing_cpu(cpu) {
3851 /* fill in the size from first enabled cpu */ 4213 /* fill in the size from first enabled cpu */
3852 if (size == 0) 4214 if (size == 0)
3853 size = tr->data[cpu]->entries; 4215 size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
3854 if (size != tr->data[cpu]->entries) { 4216 if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
3855 buf_size_same = 0; 4217 buf_size_same = 0;
3856 break; 4218 break;
3857 } 4219 }
@@ -3867,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3867 } else 4229 } else
3868 r = sprintf(buf, "X\n"); 4230 r = sprintf(buf, "X\n");
3869 } else 4231 } else
3870 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); 4232 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
3871 4233
3872 mutex_unlock(&trace_types_lock); 4234 mutex_unlock(&trace_types_lock);
3873 4235
@@ -3879,7 +4241,7 @@ static ssize_t
3879tracing_entries_write(struct file *filp, const char __user *ubuf, 4241tracing_entries_write(struct file *filp, const char __user *ubuf,
3880 size_t cnt, loff_t *ppos) 4242 size_t cnt, loff_t *ppos)
3881{ 4243{
3882 struct ftrace_entries_info *info = filp->private_data; 4244 struct trace_cpu *tc = filp->private_data;
3883 unsigned long val; 4245 unsigned long val;
3884 int ret; 4246 int ret;
3885 4247
@@ -3894,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3894 /* value is in KB */ 4256 /* value is in KB */
3895 val <<= 10; 4257 val <<= 10;
3896 4258
3897 ret = tracing_resize_ring_buffer(val, info->cpu); 4259 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
3898 if (ret < 0) 4260 if (ret < 0)
3899 return ret; 4261 return ret;
3900 4262
@@ -3903,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3903 return cnt; 4265 return cnt;
3904} 4266}
3905 4267
3906static int
3907tracing_entries_release(struct inode *inode, struct file *filp)
3908{
3909 struct ftrace_entries_info *info = filp->private_data;
3910
3911 kfree(info);
3912
3913 return 0;
3914}
3915
3916static ssize_t 4268static ssize_t
3917tracing_total_entries_read(struct file *filp, char __user *ubuf, 4269tracing_total_entries_read(struct file *filp, char __user *ubuf,
3918 size_t cnt, loff_t *ppos) 4270 size_t cnt, loff_t *ppos)
@@ -3924,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3924 4276
3925 mutex_lock(&trace_types_lock); 4277 mutex_lock(&trace_types_lock);
3926 for_each_tracing_cpu(cpu) { 4278 for_each_tracing_cpu(cpu) {
3927 size += tr->data[cpu]->entries >> 10; 4279 size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
3928 if (!ring_buffer_expanded) 4280 if (!ring_buffer_expanded)
3929 expanded_size += trace_buf_size >> 10; 4281 expanded_size += trace_buf_size >> 10;
3930 } 4282 }
@@ -3954,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3954static int 4306static int
3955tracing_free_buffer_release(struct inode *inode, struct file *filp) 4307tracing_free_buffer_release(struct inode *inode, struct file *filp)
3956{ 4308{
4309 struct trace_array *tr = inode->i_private;
4310
3957 /* disable tracing ? */ 4311 /* disable tracing ? */
3958 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4312 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3959 tracing_off(); 4313 tracing_off();
3960 /* resize the ring buffer to 0 */ 4314 /* resize the ring buffer to 0 */
3961 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); 4315 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
3962 4316
3963 return 0; 4317 return 0;
3964} 4318}
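free_buffer release now resolves the instance from inode->i_private, so closing the file shrinks that instance's ring buffer (and, with the stop_on_free option set, stops tracing first). Holding it open for the duration of a tracing session therefore gives cleanup even if the tracer process dies, as in this user-space sketch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path assumes the default debugfs mount point. */
	int fd = open("/sys/kernel/debug/tracing/free_buffer", O_WRONLY);

	if (fd < 0) {
		perror("free_buffer");
		return 1;
	}
	/*
	 * ... trace the workload here ...
	 * When this process exits (or is killed) the kernel closes the
	 * descriptor, and the release handler resizes the ring buffer
	 * down, after optionally turning tracing off.
	 */
	close(fd);
	return 0;
}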
@@ -4027,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4027 4381
4028 local_save_flags(irq_flags); 4382 local_save_flags(irq_flags);
4029 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4383 size = sizeof(*entry) + cnt + 2; /* possible \n added */
4030 buffer = global_trace.buffer; 4384 buffer = global_trace.trace_buffer.buffer;
4031 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4385 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
4032 irq_flags, preempt_count()); 4386 irq_flags, preempt_count());
4033 if (!event) { 4387 if (!event) {
@@ -4069,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4069 4423
4070static int tracing_clock_show(struct seq_file *m, void *v) 4424static int tracing_clock_show(struct seq_file *m, void *v)
4071{ 4425{
4426 struct trace_array *tr = m->private;
4072 int i; 4427 int i;
4073 4428
4074 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 4429 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
4075 seq_printf(m, 4430 seq_printf(m,
4076 "%s%s%s%s", i ? " " : "", 4431 "%s%s%s%s", i ? " " : "",
4077 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 4432 i == tr->clock_id ? "[" : "", trace_clocks[i].name,
4078 i == trace_clock_id ? "]" : ""); 4433 i == tr->clock_id ? "]" : "");
4079 seq_putc(m, '\n'); 4434 seq_putc(m, '\n');
4080 4435
4081 return 0; 4436 return 0;
@@ -4084,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4084static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4439static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4085 size_t cnt, loff_t *fpos) 4440 size_t cnt, loff_t *fpos)
4086{ 4441{
4442 struct seq_file *m = filp->private_data;
4443 struct trace_array *tr = m->private;
4087 char buf[64]; 4444 char buf[64];
4088 const char *clockstr; 4445 const char *clockstr;
4089 int i; 4446 int i;
@@ -4105,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4105 if (i == ARRAY_SIZE(trace_clocks)) 4462 if (i == ARRAY_SIZE(trace_clocks))
4106 return -EINVAL; 4463 return -EINVAL;
4107 4464
4108 trace_clock_id = i;
4109
4110 mutex_lock(&trace_types_lock); 4465 mutex_lock(&trace_types_lock);
4111 4466
4112 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); 4467 tr->clock_id = i;
4113 if (max_tr.buffer) 4468
4114 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4469 ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
4115 4470
4116 /* 4471 /*
4117 * New clock may not be consistent with the previous clock. 4472 * New clock may not be consistent with the previous clock.
4118 * Reset the buffer so that it doesn't have incomparable timestamps. 4473 * Reset the buffer so that it doesn't have incomparable timestamps.
4119 */ 4474 */
4120 tracing_reset_online_cpus(&global_trace); 4475 tracing_reset_online_cpus(&global_trace.trace_buffer);
4121 tracing_reset_online_cpus(&max_tr); 4476
4477#ifdef CONFIG_TRACER_MAX_TRACE
4478 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4479 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4480 tracing_reset_online_cpus(&global_trace.max_buffer);
4481#endif
4122 4482
4123 mutex_unlock(&trace_types_lock); 4483 mutex_unlock(&trace_types_lock);
4124 4484
@@ -4131,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4131{ 4491{
4132 if (tracing_disabled) 4492 if (tracing_disabled)
4133 return -ENODEV; 4493 return -ENODEV;
4134 return single_open(file, tracing_clock_show, NULL); 4494
4495 return single_open(file, tracing_clock_show, inode->i_private);
4135} 4496}
4136 4497
4498struct ftrace_buffer_info {
4499 struct trace_iterator iter;
4500 void *spare;
4501 unsigned int read;
4502};
4503
4137#ifdef CONFIG_TRACER_SNAPSHOT 4504#ifdef CONFIG_TRACER_SNAPSHOT
4138static int tracing_snapshot_open(struct inode *inode, struct file *file) 4505static int tracing_snapshot_open(struct inode *inode, struct file *file)
4139{ 4506{
4507 struct trace_cpu *tc = inode->i_private;
4140 struct trace_iterator *iter; 4508 struct trace_iterator *iter;
4509 struct seq_file *m;
4141 int ret = 0; 4510 int ret = 0;
4142 4511
4143 if (file->f_mode & FMODE_READ) { 4512 if (file->f_mode & FMODE_READ) {
4144 iter = __tracing_open(inode, file, true); 4513 iter = __tracing_open(inode, file, true);
4145 if (IS_ERR(iter)) 4514 if (IS_ERR(iter))
4146 ret = PTR_ERR(iter); 4515 ret = PTR_ERR(iter);
4516 } else {
4517 /* Writes still need the seq_file to hold the private data */
4518 m = kzalloc(sizeof(*m), GFP_KERNEL);
4519 if (!m)
4520 return -ENOMEM;
4521 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4522 if (!iter) {
4523 kfree(m);
4524 return -ENOMEM;
4525 }
4526 iter->tr = tc->tr;
4527 iter->trace_buffer = &tc->tr->max_buffer;
4528 iter->cpu_file = tc->cpu;
4529 m->private = iter;
4530 file->private_data = m;
4147 } 4531 }
4532
4148 return ret; 4533 return ret;
4149} 4534}
4150 4535
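A write-only open of the snapshot file gets no iterator from __tracing_open(), so the code builds a stub seq_file whose ->private is a minimal trace_iterator; the write handler can then locate the instance, the max buffer and the CPU the same way in both modes. A stripped-down sketch of that stub (unwinding shortened, names as in the hunk):

/* Sketch: give a write-only opener the same ->private_data shape. */
static int snapshot_open_write_only(struct file *file, struct trace_cpu *tc)
{
	struct trace_iterator *iter;
	struct seq_file *m;

	m = kzalloc(sizeof(*m), GFP_KERNEL);
	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
	if (!m || !iter) {
		kfree(m);
		kfree(iter);
		return -ENOMEM;
	}
	iter->tr = tc->tr;
	iter->trace_buffer = &tc->tr->max_buffer;	/* writes act on the snapshot */
	iter->cpu_file = tc->cpu;
	m->private = iter;				/* what the write handler reads back */
	file->private_data = m;
	return 0;
}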
@@ -4152,6 +4537,9 @@ static ssize_t
4152tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, 4537tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4153 loff_t *ppos) 4538 loff_t *ppos)
4154{ 4539{
4540 struct seq_file *m = filp->private_data;
4541 struct trace_iterator *iter = m->private;
4542 struct trace_array *tr = iter->tr;
4155 unsigned long val; 4543 unsigned long val;
4156 int ret; 4544 int ret;
4157 4545
@@ -4165,40 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4165 4553
4166 mutex_lock(&trace_types_lock); 4554 mutex_lock(&trace_types_lock);
4167 4555
4168 if (current_trace->use_max_tr) { 4556 if (tr->current_trace->use_max_tr) {
4169 ret = -EBUSY; 4557 ret = -EBUSY;
4170 goto out; 4558 goto out;
4171 } 4559 }
4172 4560
4173 switch (val) { 4561 switch (val) {
4174 case 0: 4562 case 0:
4175 if (current_trace->allocated_snapshot) { 4563 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4176 /* free spare buffer */ 4564 ret = -EINVAL;
4177 ring_buffer_resize(max_tr.buffer, 1, 4565 break;
4178 RING_BUFFER_ALL_CPUS);
4179 set_buffer_entries(&max_tr, 1);
4180 tracing_reset_online_cpus(&max_tr);
4181 current_trace->allocated_snapshot = false;
4182 } 4566 }
4567 if (tr->allocated_snapshot)
4568 free_snapshot(tr);
4183 break; 4569 break;
4184 case 1: 4570 case 1:
4185 if (!current_trace->allocated_snapshot) { 4571/* Only allow per-cpu swap if the ring buffer supports it */
4186 /* allocate spare buffer */ 4572#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
4187 ret = resize_buffer_duplicate_size(&max_tr, 4573 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4188 &global_trace, RING_BUFFER_ALL_CPUS); 4574 ret = -EINVAL;
4575 break;
4576 }
4577#endif
4578 if (!tr->allocated_snapshot) {
4579 ret = alloc_snapshot(tr);
4189 if (ret < 0) 4580 if (ret < 0)
4190 break; 4581 break;
4191 current_trace->allocated_snapshot = true;
4192 } 4582 }
4193
4194 local_irq_disable(); 4583 local_irq_disable();
4195 /* Now, we're going to swap */ 4584 /* Now, we're going to swap */
4196 update_max_tr(&global_trace, current, smp_processor_id()); 4585 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4586 update_max_tr(tr, current, smp_processor_id());
4587 else
4588 update_max_tr_single(tr, current, iter->cpu_file);
4197 local_irq_enable(); 4589 local_irq_enable();
4198 break; 4590 break;
4199 default: 4591 default:
4200 if (current_trace->allocated_snapshot) 4592 if (tr->allocated_snapshot) {
4201 tracing_reset_online_cpus(&max_tr); 4593 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4594 tracing_reset_online_cpus(&tr->max_buffer);
4595 else
4596 tracing_reset(&tr->max_buffer, iter->cpu_file);
4597 }
4202 break; 4598 break;
4203 } 4599 }
4204 4600
@@ -4210,6 +4606,51 @@ out:
4210 mutex_unlock(&trace_types_lock); 4606 mutex_unlock(&trace_types_lock);
4211 return ret; 4607 return ret;
4212} 4608}
4609
4610static int tracing_snapshot_release(struct inode *inode, struct file *file)
4611{
4612 struct seq_file *m = file->private_data;
4613
4614 if (file->f_mode & FMODE_READ)
4615 return tracing_release(inode, file);
4616
4617 /* If write only, the seq_file is just a stub */
4618 if (m)
4619 kfree(m->private);
4620 kfree(m);
4621
4622 return 0;
4623}
4624
4625static int tracing_buffers_open(struct inode *inode, struct file *filp);
4626static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
4627 size_t count, loff_t *ppos);
4628static int tracing_buffers_release(struct inode *inode, struct file *file);
4629static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4630 struct pipe_inode_info *pipe, size_t len, unsigned int flags);
4631
4632static int snapshot_raw_open(struct inode *inode, struct file *filp)
4633{
4634 struct ftrace_buffer_info *info;
4635 int ret;
4636
4637 ret = tracing_buffers_open(inode, filp);
4638 if (ret < 0)
4639 return ret;
4640
4641 info = filp->private_data;
4642
4643 if (info->iter.trace->use_max_tr) {
4644 tracing_buffers_release(inode, filp);
4645 return -EBUSY;
4646 }
4647
4648 info->iter.snapshot = true;
4649 info->iter.trace_buffer = &info->iter.tr->max_buffer;
4650
4651 return ret;
4652}
4653
4213#endif /* CONFIG_TRACER_SNAPSHOT */ 4654#endif /* CONFIG_TRACER_SNAPSHOT */
4214 4655
4215 4656
@@ -4237,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {
4237}; 4678};
4238 4679
4239static const struct file_operations tracing_entries_fops = { 4680static const struct file_operations tracing_entries_fops = {
4240 .open = tracing_entries_open, 4681 .open = tracing_open_generic,
4241 .read = tracing_entries_read, 4682 .read = tracing_entries_read,
4242 .write = tracing_entries_write, 4683 .write = tracing_entries_write,
4243 .release = tracing_entries_release,
4244 .llseek = generic_file_llseek, 4684 .llseek = generic_file_llseek,
4245}; 4685};
4246 4686
@@ -4275,20 +4715,23 @@ static const struct file_operations snapshot_fops = {
4275 .read = seq_read, 4715 .read = seq_read,
4276 .write = tracing_snapshot_write, 4716 .write = tracing_snapshot_write,
4277 .llseek = tracing_seek, 4717 .llseek = tracing_seek,
4278 .release = tracing_release, 4718 .release = tracing_snapshot_release,
4279}; 4719};
4280#endif /* CONFIG_TRACER_SNAPSHOT */
4281 4720
4282struct ftrace_buffer_info { 4721static const struct file_operations snapshot_raw_fops = {
4283 struct trace_array *tr; 4722 .open = snapshot_raw_open,
4284 void *spare; 4723 .read = tracing_buffers_read,
4285 int cpu; 4724 .release = tracing_buffers_release,
4286 unsigned int read; 4725 .splice_read = tracing_buffers_splice_read,
4726 .llseek = no_llseek,
4287}; 4727};
4288 4728
4729#endif /* CONFIG_TRACER_SNAPSHOT */
4730
4289static int tracing_buffers_open(struct inode *inode, struct file *filp) 4731static int tracing_buffers_open(struct inode *inode, struct file *filp)
4290{ 4732{
4291 int cpu = (int)(long)inode->i_private; 4733 struct trace_cpu *tc = inode->i_private;
4734 struct trace_array *tr = tc->tr;
4292 struct ftrace_buffer_info *info; 4735 struct ftrace_buffer_info *info;
4293 4736
4294 if (tracing_disabled) 4737 if (tracing_disabled)
@@ -4298,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4298 if (!info) 4741 if (!info)
4299 return -ENOMEM; 4742 return -ENOMEM;
4300 4743
4301 info->tr = &global_trace; 4744 mutex_lock(&trace_types_lock);
4302 info->cpu = cpu; 4745
4303 info->spare = NULL; 4746 tr->ref++;
4747
4748 info->iter.tr = tr;
4749 info->iter.cpu_file = tc->cpu;
4750 info->iter.trace = tr->current_trace;
4751 info->iter.trace_buffer = &tr->trace_buffer;
4752 info->spare = NULL;
4304 /* Force reading ring buffer for first read */ 4753 /* Force reading ring buffer for first read */
4305 info->read = (unsigned int)-1; 4754 info->read = (unsigned int)-1;
4306 4755
4307 filp->private_data = info; 4756 filp->private_data = info;
4308 4757
4758 mutex_unlock(&trace_types_lock);
4759
4309 return nonseekable_open(inode, filp); 4760 return nonseekable_open(inode, filp);
4310} 4761}
4311 4762
4763static unsigned int
4764tracing_buffers_poll(struct file *filp, poll_table *poll_table)
4765{
4766 struct ftrace_buffer_info *info = filp->private_data;
4767 struct trace_iterator *iter = &info->iter;
4768
4769 return trace_poll(iter, filp, poll_table);
4770}
4771
4312static ssize_t 4772static ssize_t
4313tracing_buffers_read(struct file *filp, char __user *ubuf, 4773tracing_buffers_read(struct file *filp, char __user *ubuf,
4314 size_t count, loff_t *ppos) 4774 size_t count, loff_t *ppos)
4315{ 4775{
4316 struct ftrace_buffer_info *info = filp->private_data; 4776 struct ftrace_buffer_info *info = filp->private_data;
4777 struct trace_iterator *iter = &info->iter;
4317 ssize_t ret; 4778 ssize_t ret;
4318 size_t size; 4779 ssize_t size;
4319 4780
4320 if (!count) 4781 if (!count)
4321 return 0; 4782 return 0;
4322 4783
4784 mutex_lock(&trace_types_lock);
4785
4786#ifdef CONFIG_TRACER_MAX_TRACE
4787 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4788 size = -EBUSY;
4789 goto out_unlock;
4790 }
4791#endif
4792
4323 if (!info->spare) 4793 if (!info->spare)
4324 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); 4794 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
4795 iter->cpu_file);
4796 size = -ENOMEM;
4325 if (!info->spare) 4797 if (!info->spare)
4326 return -ENOMEM; 4798 goto out_unlock;
4327 4799
4328 /* Do we have previous read data to read? */ 4800 /* Do we have previous read data to read? */
4329 if (info->read < PAGE_SIZE) 4801 if (info->read < PAGE_SIZE)
4330 goto read; 4802 goto read;
4331 4803
4332 trace_access_lock(info->cpu); 4804 again:
4333 ret = ring_buffer_read_page(info->tr->buffer, 4805 trace_access_lock(iter->cpu_file);
4806 ret = ring_buffer_read_page(iter->trace_buffer->buffer,
4334 &info->spare, 4807 &info->spare,
4335 count, 4808 count,
4336 info->cpu, 0); 4809 iter->cpu_file, 0);
4337 trace_access_unlock(info->cpu); 4810 trace_access_unlock(iter->cpu_file);
4338 if (ret < 0)
4339 return 0;
4340 4811
4341 info->read = 0; 4812 if (ret < 0) {
4813 if (trace_empty(iter)) {
4814 if ((filp->f_flags & O_NONBLOCK)) {
4815 size = -EAGAIN;
4816 goto out_unlock;
4817 }
4818 mutex_unlock(&trace_types_lock);
4819 iter->trace->wait_pipe(iter);
4820 mutex_lock(&trace_types_lock);
4821 if (signal_pending(current)) {
4822 size = -EINTR;
4823 goto out_unlock;
4824 }
4825 goto again;
4826 }
4827 size = 0;
4828 goto out_unlock;
4829 }
4342 4830
4343read: 4831 info->read = 0;
4832 read:
4344 size = PAGE_SIZE - info->read; 4833 size = PAGE_SIZE - info->read;
4345 if (size > count) 4834 if (size > count)
4346 size = count; 4835 size = count;
4347 4836
4348 ret = copy_to_user(ubuf, info->spare + info->read, size); 4837 ret = copy_to_user(ubuf, info->spare + info->read, size);
4349 if (ret == size) 4838 if (ret == size) {
4350 return -EFAULT; 4839 size = -EFAULT;
4840 goto out_unlock;
4841 }
4351 size -= ret; 4842 size -= ret;
4352 4843
4353 *ppos += size; 4844 *ppos += size;
4354 info->read += size; 4845 info->read += size;
4355 4846
4847 out_unlock:
4848 mutex_unlock(&trace_types_lock);
4849
4356 return size; 4850 return size;
4357} 4851}
4358 4852
4359static int tracing_buffers_release(struct inode *inode, struct file *file) 4853static int tracing_buffers_release(struct inode *inode, struct file *file)
4360{ 4854{
4361 struct ftrace_buffer_info *info = file->private_data; 4855 struct ftrace_buffer_info *info = file->private_data;
4856 struct trace_iterator *iter = &info->iter;
4857
4858 mutex_lock(&trace_types_lock);
4859
4860 WARN_ON(!iter->tr->ref);
4861 iter->tr->ref--;
4362 4862
4363 if (info->spare) 4863 if (info->spare)
4364 ring_buffer_free_read_page(info->tr->buffer, info->spare); 4864 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
4365 kfree(info); 4865 kfree(info);
4366 4866
4867 mutex_unlock(&trace_types_lock);
4868
4367 return 0; 4869 return 0;
4368} 4870}
4369 4871
@@ -4428,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4428 unsigned int flags) 4930 unsigned int flags)
4429{ 4931{
4430 struct ftrace_buffer_info *info = file->private_data; 4932 struct ftrace_buffer_info *info = file->private_data;
4933 struct trace_iterator *iter = &info->iter;
4431 struct partial_page partial_def[PIPE_DEF_BUFFERS]; 4934 struct partial_page partial_def[PIPE_DEF_BUFFERS];
4432 struct page *pages_def[PIPE_DEF_BUFFERS]; 4935 struct page *pages_def[PIPE_DEF_BUFFERS];
4433 struct splice_pipe_desc spd = { 4936 struct splice_pipe_desc spd = {
@@ -4440,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4440 }; 4943 };
4441 struct buffer_ref *ref; 4944 struct buffer_ref *ref;
4442 int entries, size, i; 4945 int entries, size, i;
4443 size_t ret; 4946 ssize_t ret;
4444 4947
4445 if (splice_grow_spd(pipe, &spd)) 4948 mutex_lock(&trace_types_lock);
4446 return -ENOMEM; 4949
4950#ifdef CONFIG_TRACER_MAX_TRACE
4951 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4952 ret = -EBUSY;
4953 goto out;
4954 }
4955#endif
4956
4957 if (splice_grow_spd(pipe, &spd)) {
4958 ret = -ENOMEM;
4959 goto out;
4960 }
4447 4961
4448 if (*ppos & (PAGE_SIZE - 1)) { 4962 if (*ppos & (PAGE_SIZE - 1)) {
4449 ret = -EINVAL; 4963 ret = -EINVAL;
@@ -4458,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4458 len &= PAGE_MASK; 4972 len &= PAGE_MASK;
4459 } 4973 }
4460 4974
4461 trace_access_lock(info->cpu); 4975 again:
4462 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 4976 trace_access_lock(iter->cpu_file);
4977 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4463 4978
4464 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { 4979 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
4465 struct page *page; 4980 struct page *page;
@@ -4470,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4470 break; 4985 break;
4471 4986
4472 ref->ref = 1; 4987 ref->ref = 1;
4473 ref->buffer = info->tr->buffer; 4988 ref->buffer = iter->trace_buffer->buffer;
4474 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); 4989 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
4475 if (!ref->page) { 4990 if (!ref->page) {
4476 kfree(ref); 4991 kfree(ref);
4477 break; 4992 break;
4478 } 4993 }
4479 4994
4480 r = ring_buffer_read_page(ref->buffer, &ref->page, 4995 r = ring_buffer_read_page(ref->buffer, &ref->page,
4481 len, info->cpu, 1); 4996 len, iter->cpu_file, 1);
4482 if (r < 0) { 4997 if (r < 0) {
4483 ring_buffer_free_read_page(ref->buffer, ref->page); 4998 ring_buffer_free_read_page(ref->buffer, ref->page);
4484 kfree(ref); 4999 kfree(ref);
@@ -4502,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4502 spd.nr_pages++; 5017 spd.nr_pages++;
4503 *ppos += PAGE_SIZE; 5018 *ppos += PAGE_SIZE;
4504 5019
4505 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 5020 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4506 } 5021 }
4507 5022
4508 trace_access_unlock(info->cpu); 5023 trace_access_unlock(iter->cpu_file);
4509 spd.nr_pages = i; 5024 spd.nr_pages = i;
4510 5025
4511 /* did we read anything? */ 5026 /* did we read anything? */
4512 if (!spd.nr_pages) { 5027 if (!spd.nr_pages) {
4513 if (flags & SPLICE_F_NONBLOCK) 5028 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
4514 ret = -EAGAIN; 5029 ret = -EAGAIN;
4515 else 5030 goto out;
4516 ret = 0; 5031 }
4517 /* TODO: block */ 5032 mutex_unlock(&trace_types_lock);
4518 goto out; 5033 iter->trace->wait_pipe(iter);
5034 mutex_lock(&trace_types_lock);
5035 if (signal_pending(current)) {
5036 ret = -EINTR;
5037 goto out;
5038 }
5039 goto again;
4519 } 5040 }
4520 5041
4521 ret = splice_to_pipe(pipe, &spd); 5042 ret = splice_to_pipe(pipe, &spd);
4522 splice_shrink_spd(&spd); 5043 splice_shrink_spd(&spd);
4523out: 5044out:
5045 mutex_unlock(&trace_types_lock);
5046
4524 return ret; 5047 return ret;
4525} 5048}
4526 5049
4527static const struct file_operations tracing_buffers_fops = { 5050static const struct file_operations tracing_buffers_fops = {
4528 .open = tracing_buffers_open, 5051 .open = tracing_buffers_open,
4529 .read = tracing_buffers_read, 5052 .read = tracing_buffers_read,
5053 .poll = tracing_buffers_poll,
4530 .release = tracing_buffers_release, 5054 .release = tracing_buffers_release,
4531 .splice_read = tracing_buffers_splice_read, 5055 .splice_read = tracing_buffers_splice_read,
4532 .llseek = no_llseek, 5056 .llseek = no_llseek,
@@ -4536,12 +5060,14 @@ static ssize_t
4536tracing_stats_read(struct file *filp, char __user *ubuf, 5060tracing_stats_read(struct file *filp, char __user *ubuf,
4537 size_t count, loff_t *ppos) 5061 size_t count, loff_t *ppos)
4538{ 5062{
4539 unsigned long cpu = (unsigned long)filp->private_data; 5063 struct trace_cpu *tc = filp->private_data;
4540 struct trace_array *tr = &global_trace; 5064 struct trace_array *tr = tc->tr;
5065 struct trace_buffer *trace_buf = &tr->trace_buffer;
4541 struct trace_seq *s; 5066 struct trace_seq *s;
4542 unsigned long cnt; 5067 unsigned long cnt;
4543 unsigned long long t; 5068 unsigned long long t;
4544 unsigned long usec_rem; 5069 unsigned long usec_rem;
5070 int cpu = tc->cpu;
4545 5071
4546 s = kmalloc(sizeof(*s), GFP_KERNEL); 5072 s = kmalloc(sizeof(*s), GFP_KERNEL);
4547 if (!s) 5073 if (!s)
@@ -4549,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4549 5075
4550 trace_seq_init(s); 5076 trace_seq_init(s);
4551 5077
4552 cnt = ring_buffer_entries_cpu(tr->buffer, cpu); 5078 cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
4553 trace_seq_printf(s, "entries: %ld\n", cnt); 5079 trace_seq_printf(s, "entries: %ld\n", cnt);
4554 5080
4555 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); 5081 cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
4556 trace_seq_printf(s, "overrun: %ld\n", cnt); 5082 trace_seq_printf(s, "overrun: %ld\n", cnt);
4557 5083
4558 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 5084 cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
4559 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 5085 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4560 5086
4561 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 5087 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
4562 trace_seq_printf(s, "bytes: %ld\n", cnt); 5088 trace_seq_printf(s, "bytes: %ld\n", cnt);
4563 5089
4564 if (trace_clocks[trace_clock_id].in_ns) { 5090 if (trace_clocks[trace_clock_id].in_ns) {
4565 /* local or global for trace_clock */ 5091 /* local or global for trace_clock */
4566 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5092 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4567 usec_rem = do_div(t, USEC_PER_SEC); 5093 usec_rem = do_div(t, USEC_PER_SEC);
4568 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", 5094 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4569 t, usec_rem); 5095 t, usec_rem);
4570 5096
4571 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 5097 t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
4572 usec_rem = do_div(t, USEC_PER_SEC); 5098 usec_rem = do_div(t, USEC_PER_SEC);
4573 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 5099 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4574 } else { 5100 } else {
4575 /* counter or tsc mode for trace_clock */ 5101 /* counter or tsc mode for trace_clock */
4576 trace_seq_printf(s, "oldest event ts: %llu\n", 5102 trace_seq_printf(s, "oldest event ts: %llu\n",
4577 ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5103 ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4578 5104
4579 trace_seq_printf(s, "now ts: %llu\n", 5105 trace_seq_printf(s, "now ts: %llu\n",
4580 ring_buffer_time_stamp(tr->buffer, cpu)); 5106 ring_buffer_time_stamp(trace_buf->buffer, cpu));
4581 } 5107 }
4582 5108
4583 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 5109 cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
4584 trace_seq_printf(s, "dropped events: %ld\n", cnt); 5110 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4585 5111
4586 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); 5112 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
4587 trace_seq_printf(s, "read events: %ld\n", cnt); 5113 trace_seq_printf(s, "read events: %ld\n", cnt);
4588 5114
4589 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5115 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4635,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {
4635 .read = tracing_read_dyn_info, 5161 .read = tracing_read_dyn_info,
4636 .llseek = generic_file_llseek, 5162 .llseek = generic_file_llseek,
4637}; 5163};
4638#endif 5164#endif /* CONFIG_DYNAMIC_FTRACE */
4639 5165
4640static struct dentry *d_tracer; 5166#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
5167static void
5168ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5169{
5170 tracing_snapshot();
5171}
4641 5172
4642struct dentry *tracing_init_dentry(void) 5173static void
5174ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5175{
5176 unsigned long *count = (long *)data;
5177
5178 if (!*count)
5179 return;
5180
5181 if (*count != -1)
5182 (*count)--;
5183
5184 tracing_snapshot();
5185}
5186
5187static int
5188ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5189 struct ftrace_probe_ops *ops, void *data)
5190{
5191 long count = (long)data;
5192
5193 seq_printf(m, "%ps:", (void *)ip);
5194
5195 seq_printf(m, "snapshot");
5196
5197 if (count == -1)
5198 seq_printf(m, ":unlimited\n");
5199 else
5200 seq_printf(m, ":count=%ld\n", count);
5201
5202 return 0;
5203}
5204
5205static struct ftrace_probe_ops snapshot_probe_ops = {
5206 .func = ftrace_snapshot,
5207 .print = ftrace_snapshot_print,
5208};
5209
5210static struct ftrace_probe_ops snapshot_count_probe_ops = {
5211 .func = ftrace_count_snapshot,
5212 .print = ftrace_snapshot_print,
5213};
5214
5215static int
5216ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
5217 char *glob, char *cmd, char *param, int enable)
4643{ 5218{
4644 static int once; 5219 struct ftrace_probe_ops *ops;
5220 void *count = (void *)-1;
5221 char *number;
5222 int ret;
4645 5223
4646 if (d_tracer) 5224 /* hash funcs only work with set_ftrace_filter */
4647 return d_tracer; 5225 if (!enable)
5226 return -EINVAL;
5227
5228 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
5229
5230 if (glob[0] == '!') {
5231 unregister_ftrace_function_probe_func(glob+1, ops);
5232 return 0;
5233 }
5234
5235 if (!param)
5236 goto out_reg;
5237
5238 number = strsep(&param, ":");
5239
5240 if (!strlen(number))
5241 goto out_reg;
5242
5243 /*
5244 * We use the callback data field (which is a pointer)
5245 * as our counter.
5246 */
5247 ret = kstrtoul(number, 0, (unsigned long *)&count);
5248 if (ret)
5249 return ret;
5250
5251 out_reg:
5252 ret = register_ftrace_function_probe(glob, ops, count);
5253
5254 if (ret >= 0)
5255 alloc_snapshot(&global_trace);
5256
5257 return ret < 0 ? ret : 0;
5258}
5259
5260static struct ftrace_func_command ftrace_snapshot_cmd = {
5261 .name = "snapshot",
5262 .func = ftrace_trace_snapshot_callback,
5263};
5264
5265static int register_snapshot_cmd(void)
5266{
5267 return register_ftrace_command(&ftrace_snapshot_cmd);
5268}
5269#else
5270static inline int register_snapshot_cmd(void) { return 0; }
5271#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5272
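The probes registered above boil down to calls to tracing_snapshot(), the same helper this series declares for the rest of the kernel in include/linux/kernel.h. A hedged sketch of how a module might use it; the function and module names are hypothetical, and it assumes CONFIG_TRACER_SNAPSHOT is enabled:

  #include <linux/kernel.h>   /* tracing_snapshot(), tracing_snapshot_alloc() */
  #include <linux/module.h>

  static int __init snap_example_init(void)
  {
          /*
           * Allocate the max_buffer while GFP_KERNEL allocations are
           * still possible (or boot with alloc_snapshot instead).
           */
          tracing_snapshot_alloc();
          return 0;
  }
  module_init(snap_example_init);

  /* Called from wherever the interesting condition is detected. */
  static void __maybe_unused snap_example_trip(void)
  {
          tracing_snapshot();     /* swap trace_buffer and max_buffer */
  }

  MODULE_LICENSE("GPL");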
5273struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
5274{
5275 if (tr->dir)
5276 return tr->dir;
4648 5277
4649 if (!debugfs_initialized()) 5278 if (!debugfs_initialized())
4650 return NULL; 5279 return NULL;
4651 5280
4652 d_tracer = debugfs_create_dir("tracing", NULL); 5281 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5282 tr->dir = debugfs_create_dir("tracing", NULL);
4653 5283
4654 if (!d_tracer && !once) { 5284 if (!tr->dir)
4655 once = 1; 5285 pr_warn_once("Could not create debugfs directory 'tracing'\n");
4656 pr_warning("Could not create debugfs directory 'tracing'\n");
4657 return NULL;
4658 }
4659 5286
4660 return d_tracer; 5287 return tr->dir;
4661} 5288}
4662 5289
4663static struct dentry *d_percpu; 5290struct dentry *tracing_init_dentry(void)
5291{
5292 return tracing_init_dentry_tr(&global_trace);
5293}
4664 5294
4665static struct dentry *tracing_dentry_percpu(void) 5295static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
4666{ 5296{
4667 static int once;
4668 struct dentry *d_tracer; 5297 struct dentry *d_tracer;
4669 5298
4670 if (d_percpu) 5299 if (tr->percpu_dir)
4671 return d_percpu; 5300 return tr->percpu_dir;
4672
4673 d_tracer = tracing_init_dentry();
4674 5301
5302 d_tracer = tracing_init_dentry_tr(tr);
4675 if (!d_tracer) 5303 if (!d_tracer)
4676 return NULL; 5304 return NULL;
4677 5305
4678 d_percpu = debugfs_create_dir("per_cpu", d_tracer); 5306 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
4679 5307
4680 if (!d_percpu && !once) { 5308 WARN_ONCE(!tr->percpu_dir,
4681 once = 1; 5309 "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
4682 pr_warning("Could not create debugfs directory 'per_cpu'\n");
4683 return NULL;
4684 }
4685 5310
4686 return d_percpu; 5311 return tr->percpu_dir;
4687} 5312}
4688 5313
4689static void tracing_init_debugfs_percpu(long cpu) 5314static void
5315tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
4690{ 5316{
4691 struct dentry *d_percpu = tracing_dentry_percpu(); 5317 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5318 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
4692 struct dentry *d_cpu; 5319 struct dentry *d_cpu;
4693 char cpu_dir[30]; /* 30 characters should be more than enough */ 5320 char cpu_dir[30]; /* 30 characters should be more than enough */
4694 5321
@@ -4704,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)
4704 5331
4705 /* per cpu trace_pipe */ 5332 /* per cpu trace_pipe */
4706 trace_create_file("trace_pipe", 0444, d_cpu, 5333 trace_create_file("trace_pipe", 0444, d_cpu,
4707 (void *) cpu, &tracing_pipe_fops); 5334 (void *)&data->trace_cpu, &tracing_pipe_fops);
4708 5335
4709 /* per cpu trace */ 5336 /* per cpu trace */
4710 trace_create_file("trace", 0644, d_cpu, 5337 trace_create_file("trace", 0644, d_cpu,
4711 (void *) cpu, &tracing_fops); 5338 (void *)&data->trace_cpu, &tracing_fops);
4712 5339
4713 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5340 trace_create_file("trace_pipe_raw", 0444, d_cpu,
4714 (void *) cpu, &tracing_buffers_fops); 5341 (void *)&data->trace_cpu, &tracing_buffers_fops);
4715 5342
4716 trace_create_file("stats", 0444, d_cpu, 5343 trace_create_file("stats", 0444, d_cpu,
4717 (void *) cpu, &tracing_stats_fops); 5344 (void *)&data->trace_cpu, &tracing_stats_fops);
4718 5345
4719 trace_create_file("buffer_size_kb", 0444, d_cpu, 5346 trace_create_file("buffer_size_kb", 0444, d_cpu,
4720 (void *) cpu, &tracing_entries_fops); 5347 (void *)&data->trace_cpu, &tracing_entries_fops);
5348
5349#ifdef CONFIG_TRACER_SNAPSHOT
5350 trace_create_file("snapshot", 0644, d_cpu,
5351 (void *)&data->trace_cpu, &snapshot_fops);
5352
5353 trace_create_file("snapshot_raw", 0444, d_cpu,
5354 (void *)&data->trace_cpu, &snapshot_raw_fops);
5355#endif
4721} 5356}
4722 5357
4723#ifdef CONFIG_FTRACE_SELFTEST 5358#ifdef CONFIG_FTRACE_SELFTEST
@@ -4728,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)
4728struct trace_option_dentry { 5363struct trace_option_dentry {
4729 struct tracer_opt *opt; 5364 struct tracer_opt *opt;
4730 struct tracer_flags *flags; 5365 struct tracer_flags *flags;
5366 struct trace_array *tr;
4731 struct dentry *entry; 5367 struct dentry *entry;
4732}; 5368};
4733 5369
@@ -4763,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4763 5399
4764 if (!!(topt->flags->val & topt->opt->bit) != val) { 5400 if (!!(topt->flags->val & topt->opt->bit) != val) {
4765 mutex_lock(&trace_types_lock); 5401 mutex_lock(&trace_types_lock);
4766 ret = __set_tracer_option(current_trace, topt->flags, 5402 ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
4767 topt->opt, !val); 5403 topt->opt, !val);
4768 mutex_unlock(&trace_types_lock); 5404 mutex_unlock(&trace_types_lock);
4769 if (ret) 5405 if (ret)
@@ -4802,6 +5438,7 @@ static ssize_t
4802trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, 5438trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4803 loff_t *ppos) 5439 loff_t *ppos)
4804{ 5440{
5441 struct trace_array *tr = &global_trace;
4805 long index = (long)filp->private_data; 5442 long index = (long)filp->private_data;
4806 unsigned long val; 5443 unsigned long val;
4807 int ret; 5444 int ret;
@@ -4814,7 +5451,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4814 return -EINVAL; 5451 return -EINVAL;
4815 5452
4816 mutex_lock(&trace_types_lock); 5453 mutex_lock(&trace_types_lock);
4817 ret = set_tracer_flag(1 << index, val); 5454 ret = set_tracer_flag(tr, 1 << index, val);
4818 mutex_unlock(&trace_types_lock); 5455 mutex_unlock(&trace_types_lock);
4819 5456
4820 if (ret < 0) 5457 if (ret < 0)
@@ -4848,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,
4848} 5485}
4849 5486
4850 5487
4851static struct dentry *trace_options_init_dentry(void) 5488static struct dentry *trace_options_init_dentry(struct trace_array *tr)
4852{ 5489{
4853 struct dentry *d_tracer; 5490 struct dentry *d_tracer;
4854 static struct dentry *t_options;
4855 5491
4856 if (t_options) 5492 if (tr->options)
4857 return t_options; 5493 return tr->options;
4858 5494
4859 d_tracer = tracing_init_dentry(); 5495 d_tracer = tracing_init_dentry_tr(tr);
4860 if (!d_tracer) 5496 if (!d_tracer)
4861 return NULL; 5497 return NULL;
4862 5498
4863 t_options = debugfs_create_dir("options", d_tracer); 5499 tr->options = debugfs_create_dir("options", d_tracer);
4864 if (!t_options) { 5500 if (!tr->options) {
4865 pr_warning("Could not create debugfs directory 'options'\n"); 5501 pr_warning("Could not create debugfs directory 'options'\n");
4866 return NULL; 5502 return NULL;
4867 } 5503 }
4868 5504
4869 return t_options; 5505 return tr->options;
4870} 5506}
4871 5507
4872static void 5508static void
4873create_trace_option_file(struct trace_option_dentry *topt, 5509create_trace_option_file(struct trace_array *tr,
5510 struct trace_option_dentry *topt,
4874 struct tracer_flags *flags, 5511 struct tracer_flags *flags,
4875 struct tracer_opt *opt) 5512 struct tracer_opt *opt)
4876{ 5513{
4877 struct dentry *t_options; 5514 struct dentry *t_options;
4878 5515
4879 t_options = trace_options_init_dentry(); 5516 t_options = trace_options_init_dentry(tr);
4880 if (!t_options) 5517 if (!t_options)
4881 return; 5518 return;
4882 5519
4883 topt->flags = flags; 5520 topt->flags = flags;
4884 topt->opt = opt; 5521 topt->opt = opt;
5522 topt->tr = tr;
4885 5523
4886 topt->entry = trace_create_file(opt->name, 0644, t_options, topt, 5524 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
4887 &trace_options_fops); 5525 &trace_options_fops);
@@ -4889,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
4889} 5527}
4890 5528
4891static struct trace_option_dentry * 5529static struct trace_option_dentry *
4892create_trace_option_files(struct tracer *tracer) 5530create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
4893{ 5531{
4894 struct trace_option_dentry *topts; 5532 struct trace_option_dentry *topts;
4895 struct tracer_flags *flags; 5533 struct tracer_flags *flags;
@@ -4914,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)
4914 return NULL; 5552 return NULL;
4915 5553
4916 for (cnt = 0; opts[cnt].name; cnt++) 5554 for (cnt = 0; opts[cnt].name; cnt++)
4917 create_trace_option_file(&topts[cnt], flags, 5555 create_trace_option_file(tr, &topts[cnt], flags,
4918 &opts[cnt]); 5556 &opts[cnt]);
4919 5557
4920 return topts; 5558 return topts;
@@ -4937,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
4937} 5575}
4938 5576
4939static struct dentry * 5577static struct dentry *
4940create_trace_option_core_file(const char *option, long index) 5578create_trace_option_core_file(struct trace_array *tr,
5579 const char *option, long index)
4941{ 5580{
4942 struct dentry *t_options; 5581 struct dentry *t_options;
4943 5582
4944 t_options = trace_options_init_dentry(); 5583 t_options = trace_options_init_dentry(tr);
4945 if (!t_options) 5584 if (!t_options)
4946 return NULL; 5585 return NULL;
4947 5586
@@ -4949,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)
4949 &trace_options_core_fops); 5588 &trace_options_core_fops);
4950} 5589}
4951 5590
4952static __init void create_trace_options_dir(void) 5591static __init void create_trace_options_dir(struct trace_array *tr)
4953{ 5592{
4954 struct dentry *t_options; 5593 struct dentry *t_options;
4955 int i; 5594 int i;
4956 5595
4957 t_options = trace_options_init_dentry(); 5596 t_options = trace_options_init_dentry(tr);
4958 if (!t_options) 5597 if (!t_options)
4959 return; 5598 return;
4960 5599
4961 for (i = 0; trace_options[i]; i++) 5600 for (i = 0; trace_options[i]; i++)
4962 create_trace_option_core_file(trace_options[i], i); 5601 create_trace_option_core_file(tr, trace_options[i], i);
4963} 5602}
4964 5603
4965static ssize_t 5604static ssize_t
@@ -4967,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,
4967 size_t cnt, loff_t *ppos) 5606 size_t cnt, loff_t *ppos)
4968{ 5607{
4969 struct trace_array *tr = filp->private_data; 5608 struct trace_array *tr = filp->private_data;
4970 struct ring_buffer *buffer = tr->buffer; 5609 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4971 char buf[64]; 5610 char buf[64];
4972 int r; 5611 int r;
4973 5612
@@ -4986,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4986 size_t cnt, loff_t *ppos) 5625 size_t cnt, loff_t *ppos)
4987{ 5626{
4988 struct trace_array *tr = filp->private_data; 5627 struct trace_array *tr = filp->private_data;
4989 struct ring_buffer *buffer = tr->buffer; 5628 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4990 unsigned long val; 5629 unsigned long val;
4991 int ret; 5630 int ret;
4992 5631
@@ -4998,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4998 mutex_lock(&trace_types_lock); 5637 mutex_lock(&trace_types_lock);
4999 if (val) { 5638 if (val) {
5000 ring_buffer_record_on(buffer); 5639 ring_buffer_record_on(buffer);
5001 if (current_trace->start) 5640 if (tr->current_trace->start)
5002 current_trace->start(tr); 5641 tr->current_trace->start(tr);
5003 } else { 5642 } else {
5004 ring_buffer_record_off(buffer); 5643 ring_buffer_record_off(buffer);
5005 if (current_trace->stop) 5644 if (tr->current_trace->stop)
5006 current_trace->stop(tr); 5645 tr->current_trace->stop(tr);
5007 } 5646 }
5008 mutex_unlock(&trace_types_lock); 5647 mutex_unlock(&trace_types_lock);
5009 } 5648 }
@@ -5020,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {
5020 .llseek = default_llseek, 5659 .llseek = default_llseek,
5021}; 5660};
5022 5661
5662struct dentry *trace_instance_dir;
5663
5664static void
5665init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5666
5667static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5668{
5669 int cpu;
5670
5671 for_each_tracing_cpu(cpu) {
5672 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5673 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5674 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5675 }
5676}
5677
5678static int
5679allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5680{
5681 enum ring_buffer_flags rb_flags;
5682
5683 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5684
5685 buf->buffer = ring_buffer_alloc(size, rb_flags);
5686 if (!buf->buffer)
5687 return -ENOMEM;
5688
5689 buf->data = alloc_percpu(struct trace_array_cpu);
5690 if (!buf->data) {
5691 ring_buffer_free(buf->buffer);
5692 return -ENOMEM;
5693 }
5694
5695 init_trace_buffers(tr, buf);
5696
5697 /* Allocate the first page for all buffers */
5698 set_buffer_entries(&tr->trace_buffer,
5699 ring_buffer_size(tr->trace_buffer.buffer, 0));
5700
5701 return 0;
5702}
5703
5704static int allocate_trace_buffers(struct trace_array *tr, int size)
5705{
5706 int ret;
5707
5708 ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
5709 if (ret)
5710 return ret;
5711
5712#ifdef CONFIG_TRACER_MAX_TRACE
5713 ret = allocate_trace_buffer(tr, &tr->max_buffer,
5714 allocate_snapshot ? size : 1);
5715 if (WARN_ON(ret)) {
5716 ring_buffer_free(tr->trace_buffer.buffer);
5717 free_percpu(tr->trace_buffer.data);
5718 return -ENOMEM;
5719 }
5720 tr->allocated_snapshot = allocate_snapshot;
5721
5722 /*
5723 * Only the top level trace array gets its snapshot allocated
5724 * from the kernel command line.
5725 */
5726 allocate_snapshot = false;
5727#endif
5728 return 0;
5729}
5730
5731static int new_instance_create(const char *name)
5732{
5733 struct trace_array *tr;
5734 int ret;
5735
5736 mutex_lock(&trace_types_lock);
5737
5738 ret = -EEXIST;
5739 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5740 if (tr->name && strcmp(tr->name, name) == 0)
5741 goto out_unlock;
5742 }
5743
5744 ret = -ENOMEM;
5745 tr = kzalloc(sizeof(*tr), GFP_KERNEL);
5746 if (!tr)
5747 goto out_unlock;
5748
5749 tr->name = kstrdup(name, GFP_KERNEL);
5750 if (!tr->name)
5751 goto out_free_tr;
5752
5753 raw_spin_lock_init(&tr->start_lock);
5754
5755 tr->current_trace = &nop_trace;
5756
5757 INIT_LIST_HEAD(&tr->systems);
5758 INIT_LIST_HEAD(&tr->events);
5759
5760 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5761 goto out_free_tr;
5762
5763 /* Holder for file callbacks */
5764 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5765 tr->trace_cpu.tr = tr;
5766
5767 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5768 if (!tr->dir)
5769 goto out_free_tr;
5770
5771 ret = event_trace_add_tracer(tr->dir, tr);
5772 if (ret)
5773 goto out_free_tr;
5774
5775 init_tracer_debugfs(tr, tr->dir);
5776
5777 list_add(&tr->list, &ftrace_trace_arrays);
5778
5779 mutex_unlock(&trace_types_lock);
5780
5781 return 0;
5782
5783 out_free_tr:
5784 if (tr->trace_buffer.buffer)
5785 ring_buffer_free(tr->trace_buffer.buffer);
5786 kfree(tr->name);
5787 kfree(tr);
5788
5789 out_unlock:
5790 mutex_unlock(&trace_types_lock);
5791
5792 return ret;
5793
5794}
5795
5796static int instance_delete(const char *name)
5797{
5798 struct trace_array *tr;
5799 int found = 0;
5800 int ret;
5801
5802 mutex_lock(&trace_types_lock);
5803
5804 ret = -ENODEV;
5805 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5806 if (tr->name && strcmp(tr->name, name) == 0) {
5807 found = 1;
5808 break;
5809 }
5810 }
5811 if (!found)
5812 goto out_unlock;
5813
5814 ret = -EBUSY;
5815 if (tr->ref)
5816 goto out_unlock;
5817
5818 list_del(&tr->list);
5819
5820 event_trace_del_tracer(tr);
5821 debugfs_remove_recursive(tr->dir);
5822 free_percpu(tr->trace_buffer.data);
5823 ring_buffer_free(tr->trace_buffer.buffer);
5824
5825 kfree(tr->name);
5826 kfree(tr);
5827
5828 ret = 0;
5829
5830 out_unlock:
5831 mutex_unlock(&trace_types_lock);
5832
5833 return ret;
5834}
5835
5836static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
5837{
5838 struct dentry *parent;
5839 int ret;
5840
5841 /* Paranoid: Make sure the parent is the "instances" directory */
5842 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5843 if (WARN_ON_ONCE(parent != trace_instance_dir))
5844 return -ENOENT;
5845
5846 /*
5847 * The inode mutex is locked, but debugfs_create_dir() will also
5848 * take the mutex. As the instances directory cannot be destroyed
5849 * or changed in any other way, it is safe to unlock it, and
5850 * let the dentry try. If two users try to make the same dir at
5851 * the same time, then new_instance_create() will determine the
5852 * winner.
5853 */
5854 mutex_unlock(&inode->i_mutex);
5855
5856 ret = new_instance_create(dentry->d_iname);
5857
5858 mutex_lock(&inode->i_mutex);
5859
5860 return ret;
5861}
5862
5863static int instance_rmdir(struct inode *inode, struct dentry *dentry)
5864{
5865 struct dentry *parent;
5866 int ret;
5867
5868 /* Paranoid: Make sure the parent is the "instances" directory */
5869 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5870 if (WARN_ON_ONCE(parent != trace_instance_dir))
5871 return -ENOENT;
5872
5873 /* The caller did a dget() on dentry */
5874 mutex_unlock(&dentry->d_inode->i_mutex);
5875
5876 /*
5877 * The inode mutex is locked, but debugfs_remove_recursive() will also
5878 * take the mutex. As the instances directory cannot be destroyed
5879 * or changed in any other way, it is safe to unlock it, and
5880 * let the dentry try. If two users try to remove the same instance at
5881 * the same time, then instance_delete() will determine the
5882 * winner.
5883 */
5884 mutex_unlock(&inode->i_mutex);
5885
5886 ret = instance_delete(dentry->d_iname);
5887
5888 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
5889 mutex_lock(&dentry->d_inode->i_mutex);
5890
5891 return ret;
5892}
5893
5894static const struct inode_operations instance_dir_inode_operations = {
5895 .lookup = simple_lookup,
5896 .mkdir = instance_mkdir,
5897 .rmdir = instance_rmdir,
5898};
5899
5900static __init void create_trace_instances(struct dentry *d_tracer)
5901{
5902 trace_instance_dir = debugfs_create_dir("instances", d_tracer);
5903 if (WARN_ON(!trace_instance_dir))
5904 return;
5905
5906 /* Hijack the dir inode operations, to allow mkdir */
5907 trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
5908}
5909
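Because the instances directory's inode operations accept mkdir and rmdir directly, creating or destroying a trace instance takes nothing more than the ordinary syscalls; the new directory gets its own buffers, events, and the per-instance files set up by init_tracer_debugfs() below. A small illustrative sketch (the debugfs mount point is an assumption):

  /* Illustrative: create and remove a tracing instance from user space. */
  #include <stdio.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(void)
  {
          const char *inst = "/sys/kernel/debug/tracing/instances/demo";

          if (mkdir(inst, 0755) != 0) {   /* ends up in new_instance_create() */
                  perror("mkdir");
                  return 1;
          }

          /* ... use demo/trace, demo/trace_pipe, demo/tracing_on ... */

          if (rmdir(inst) != 0)           /* EBUSY while tr->ref is held */
                  perror("rmdir");
          return 0;
  }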
5910static void
5911init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5912{
5913 int cpu;
5914
5915 trace_create_file("trace_options", 0644, d_tracer,
5916 tr, &tracing_iter_fops);
5917
5918 trace_create_file("trace", 0644, d_tracer,
5919 (void *)&tr->trace_cpu, &tracing_fops);
5920
5921 trace_create_file("trace_pipe", 0444, d_tracer,
5922 (void *)&tr->trace_cpu, &tracing_pipe_fops);
5923
5924 trace_create_file("buffer_size_kb", 0644, d_tracer,
5925 (void *)&tr->trace_cpu, &tracing_entries_fops);
5926
5927 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5928 tr, &tracing_total_entries_fops);
5929
5930 trace_create_file("free_buffer", 0644, d_tracer,
5931 tr, &tracing_free_buffer_fops);
5932
5933 trace_create_file("trace_marker", 0220, d_tracer,
5934 tr, &tracing_mark_fops);
5935
5936 trace_create_file("trace_clock", 0644, d_tracer, tr,
5937 &trace_clock_fops);
5938
5939 trace_create_file("tracing_on", 0644, d_tracer,
5940 tr, &rb_simple_fops);
5941
5942#ifdef CONFIG_TRACER_SNAPSHOT
5943 trace_create_file("snapshot", 0644, d_tracer,
5944 (void *)&tr->trace_cpu, &snapshot_fops);
5945#endif
5946
5947 for_each_tracing_cpu(cpu)
5948 tracing_init_debugfs_percpu(tr, cpu);
5949
5950}
5951
5023static __init int tracer_init_debugfs(void) 5952static __init int tracer_init_debugfs(void)
5024{ 5953{
5025 struct dentry *d_tracer; 5954 struct dentry *d_tracer;
5026 int cpu;
5027 5955
5028 trace_access_lock_init(); 5956 trace_access_lock_init();
5029 5957
5030 d_tracer = tracing_init_dentry(); 5958 d_tracer = tracing_init_dentry();
5959 if (!d_tracer)
5960 return 0;
5031 5961
5032 trace_create_file("trace_options", 0644, d_tracer, 5962 init_tracer_debugfs(&global_trace, d_tracer);
5033 NULL, &tracing_iter_fops);
5034 5963
5035 trace_create_file("tracing_cpumask", 0644, d_tracer, 5964 trace_create_file("tracing_cpumask", 0644, d_tracer,
5036 NULL, &tracing_cpumask_fops); 5965 &global_trace, &tracing_cpumask_fops);
5037
5038 trace_create_file("trace", 0644, d_tracer,
5039 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
5040 5966
5041 trace_create_file("available_tracers", 0444, d_tracer, 5967 trace_create_file("available_tracers", 0444, d_tracer,
5042 &global_trace, &show_traces_fops); 5968 &global_trace, &show_traces_fops);
@@ -5055,44 +5981,17 @@ static __init int tracer_init_debugfs(void)
5055 trace_create_file("README", 0444, d_tracer, 5981 trace_create_file("README", 0444, d_tracer,
5056 NULL, &tracing_readme_fops); 5982 NULL, &tracing_readme_fops);
5057 5983
5058 trace_create_file("trace_pipe", 0444, d_tracer,
5059 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
5060
5061 trace_create_file("buffer_size_kb", 0644, d_tracer,
5062 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
5063
5064 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5065 &global_trace, &tracing_total_entries_fops);
5066
5067 trace_create_file("free_buffer", 0644, d_tracer,
5068 &global_trace, &tracing_free_buffer_fops);
5069
5070 trace_create_file("trace_marker", 0220, d_tracer,
5071 NULL, &tracing_mark_fops);
5072
5073 trace_create_file("saved_cmdlines", 0444, d_tracer, 5984 trace_create_file("saved_cmdlines", 0444, d_tracer,
5074 NULL, &tracing_saved_cmdlines_fops); 5985 NULL, &tracing_saved_cmdlines_fops);
5075 5986
5076 trace_create_file("trace_clock", 0644, d_tracer, NULL,
5077 &trace_clock_fops);
5078
5079 trace_create_file("tracing_on", 0644, d_tracer,
5080 &global_trace, &rb_simple_fops);
5081
5082#ifdef CONFIG_DYNAMIC_FTRACE 5987#ifdef CONFIG_DYNAMIC_FTRACE
5083 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 5988 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
5084 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5989 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
5085#endif 5990#endif
5086 5991
5087#ifdef CONFIG_TRACER_SNAPSHOT 5992 create_trace_instances(d_tracer);
5088 trace_create_file("snapshot", 0644, d_tracer,
5089 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5090#endif
5091 5993
5092 create_trace_options_dir(); 5994 create_trace_options_dir(&global_trace);
5093
5094 for_each_tracing_cpu(cpu)
5095 tracing_init_debugfs_percpu(cpu);
5096 5995
5097 return 0; 5996 return 0;
5098} 5997}
@@ -5148,8 +6047,8 @@ void
5148trace_printk_seq(struct trace_seq *s) 6047trace_printk_seq(struct trace_seq *s)
5149{ 6048{
5150 /* Probably should print a warning here. */ 6049 /* Probably should print a warning here. */
5151 if (s->len >= 1000) 6050 if (s->len >= TRACE_MAX_PRINT)
5152 s->len = 1000; 6051 s->len = TRACE_MAX_PRINT;
5153 6052
5154 /* should be zero ended, but we are paranoid. */ 6053 /* should be zero ended, but we are paranoid. */
5155 s->buffer[s->len] = 0; 6054 s->buffer[s->len] = 0;
@@ -5162,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)
5162void trace_init_global_iter(struct trace_iterator *iter) 6061void trace_init_global_iter(struct trace_iterator *iter)
5163{ 6062{
5164 iter->tr = &global_trace; 6063 iter->tr = &global_trace;
5165 iter->trace = current_trace; 6064 iter->trace = iter->tr->current_trace;
5166 iter->cpu_file = TRACE_PIPE_ALL_CPU; 6065 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6066 iter->trace_buffer = &global_trace.trace_buffer;
5167} 6067}
5168 6068
5169static void 6069void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5170__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5171{ 6070{
5172 static arch_spinlock_t ftrace_dump_lock =
5173 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
5174 /* use static because iter can be a bit big for the stack */ 6071 /* use static because iter can be a bit big for the stack */
5175 static struct trace_iterator iter; 6072 static struct trace_iterator iter;
6073 static atomic_t dump_running;
5176 unsigned int old_userobj; 6074 unsigned int old_userobj;
5177 static int dump_ran;
5178 unsigned long flags; 6075 unsigned long flags;
5179 int cnt = 0, cpu; 6076 int cnt = 0, cpu;
5180 6077
5181 /* only one dump */ 6078 /* Only allow one dump user at a time. */
5182 local_irq_save(flags); 6079 if (atomic_inc_return(&dump_running) != 1) {
5183 arch_spin_lock(&ftrace_dump_lock); 6080 atomic_dec(&dump_running);
5184 if (dump_ran) 6081 return;
5185 goto out; 6082 }
5186
5187 dump_ran = 1;
5188 6083
6084 /*
6085 * Always turn off tracing when we dump.
6086 * We don't need to show trace output of what happens
6087 * between multiple crashes.
6088 *
6089 * If the user does a sysrq-z, then they can re-enable
6090 * tracing with echo 1 > tracing_on.
6091 */
5189 tracing_off(); 6092 tracing_off();
5190 6093
5191 /* Did function tracer already get disabled? */ 6094 local_irq_save(flags);
5192 if (ftrace_is_dead()) {
5193 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
5194 printk("# MAY BE MISSING FUNCTION EVENTS\n");
5195 }
5196
5197 if (disable_tracing)
5198 ftrace_kill();
5199 6095
5200 /* Simulate the iterator */ 6096 /* Simulate the iterator */
5201 trace_init_global_iter(&iter); 6097 trace_init_global_iter(&iter);
5202 6098
5203 for_each_tracing_cpu(cpu) { 6099 for_each_tracing_cpu(cpu) {
5204 atomic_inc(&iter.tr->data[cpu]->disabled); 6100 atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
5205 } 6101 }
5206 6102
5207 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 6103 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -5211,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5211 6107
5212 switch (oops_dump_mode) { 6108 switch (oops_dump_mode) {
5213 case DUMP_ALL: 6109 case DUMP_ALL:
5214 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6110 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5215 break; 6111 break;
5216 case DUMP_ORIG: 6112 case DUMP_ORIG:
5217 iter.cpu_file = raw_smp_processor_id(); 6113 iter.cpu_file = raw_smp_processor_id();
@@ -5220,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5220 goto out_enable; 6116 goto out_enable;
5221 default: 6117 default:
5222 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); 6118 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
5223 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6119 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5224 } 6120 }
5225 6121
5226 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 6122 printk(KERN_TRACE "Dumping ftrace buffer:\n");
5227 6123
6124 /* Did function tracer already get disabled? */
6125 if (ftrace_is_dead()) {
6126 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
6127 printk("# MAY BE MISSING FUNCTION EVENTS\n");
6128 }
6129
5228 /* 6130 /*
5229 * We need to stop all tracing on all CPUS to read the 6131 * We need to stop all tracing on all CPUS to read the
5230 * the next buffer. This is a bit expensive, but is 6132 * the next buffer. This is a bit expensive, but is
@@ -5264,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5264 printk(KERN_TRACE "---------------------------------\n"); 6166 printk(KERN_TRACE "---------------------------------\n");
5265 6167
5266 out_enable: 6168 out_enable:
5267 /* Re-enable tracing if requested */ 6169 trace_flags |= old_userobj;
5268 if (!disable_tracing) {
5269 trace_flags |= old_userobj;
5270 6170
5271 for_each_tracing_cpu(cpu) { 6171 for_each_tracing_cpu(cpu) {
5272 atomic_dec(&iter.tr->data[cpu]->disabled); 6172 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
5273 }
5274 tracing_on();
5275 } 6173 }
5276 6174 atomic_dec(&dump_running);
5277 out:
5278 arch_spin_unlock(&ftrace_dump_lock);
5279 local_irq_restore(flags); 6175 local_irq_restore(flags);
5280} 6176}
5281
5282/* By default: disable tracing after the dump */
5283void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5284{
5285 __ftrace_dump(true, oops_dump_mode);
5286}
5287EXPORT_SYMBOL_GPL(ftrace_dump); 6177EXPORT_SYMBOL_GPL(ftrace_dump);
5288 6178
5289__init static int tracer_alloc_buffers(void) 6179__init static int tracer_alloc_buffers(void)
5290{ 6180{
5291 int ring_buf_size; 6181 int ring_buf_size;
5292 enum ring_buffer_flags rb_flags;
5293 int i;
5294 int ret = -ENOMEM; 6182 int ret = -ENOMEM;
5295 6183
5296 6184
@@ -5311,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)
5311 else 6199 else
5312 ring_buf_size = 1; 6200 ring_buf_size = 1;
5313 6201
5314 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5315
5316 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6202 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
5317 cpumask_copy(tracing_cpumask, cpu_all_mask); 6203 cpumask_copy(tracing_cpumask, cpu_all_mask);
5318 6204
6205 raw_spin_lock_init(&global_trace.start_lock);
6206
5319 /* TODO: make the number of buffers hot pluggable with CPUS */ 6207 /* TODO: make the number of buffers hot pluggable with CPUS */
5320 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); 6208 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
5321 if (!global_trace.buffer) {
5322 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6209 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
5323 WARN_ON(1); 6210 WARN_ON(1);
5324 goto out_free_cpumask; 6211 goto out_free_cpumask;
5325 } 6212 }
6213
5326 if (global_trace.buffer_disabled) 6214 if (global_trace.buffer_disabled)
5327 tracing_off(); 6215 tracing_off();
5328 6216
5329
5330#ifdef CONFIG_TRACER_MAX_TRACE
5331 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
5332 if (!max_tr.buffer) {
5333 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
5334 WARN_ON(1);
5335 ring_buffer_free(global_trace.buffer);
5336 goto out_free_cpumask;
5337 }
5338#endif
5339
5340 /* Allocate the first page for all buffers */
5341 for_each_tracing_cpu(i) {
5342 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
5343 max_tr.data[i] = &per_cpu(max_tr_data, i);
5344 }
5345
5346 set_buffer_entries(&global_trace,
5347 ring_buffer_size(global_trace.buffer, 0));
5348#ifdef CONFIG_TRACER_MAX_TRACE
5349 set_buffer_entries(&max_tr, 1);
5350#endif
5351
5352 trace_init_cmdlines(); 6217 trace_init_cmdlines();
5353 init_irq_work(&trace_work_wakeup, trace_wake_up);
5354 6218
5355 register_tracer(&nop_trace); 6219 register_tracer(&nop_trace);
5356 6220
6221 global_trace.current_trace = &nop_trace;
6222
5357 /* All seems OK, enable tracing */ 6223 /* All seems OK, enable tracing */
5358 tracing_disabled = 0; 6224 tracing_disabled = 0;
5359 6225
@@ -5362,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)
5362 6228
5363 register_die_notifier(&trace_die_notifier); 6229 register_die_notifier(&trace_die_notifier);
5364 6230
6231 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6232
6233 /* Holder for file callbacks */
6234 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6235 global_trace.trace_cpu.tr = &global_trace;
6236
6237 INIT_LIST_HEAD(&global_trace.systems);
6238 INIT_LIST_HEAD(&global_trace.events);
6239 list_add(&global_trace.list, &ftrace_trace_arrays);
6240
5365 while (trace_boot_options) { 6241 while (trace_boot_options) {
5366 char *option; 6242 char *option;
5367 6243
5368 option = strsep(&trace_boot_options, ","); 6244 option = strsep(&trace_boot_options, ",");
5369 trace_set_options(option); 6245 trace_set_options(&global_trace, option);
5370 } 6246 }
5371 6247
6248 register_snapshot_cmd();
6249
5372 return 0; 6250 return 0;
5373 6251
5374out_free_cpumask: 6252out_free_cpumask:
6253 free_percpu(global_trace.trace_buffer.data);
6254#ifdef CONFIG_TRACER_MAX_TRACE
6255 free_percpu(global_trace.max_buffer.data);
6256#endif
5375 free_cpumask_var(tracing_cpumask); 6257 free_cpumask_var(tracing_cpumask);
5376out_free_buffer_mask: 6258out_free_buffer_mask:
5377 free_cpumask_var(tracing_buffer_mask); 6259 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2081971367ea..9e014582e763 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,11 @@
13#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
14#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
15 15
16#ifdef CONFIG_FTRACE_SYSCALLS
17#include <asm/unistd.h> /* For NR_SYSCALLS */
18#include <asm/syscall.h> /* some archs define it here */
19#endif
20
16enum trace_type { 21enum trace_type {
17 __TRACE_FIRST_TYPE = 0, 22 __TRACE_FIRST_TYPE = 0,
18 23
@@ -29,6 +34,7 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 35 TRACE_USER_STACK,
31 TRACE_BLK, 36 TRACE_BLK,
37 TRACE_BPUTS,
32 38
33 __TRACE_LAST_TYPE, 39 __TRACE_LAST_TYPE,
34}; 40};
@@ -127,12 +133,21 @@ enum trace_flag_type {
127 133
128#define TRACE_BUF_SIZE 1024 134#define TRACE_BUF_SIZE 1024
129 135
136struct trace_array;
137
138struct trace_cpu {
139 struct trace_array *tr;
140 struct dentry *dir;
141 int cpu;
142};
143
130/* 144/*
131 * The CPU trace array - it consists of thousands of trace entries 145 * The CPU trace array - it consists of thousands of trace entries
132 * plus some other descriptor data: (for example which task started 146 * plus some other descriptor data: (for example which task started
133 * the trace, etc.) 147 * the trace, etc.)
134 */ 148 */
135struct trace_array_cpu { 149struct trace_array_cpu {
150 struct trace_cpu trace_cpu;
136 atomic_t disabled; 151 atomic_t disabled;
137 void *buffer_page; /* ring buffer spare */ 152 void *buffer_page; /* ring buffer spare */
138 153
@@ -151,20 +166,83 @@ struct trace_array_cpu {
151 char comm[TASK_COMM_LEN]; 166 char comm[TASK_COMM_LEN];
152}; 167};
153 168
169struct tracer;
170
171struct trace_buffer {
172 struct trace_array *tr;
173 struct ring_buffer *buffer;
174 struct trace_array_cpu __percpu *data;
175 cycle_t time_start;
176 int cpu;
177};
178
154/* 179/*
155 * The trace array - an array of per-CPU trace arrays. This is the 180 * The trace array - an array of per-CPU trace arrays. This is the
156 * highest level data structure that individual tracers deal with. 181 * highest level data structure that individual tracers deal with.
157 * They have on/off state as well: 182 * They have on/off state as well:
158 */ 183 */
159struct trace_array { 184struct trace_array {
160 struct ring_buffer *buffer; 185 struct list_head list;
161 int cpu; 186 char *name;
187 struct trace_buffer trace_buffer;
188#ifdef CONFIG_TRACER_MAX_TRACE
189 /*
190 * The max_buffer is used to snapshot the trace when a maximum
191 * latency is reached, or when the user initiates a snapshot.
192 * Some tracers will use this to store a maximum trace while
193 * it continues examining live traces.
194 *
195 * The buffers for the max_buffer are set up the same as the trace_buffer.
196 * When a snapshot is taken, the buffer of the max_buffer is swapped
197 * with the buffer of the trace_buffer and the buffers are reset for
198 * the trace_buffer so the tracing can continue.
199 */
200 struct trace_buffer max_buffer;
201 bool allocated_snapshot;
202#endif
162 int buffer_disabled; 203 int buffer_disabled;
163 cycle_t time_start; 204 struct trace_cpu trace_cpu; /* place holder */
205#ifdef CONFIG_FTRACE_SYSCALLS
206 int sys_refcount_enter;
207 int sys_refcount_exit;
208 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
209 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
210#endif
211 int stop_count;
212 int clock_id;
213 struct tracer *current_trace;
214 unsigned int flags;
215 raw_spinlock_t start_lock;
216 struct dentry *dir;
217 struct dentry *options;
218 struct dentry *percpu_dir;
219 struct dentry *event_dir;
220 struct list_head systems;
221 struct list_head events;
164 struct task_struct *waiter; 222 struct task_struct *waiter;
165 struct trace_array_cpu *data[NR_CPUS]; 223 int ref;
166}; 224};
167 225
226enum {
227 TRACE_ARRAY_FL_GLOBAL = (1 << 0)
228};
229
230extern struct list_head ftrace_trace_arrays;
231
232/*
233 * The global tracer (top) should be the first trace array added,
234 * but we check the flag anyway.
235 */
236static inline struct trace_array *top_trace_array(void)
237{
238 struct trace_array *tr;
239
240 tr = list_entry(ftrace_trace_arrays.prev,
241 typeof(*tr), list);
242 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
243 return tr;
244}
245
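With the list and the GLOBAL flag in place, code inside kernel/trace can walk every registered array instead of assuming global_trace, holding trace_types_lock the same way new_instance_create() does. A minimal sketch, assuming it lives in a file where the lock and list are visible:

  /* Illustrative: visit the top-level array and every instance. */
  static void visit_trace_arrays(void)
  {
          struct trace_array *tr;

          mutex_lock(&trace_types_lock);
          list_for_each_entry(tr, &ftrace_trace_arrays, list)
                  pr_info("trace array: %s\n",
                          tr->name ? tr->name : "(top level)");
          mutex_unlock(&trace_types_lock);
  }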
168#define FTRACE_CMP_TYPE(var, type) \ 246#define FTRACE_CMP_TYPE(var, type) \
169 __builtin_types_compatible_p(typeof(var), type *) 247 __builtin_types_compatible_p(typeof(var), type *)
170 248
@@ -200,6 +278,7 @@ extern void __ftrace_bad_type(void);
200 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 278 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
201 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 279 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
202 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 280 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
281 IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
203 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 282 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
204 TRACE_MMIO_RW); \ 283 TRACE_MMIO_RW); \
205 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 284 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -289,9 +368,10 @@ struct tracer {
289 struct tracer *next; 368 struct tracer *next;
290 struct tracer_flags *flags; 369 struct tracer_flags *flags;
291 bool print_max; 370 bool print_max;
292 bool use_max_tr;
293 bool allocated_snapshot;
294 bool enabled; 371 bool enabled;
372#ifdef CONFIG_TRACER_MAX_TRACE
373 bool use_max_tr;
374#endif
295}; 375};
296 376
297 377
@@ -427,8 +507,6 @@ static __always_inline void trace_clear_recursion(int bit)
427 current->trace_recursion = val; 507 current->trace_recursion = val;
428} 508}
429 509
430#define TRACE_PIPE_ALL_CPU -1
431
432static inline struct ring_buffer_iter * 510static inline struct ring_buffer_iter *
433trace_buffer_iter(struct trace_iterator *iter, int cpu) 511trace_buffer_iter(struct trace_iterator *iter, int cpu)
434{ 512{
@@ -439,10 +517,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
439 517
440int tracer_init(struct tracer *t, struct trace_array *tr); 518int tracer_init(struct tracer *t, struct trace_array *tr);
441int tracing_is_enabled(void); 519int tracing_is_enabled(void);
442void tracing_reset(struct trace_array *tr, int cpu); 520void tracing_reset(struct trace_buffer *buf, int cpu);
443void tracing_reset_online_cpus(struct trace_array *tr); 521void tracing_reset_online_cpus(struct trace_buffer *buf);
444void tracing_reset_current(int cpu); 522void tracing_reset_current(int cpu);
445void tracing_reset_current_online_cpus(void); 523void tracing_reset_all_online_cpus(void);
446int tracing_open_generic(struct inode *inode, struct file *filp); 524int tracing_open_generic(struct inode *inode, struct file *filp);
447struct dentry *trace_create_file(const char *name, 525struct dentry *trace_create_file(const char *name,
448 umode_t mode, 526 umode_t mode,
@@ -450,6 +528,7 @@ struct dentry *trace_create_file(const char *name,
450 void *data, 528 void *data,
451 const struct file_operations *fops); 529 const struct file_operations *fops);
452 530
531struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
453struct dentry *tracing_init_dentry(void); 532struct dentry *tracing_init_dentry(void);
454 533
455struct ring_buffer_event; 534struct ring_buffer_event;
@@ -583,7 +662,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
583#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 662#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
584extern int DYN_FTRACE_TEST_NAME2(void); 663extern int DYN_FTRACE_TEST_NAME2(void);
585 664
586extern int ring_buffer_expanded; 665extern bool ring_buffer_expanded;
587extern bool tracing_selftest_disabled; 666extern bool tracing_selftest_disabled;
588DECLARE_PER_CPU(int, ftrace_cpu_disabled); 667DECLARE_PER_CPU(int, ftrace_cpu_disabled);
589 668
@@ -619,6 +698,8 @@ trace_array_vprintk(struct trace_array *tr,
619 unsigned long ip, const char *fmt, va_list args); 698 unsigned long ip, const char *fmt, va_list args);
620int trace_array_printk(struct trace_array *tr, 699int trace_array_printk(struct trace_array *tr,
621 unsigned long ip, const char *fmt, ...); 700 unsigned long ip, const char *fmt, ...);
701int trace_array_printk_buf(struct ring_buffer *buffer,
702 unsigned long ip, const char *fmt, ...);
622void trace_printk_seq(struct trace_seq *s); 703void trace_printk_seq(struct trace_seq *s);
623enum print_line_t print_trace_line(struct trace_iterator *iter); 704enum print_line_t print_trace_line(struct trace_iterator *iter);
624 705
@@ -786,6 +867,7 @@ enum trace_iterator_flags {
786 TRACE_ITER_STOP_ON_FREE = 0x400000, 867 TRACE_ITER_STOP_ON_FREE = 0x400000,
787 TRACE_ITER_IRQ_INFO = 0x800000, 868 TRACE_ITER_IRQ_INFO = 0x800000,
788 TRACE_ITER_MARKERS = 0x1000000, 869 TRACE_ITER_MARKERS = 0x1000000,
870 TRACE_ITER_FUNCTION = 0x2000000,
789}; 871};
790 872
791/* 873/*
@@ -832,8 +914,8 @@ enum {
832 914
833struct ftrace_event_field { 915struct ftrace_event_field {
834 struct list_head link; 916 struct list_head link;
835 char *name; 917 const char *name;
836 char *type; 918 const char *type;
837 int filter_type; 919 int filter_type;
838 int offset; 920 int offset;
839 int size; 921 int size;
@@ -851,12 +933,19 @@ struct event_filter {
851struct event_subsystem { 933struct event_subsystem {
852 struct list_head list; 934 struct list_head list;
853 const char *name; 935 const char *name;
854 struct dentry *entry;
855 struct event_filter *filter; 936 struct event_filter *filter;
856 int nr_events;
857 int ref_count; 937 int ref_count;
858}; 938};
859 939
940struct ftrace_subsystem_dir {
941 struct list_head list;
942 struct event_subsystem *subsystem;
943 struct trace_array *tr;
944 struct dentry *entry;
945 int ref_count;
946 int nr_events;
947};
948
860#define FILTER_PRED_INVALID ((unsigned short)-1) 949#define FILTER_PRED_INVALID ((unsigned short)-1)
861#define FILTER_PRED_IS_RIGHT (1 << 15) 950#define FILTER_PRED_IS_RIGHT (1 << 15)
862#define FILTER_PRED_FOLD (1 << 15) 951#define FILTER_PRED_FOLD (1 << 15)
@@ -906,22 +995,20 @@ struct filter_pred {
906 unsigned short right; 995 unsigned short right;
907}; 996};
908 997
909extern struct list_head ftrace_common_fields;
910
911extern enum regex_type 998extern enum regex_type
912filter_parse_regex(char *buff, int len, char **search, int *not); 999filter_parse_regex(char *buff, int len, char **search, int *not);
913extern void print_event_filter(struct ftrace_event_call *call, 1000extern void print_event_filter(struct ftrace_event_call *call,
914 struct trace_seq *s); 1001 struct trace_seq *s);
915extern int apply_event_filter(struct ftrace_event_call *call, 1002extern int apply_event_filter(struct ftrace_event_call *call,
916 char *filter_string); 1003 char *filter_string);
917extern int apply_subsystem_event_filter(struct event_subsystem *system, 1004extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
918 char *filter_string); 1005 char *filter_string);
919extern void print_subsystem_event_filter(struct event_subsystem *system, 1006extern void print_subsystem_event_filter(struct event_subsystem *system,
920 struct trace_seq *s); 1007 struct trace_seq *s);
921extern int filter_assign_type(const char *type); 1008extern int filter_assign_type(const char *type);
922 1009
923struct list_head * 1010struct ftrace_event_field *
924trace_get_fields(struct ftrace_event_call *event_call); 1011trace_find_event_field(struct ftrace_event_call *call, char *name);
925 1012
926static inline int 1013static inline int
927filter_check_discard(struct ftrace_event_call *call, void *rec, 1014filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -938,6 +1025,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
938} 1025}
939 1026
940extern void trace_event_enable_cmd_record(bool enable); 1027extern void trace_event_enable_cmd_record(bool enable);
1028extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1029extern int event_trace_del_tracer(struct trace_array *tr);
941 1030
942extern struct mutex event_mutex; 1031extern struct mutex event_mutex;
943extern struct list_head ftrace_events; 1032extern struct list_head ftrace_events;
@@ -948,7 +1037,18 @@ extern const char *__stop___trace_bprintk_fmt[];
948void trace_printk_init_buffers(void); 1037void trace_printk_init_buffers(void);
949void trace_printk_start_comm(void); 1038void trace_printk_start_comm(void);
950int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); 1039int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
951int set_tracer_flag(unsigned int mask, int enabled); 1040int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1041
1042/*
1043 * Normal trace_printk() and friends allocate special buffers
1044 * to do the manipulation, as well as save the print formats
1045 * into sections to display. But the trace infrastructure wants
1046 * to use these without that added overhead, at the price of being
1047 * a bit slower (used mainly for warnings, where we don't care
1048 * about performance). The internal_trace_puts() is for such
1049 * a purpose.
1050 */
1051#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
952 1052
953#undef FTRACE_ENTRY 1053#undef FTRACE_ENTRY
954#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 1054#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
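As a quick illustration of the helpers added to trace.h above, a hypothetical caller could combine top_trace_array() with internal_trace_puts() as in the following sketch (the function name is made up for the example; the trace.h definitions above are assumed to be in scope):

/*
 * Sketch only (hypothetical helper): fetch the global trace array
 * and emit a low-overhead message.  Both top_trace_array() and
 * internal_trace_puts() are introduced in the hunks above.
 */
static void example_note_global_tracer(void)
{
	struct trace_array *tr = top_trace_array();

	if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
		internal_trace_puts("operating on the global tracer\n");
}
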
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed29..d594da0dc03c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch; 33 struct ftrace_event_call *call = &event_branch;
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct trace_array_cpu *data;
35 struct ring_buffer_event *event; 36 struct ring_buffer_event *event;
36 struct trace_branch *entry; 37 struct trace_branch *entry;
37 struct ring_buffer *buffer; 38 struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
51 52
52 local_irq_save(flags); 53 local_irq_save(flags);
53 cpu = raw_smp_processor_id(); 54 cpu = raw_smp_processor_id();
54 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 55 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
56 if (atomic_inc_return(&data->disabled) != 1)
55 goto out; 57 goto out;
56 58
57 pc = preempt_count(); 59 pc = preempt_count();
58 buffer = tr->buffer; 60 buffer = tr->trace_buffer.buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, 61 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
60 sizeof(*entry), flags, pc); 62 sizeof(*entry), flags, pc);
61 if (!event) 63 if (!event)
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
80 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
81 83
82 out: 84 out:
83 atomic_dec(&tr->data[cpu]->disabled); 85 atomic_dec(&data->disabled);
84 local_irq_restore(flags); 86 local_irq_restore(flags);
85} 87}
86 88
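The trace_branch.c change above is an instance of a pattern repeated throughout the series: the per-CPU data now lives behind tr->trace_buffer.data and is reached with per_cpu_ptr(). Distilled, the guard looks like this (a sketch with the actual event write elided; the example_* name is made up):

/* Sketch only: the per-CPU "disabled" guard after the buffer move. */
static void example_percpu_guard(struct trace_array *tr)
{
	struct trace_array_cpu *data;
	unsigned long flags;
	int cpu;

	local_irq_save(flags);
	cpu = raw_smp_processor_id();
	data = per_cpu_ptr(tr->trace_buffer.data, cpu);

	/* Only trace if no other path on this CPU is already tracing. */
	if (atomic_inc_return(&data->disabled) == 1) {
		/* ... reserve and commit the ring buffer event here ... */
	}

	atomic_dec(&data->disabled);
	local_irq_restore(flags);
}
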
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aa8f5f48dae6..26dc348332b7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
57 return local_clock(); 57 return local_clock();
58} 58}
59 59
60/*
61 * trace_clock_jiffies(): Simply use jiffies as a clock counter.
62 */
63u64 notrace trace_clock_jiffies(void)
64{
65 u64 jiffy = jiffies - INITIAL_JIFFIES;
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69}
60 70
61/* 71/*
62 * trace_clock_global(): special globally coherent trace clock 72 * trace_clock_global(): special globally coherent trace clock
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca2..e2d027ac66a2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
223 __dynamic_array( u32, buf ) 223 __dynamic_array( u32, buf )
224 ), 224 ),
225 225
226 F_printk("%08lx fmt:%p", 226 F_printk("%pf: %s",
227 __entry->ip, __entry->fmt), 227 (void *)__entry->ip, __entry->fmt),
228 228
229 FILTER_OTHER 229 FILTER_OTHER
230); 230);
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
238 __dynamic_array( char, buf ) 238 __dynamic_array( char, buf )
239 ), 239 ),
240 240
241 F_printk("%08lx %s", 241 F_printk("%pf: %s",
242 __entry->ip, __entry->buf), 242 (void *)__entry->ip, __entry->buf),
243
244 FILTER_OTHER
245);
246
247FTRACE_ENTRY(bputs, bputs_entry,
248
249 TRACE_BPUTS,
250
251 F_STRUCT(
252 __field( unsigned long, ip )
253 __field( const char *, str )
254 ),
255
256 F_printk("%pf: %s",
257 (void *)__entry->ip, __entry->str),
243 258
244 FILTER_OTHER 259 FILTER_OTHER
245); 260);
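The new bputs entry stores only the instruction pointer and a pointer to a constant string, whereas the print entry has to copy the text into the buffer. Conceptually, a trace_puts()-style helper can therefore dispatch on whether the string is a compile-time constant. The following is a simplified sketch, not the real macro (which presumably also records the format pointer for the printk-formats file):

/*
 * Sketch only, simplified: a constant string can be logged as a
 * TRACE_BPUTS entry (pointer only); anything else must be copied
 * into the buffer as a TRACE_PRINT entry via __trace_puts().
 */
#define example_trace_puts(str)					\
do {								\
	if (__builtin_constant_p(str))				\
		__trace_bputs(_THIS_IP_, str);			\
	else							\
		__trace_puts(_THIS_IP_, str, strlen(str));	\
} while (0)
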
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e9b284250c..53582e982e51 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage); 34EXPORT_SYMBOL_GPL(event_storage);
35 35
36LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
37LIST_HEAD(ftrace_common_fields); 37static LIST_HEAD(ftrace_common_fields);
38 38
39struct list_head * 39#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
40
41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep;
43
44/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
47 list_for_each_entry(file, &tr->events, list)
48
49#define do_for_each_event_file_safe(tr, file) \
50 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
51 struct ftrace_event_file *___n; \
52 list_for_each_entry_safe(file, ___n, &tr->events, list)
53
54#define while_for_each_event_file() \
55 }
56
57static struct list_head *
40trace_get_fields(struct ftrace_event_call *event_call) 58trace_get_fields(struct ftrace_event_call *event_call)
41{ 59{
42 if (!event_call->class->get_fields) 60 if (!event_call->class->get_fields)
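Since event files now hang off each trace_array rather than one global list, visiting all of them is a double loop; the do_for_each_event_file()/while_for_each_event_file() pair above hides that. A hypothetical walker (with the caller holding event_mutex) pairs them the same way the real users later in this file do:

/* Sketch only (hypothetical walker): count enabled event files. */
static int example_count_enabled_files(void)
{
	struct ftrace_event_file *file;
	struct trace_array *tr;
	int count = 0;

	do_for_each_event_file(tr, file) {
		if (file->flags & FTRACE_EVENT_FL_ENABLED)
			count++;
	} while_for_each_event_file();

	return count;
}
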
@@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)
44 return event_call->class->get_fields(event_call); 62 return event_call->class->get_fields(event_call);
45} 63}
46 64
65static struct ftrace_event_field *
66__find_event_field(struct list_head *head, char *name)
67{
68 struct ftrace_event_field *field;
69
70 list_for_each_entry(field, head, link) {
71 if (!strcmp(field->name, name))
72 return field;
73 }
74
75 return NULL;
76}
77
78struct ftrace_event_field *
79trace_find_event_field(struct ftrace_event_call *call, char *name)
80{
81 struct ftrace_event_field *field;
82 struct list_head *head;
83
84 field = __find_event_field(&ftrace_common_fields, name);
85 if (field)
86 return field;
87
88 head = trace_get_fields(call);
89 return __find_event_field(head, name);
90}
91
47static int __trace_define_field(struct list_head *head, const char *type, 92static int __trace_define_field(struct list_head *head, const char *type,
48 const char *name, int offset, int size, 93 const char *name, int offset, int size,
49 int is_signed, int filter_type) 94 int is_signed, int filter_type)
50{ 95{
51 struct ftrace_event_field *field; 96 struct ftrace_event_field *field;
52 97
53 field = kzalloc(sizeof(*field), GFP_KERNEL); 98 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
54 if (!field) 99 if (!field)
55 goto err; 100 goto err;
56 101
57 field->name = kstrdup(name, GFP_KERNEL); 102 field->name = name;
58 if (!field->name) 103 field->type = type;
59 goto err;
60
61 field->type = kstrdup(type, GFP_KERNEL);
62 if (!field->type)
63 goto err;
64 104
65 if (filter_type == FILTER_OTHER) 105 if (filter_type == FILTER_OTHER)
66 field->filter_type = filter_assign_type(type); 106 field->filter_type = filter_assign_type(type);
@@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
76 return 0; 116 return 0;
77 117
78err: 118err:
79 if (field) 119 kmem_cache_free(field_cachep, field);
80 kfree(field->name);
81 kfree(field);
82 120
83 return -ENOMEM; 121 return -ENOMEM;
84} 122}
@@ -120,7 +158,7 @@ static int trace_define_common_fields(void)
120 return ret; 158 return ret;
121} 159}
122 160
123void trace_destroy_fields(struct ftrace_event_call *call) 161static void trace_destroy_fields(struct ftrace_event_call *call)
124{ 162{
125 struct ftrace_event_field *field, *next; 163 struct ftrace_event_field *field, *next;
126 struct list_head *head; 164 struct list_head *head;
@@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
128 head = trace_get_fields(call); 166 head = trace_get_fields(call);
129 list_for_each_entry_safe(field, next, head, link) { 167 list_for_each_entry_safe(field, next, head, link) {
130 list_del(&field->link); 168 list_del(&field->link);
131 kfree(field->type); 169 kmem_cache_free(field_cachep, field);
132 kfree(field->name);
133 kfree(field);
134 } 170 }
135} 171}
136 172
@@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
149int ftrace_event_reg(struct ftrace_event_call *call, 185int ftrace_event_reg(struct ftrace_event_call *call,
150 enum trace_reg type, void *data) 186 enum trace_reg type, void *data)
151{ 187{
188 struct ftrace_event_file *file = data;
189
152 switch (type) { 190 switch (type) {
153 case TRACE_REG_REGISTER: 191 case TRACE_REG_REGISTER:
154 return tracepoint_probe_register(call->name, 192 return tracepoint_probe_register(call->name,
155 call->class->probe, 193 call->class->probe,
156 call); 194 file);
157 case TRACE_REG_UNREGISTER: 195 case TRACE_REG_UNREGISTER:
158 tracepoint_probe_unregister(call->name, 196 tracepoint_probe_unregister(call->name,
159 call->class->probe, 197 call->class->probe,
160 call); 198 file);
161 return 0; 199 return 0;
162 200
163#ifdef CONFIG_PERF_EVENTS 201#ifdef CONFIG_PERF_EVENTS
@@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
183 221
184void trace_event_enable_cmd_record(bool enable) 222void trace_event_enable_cmd_record(bool enable)
185{ 223{
186 struct ftrace_event_call *call; 224 struct ftrace_event_file *file;
225 struct trace_array *tr;
187 226
188 mutex_lock(&event_mutex); 227 mutex_lock(&event_mutex);
189 list_for_each_entry(call, &ftrace_events, list) { 228 do_for_each_event_file(tr, file) {
190 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) 229
230 if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
191 continue; 231 continue;
192 232
193 if (enable) { 233 if (enable) {
194 tracing_start_cmdline_record(); 234 tracing_start_cmdline_record();
195 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 235 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
196 } else { 236 } else {
197 tracing_stop_cmdline_record(); 237 tracing_stop_cmdline_record();
198 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 238 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
199 } 239 }
200 } 240 } while_for_each_event_file();
201 mutex_unlock(&event_mutex); 241 mutex_unlock(&event_mutex);
202} 242}
203 243
204static int ftrace_event_enable_disable(struct ftrace_event_call *call, 244static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
205 int enable) 245 int enable, int soft_disable)
206{ 246{
247 struct ftrace_event_call *call = file->event_call;
207 int ret = 0; 248 int ret = 0;
249 int disable;
208 250
209 switch (enable) { 251 switch (enable) {
210 case 0: 252 case 0:
211 if (call->flags & TRACE_EVENT_FL_ENABLED) { 253 /*
212 call->flags &= ~TRACE_EVENT_FL_ENABLED; 254 * When soft_disable is set and enable is cleared, we want
213 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { 255 * to clear the SOFT_DISABLED flag but leave the event in the
256 * state that it was. That is, if the event was enabled and
257 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
258 * is set we do not want the event to be enabled before we
259 * clear the bit.
260 *
261 * When soft_disable is not set but the SOFT_MODE flag is,
262 * we do nothing. Do not disable the tracepoint, otherwise
263 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
264 */
265 if (soft_disable) {
266 disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
267 clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
268 } else
269 disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
270
271 if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
272 clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
273 if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
214 tracing_stop_cmdline_record(); 274 tracing_stop_cmdline_record();
215 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 275 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
216 } 276 }
217 call->class->reg(call, TRACE_REG_UNREGISTER, NULL); 277 call->class->reg(call, TRACE_REG_UNREGISTER, file);
218 } 278 }
279 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */
280 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
281 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
219 break; 282 break;
220 case 1: 283 case 1:
221 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 284 /*
285 * When soft_disable is set and enable is set, we want to
286 * register the tracepoint for the event, but leave the event
287 * as is. That means, if the event was already enabled, we do
288 * nothing (but set SOFT_MODE). If the event is disabled, we
289 * set SOFT_DISABLED before enabling the event tracepoint, so
290 * it still seems to be disabled.
291 */
292 if (!soft_disable)
293 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
294 else
295 set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
296
297 if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
298
299 /* Keep the event disabled when going to SOFT_MODE. */
300 if (soft_disable)
301 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
302
222 if (trace_flags & TRACE_ITER_RECORD_CMD) { 303 if (trace_flags & TRACE_ITER_RECORD_CMD) {
223 tracing_start_cmdline_record(); 304 tracing_start_cmdline_record();
224 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 305 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
225 } 306 }
226 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); 307 ret = call->class->reg(call, TRACE_REG_REGISTER, file);
227 if (ret) { 308 if (ret) {
228 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
229 pr_info("event trace: Could not enable event " 310 pr_info("event trace: Could not enable event "
230 "%s\n", call->name); 311 "%s\n", call->name);
231 break; 312 break;
232 } 313 }
233 call->flags |= TRACE_EVENT_FL_ENABLED; 314 set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
315
316 /* WAS_ENABLED gets set but never cleared. */
317 call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
234 } 318 }
235 break; 319 break;
236 } 320 }
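The effect of the SOFT_MODE/SOFT_DISABLED pair set up above is that the tracepoint stays registered while its output can still be suppressed per event file. The probe can then bail out cheaply; conceptually it amounts to the check below (a sketch of the idea, not the generated probe code):

/*
 * Sketch only: with soft disable, the probe still runs, but it
 * drops the event while SOFT_DISABLED is set on the per-instance
 * ftrace_event_file.
 */
static void example_event_probe(struct ftrace_event_file *ftrace_file)
{
	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
		return;

	/* ... reserve, fill and commit the ring buffer event ... */
}
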
@@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
238 return ret; 322 return ret;
239} 323}
240 324
241static void ftrace_clear_events(void) 325static int ftrace_event_enable_disable(struct ftrace_event_file *file,
326 int enable)
242{ 327{
243 struct ftrace_event_call *call; 328 return __ftrace_event_enable_disable(file, enable, 0);
329}
330
331static void ftrace_clear_events(struct trace_array *tr)
332{
333 struct ftrace_event_file *file;
244 334
245 mutex_lock(&event_mutex); 335 mutex_lock(&event_mutex);
246 list_for_each_entry(call, &ftrace_events, list) { 336 list_for_each_entry(file, &tr->events, list) {
247 ftrace_event_enable_disable(call, 0); 337 ftrace_event_enable_disable(file, 0);
248 } 338 }
249 mutex_unlock(&event_mutex); 339 mutex_unlock(&event_mutex);
250} 340}
@@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system)
257 if (--system->ref_count) 347 if (--system->ref_count)
258 return; 348 return;
259 349
350 list_del(&system->list);
351
260 if (filter) { 352 if (filter) {
261 kfree(filter->filter_string); 353 kfree(filter->filter_string);
262 kfree(filter); 354 kfree(filter);
263 } 355 }
264 kfree(system->name);
265 kfree(system); 356 kfree(system);
266} 357}
267 358
@@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system)
271 system->ref_count++; 362 system->ref_count++;
272} 363}
273 364
274static void put_system(struct event_subsystem *system) 365static void __get_system_dir(struct ftrace_subsystem_dir *dir)
366{
367 WARN_ON_ONCE(dir->ref_count == 0);
368 dir->ref_count++;
369 __get_system(dir->subsystem);
370}
371
372static void __put_system_dir(struct ftrace_subsystem_dir *dir)
373{
374 WARN_ON_ONCE(dir->ref_count == 0);
375 /* If the subsystem is about to be freed, the dir must be too */
376 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
377
378 __put_system(dir->subsystem);
379 if (!--dir->ref_count)
380 kfree(dir);
381}
382
383static void put_system(struct ftrace_subsystem_dir *dir)
275{ 384{
276 mutex_lock(&event_mutex); 385 mutex_lock(&event_mutex);
277 __put_system(system); 386 __put_system_dir(dir);
278 mutex_unlock(&event_mutex); 387 mutex_unlock(&event_mutex);
279} 388}
280 389
281/* 390/*
282 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 391 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
283 */ 392 */
284static int __ftrace_set_clr_event(const char *match, const char *sub, 393static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
285 const char *event, int set) 394 const char *sub, const char *event, int set)
286{ 395{
396 struct ftrace_event_file *file;
287 struct ftrace_event_call *call; 397 struct ftrace_event_call *call;
288 int ret = -EINVAL; 398 int ret = -EINVAL;
289 399
290 mutex_lock(&event_mutex); 400 mutex_lock(&event_mutex);
291 list_for_each_entry(call, &ftrace_events, list) { 401 list_for_each_entry(file, &tr->events, list) {
402
403 call = file->event_call;
292 404
293 if (!call->name || !call->class || !call->class->reg) 405 if (!call->name || !call->class || !call->class->reg)
294 continue; 406 continue;
@@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
307 if (event && strcmp(event, call->name) != 0) 419 if (event && strcmp(event, call->name) != 0)
308 continue; 420 continue;
309 421
310 ftrace_event_enable_disable(call, set); 422 ftrace_event_enable_disable(file, set);
311 423
312 ret = 0; 424 ret = 0;
313 } 425 }
@@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
316 return ret; 428 return ret;
317} 429}
318 430
319static int ftrace_set_clr_event(char *buf, int set) 431static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
320{ 432{
321 char *event = NULL, *sub = NULL, *match; 433 char *event = NULL, *sub = NULL, *match;
322 434
@@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set)
344 event = NULL; 456 event = NULL;
345 } 457 }
346 458
347 return __ftrace_set_clr_event(match, sub, event, set); 459 return __ftrace_set_clr_event(tr, match, sub, event, set);
348} 460}
349 461
350/** 462/**
@@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set)
361 */ 473 */
362int trace_set_clr_event(const char *system, const char *event, int set) 474int trace_set_clr_event(const char *system, const char *event, int set)
363{ 475{
364 return __ftrace_set_clr_event(NULL, system, event, set); 476 struct trace_array *tr = top_trace_array();
477
478 return __ftrace_set_clr_event(tr, NULL, system, event, set);
365} 479}
366EXPORT_SYMBOL_GPL(trace_set_clr_event); 480EXPORT_SYMBOL_GPL(trace_set_clr_event);
367 481
@@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
373 size_t cnt, loff_t *ppos) 487 size_t cnt, loff_t *ppos)
374{ 488{
375 struct trace_parser parser; 489 struct trace_parser parser;
490 struct seq_file *m = file->private_data;
491 struct trace_array *tr = m->private;
376 ssize_t read, ret; 492 ssize_t read, ret;
377 493
378 if (!cnt) 494 if (!cnt)
@@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
395 511
396 parser.buffer[parser.idx] = 0; 512 parser.buffer[parser.idx] = 0;
397 513
398 ret = ftrace_set_clr_event(parser.buffer + !set, set); 514 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
399 if (ret) 515 if (ret)
400 goto out_put; 516 goto out_put;
401 } 517 }
@@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
411static void * 527static void *
412t_next(struct seq_file *m, void *v, loff_t *pos) 528t_next(struct seq_file *m, void *v, loff_t *pos)
413{ 529{
414 struct ftrace_event_call *call = v; 530 struct ftrace_event_file *file = v;
531 struct ftrace_event_call *call;
532 struct trace_array *tr = m->private;
415 533
416 (*pos)++; 534 (*pos)++;
417 535
418 list_for_each_entry_continue(call, &ftrace_events, list) { 536 list_for_each_entry_continue(file, &tr->events, list) {
537 call = file->event_call;
419 /* 538 /*
420 * The ftrace subsystem is for showing formats only. 539 * The ftrace subsystem is for showing formats only.
421 * They can not be enabled or disabled via the event files. 540 * They can not be enabled or disabled via the event files.
422 */ 541 */
423 if (call->class && call->class->reg) 542 if (call->class && call->class->reg)
424 return call; 543 return file;
425 } 544 }
426 545
427 return NULL; 546 return NULL;
@@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
429 548
430static void *t_start(struct seq_file *m, loff_t *pos) 549static void *t_start(struct seq_file *m, loff_t *pos)
431{ 550{
432 struct ftrace_event_call *call; 551 struct ftrace_event_file *file;
552 struct trace_array *tr = m->private;
433 loff_t l; 553 loff_t l;
434 554
435 mutex_lock(&event_mutex); 555 mutex_lock(&event_mutex);
436 556
437 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 557 file = list_entry(&tr->events, struct ftrace_event_file, list);
438 for (l = 0; l <= *pos; ) { 558 for (l = 0; l <= *pos; ) {
439 call = t_next(m, call, &l); 559 file = t_next(m, file, &l);
440 if (!call) 560 if (!file)
441 break; 561 break;
442 } 562 }
443 return call; 563 return file;
444} 564}
445 565
446static void * 566static void *
447s_next(struct seq_file *m, void *v, loff_t *pos) 567s_next(struct seq_file *m, void *v, loff_t *pos)
448{ 568{
449 struct ftrace_event_call *call = v; 569 struct ftrace_event_file *file = v;
570 struct trace_array *tr = m->private;
450 571
451 (*pos)++; 572 (*pos)++;
452 573
453 list_for_each_entry_continue(call, &ftrace_events, list) { 574 list_for_each_entry_continue(file, &tr->events, list) {
454 if (call->flags & TRACE_EVENT_FL_ENABLED) 575 if (file->flags & FTRACE_EVENT_FL_ENABLED)
455 return call; 576 return file;
456 } 577 }
457 578
458 return NULL; 579 return NULL;
@@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
460 581
461static void *s_start(struct seq_file *m, loff_t *pos) 582static void *s_start(struct seq_file *m, loff_t *pos)
462{ 583{
463 struct ftrace_event_call *call; 584 struct ftrace_event_file *file;
585 struct trace_array *tr = m->private;
464 loff_t l; 586 loff_t l;
465 587
466 mutex_lock(&event_mutex); 588 mutex_lock(&event_mutex);
467 589
468 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 590 file = list_entry(&tr->events, struct ftrace_event_file, list);
469 for (l = 0; l <= *pos; ) { 591 for (l = 0; l <= *pos; ) {
470 call = s_next(m, call, &l); 592 file = s_next(m, file, &l);
471 if (!call) 593 if (!file)
472 break; 594 break;
473 } 595 }
474 return call; 596 return file;
475} 597}
476 598
477static int t_show(struct seq_file *m, void *v) 599static int t_show(struct seq_file *m, void *v)
478{ 600{
479 struct ftrace_event_call *call = v; 601 struct ftrace_event_file *file = v;
602 struct ftrace_event_call *call = file->event_call;
480 603
481 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 604 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
482 seq_printf(m, "%s:", call->class->system); 605 seq_printf(m, "%s:", call->class->system);
@@ -494,25 +617,31 @@ static ssize_t
494event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 617event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
495 loff_t *ppos) 618 loff_t *ppos)
496{ 619{
497 struct ftrace_event_call *call = filp->private_data; 620 struct ftrace_event_file *file = filp->private_data;
498 char *buf; 621 char *buf;
499 622
500 if (call->flags & TRACE_EVENT_FL_ENABLED) 623 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
501 buf = "1\n"; 624 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
502 else 625 buf = "0*\n";
626 else
627 buf = "1\n";
628 } else
503 buf = "0\n"; 629 buf = "0\n";
504 630
505 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); 631 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
506} 632}
507 633
508static ssize_t 634static ssize_t
509event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 635event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
510 loff_t *ppos) 636 loff_t *ppos)
511{ 637{
512 struct ftrace_event_call *call = filp->private_data; 638 struct ftrace_event_file *file = filp->private_data;
513 unsigned long val; 639 unsigned long val;
514 int ret; 640 int ret;
515 641
642 if (!file)
643 return -EINVAL;
644
516 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 645 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
517 if (ret) 646 if (ret)
518 return ret; 647 return ret;
@@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
525 case 0: 654 case 0:
526 case 1: 655 case 1:
527 mutex_lock(&event_mutex); 656 mutex_lock(&event_mutex);
528 ret = ftrace_event_enable_disable(call, val); 657 ret = ftrace_event_enable_disable(file, val);
529 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
530 break; 659 break;
531 660
@@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
543 loff_t *ppos) 672 loff_t *ppos)
544{ 673{
545 const char set_to_char[4] = { '?', '0', '1', 'X' }; 674 const char set_to_char[4] = { '?', '0', '1', 'X' };
546 struct event_subsystem *system = filp->private_data; 675 struct ftrace_subsystem_dir *dir = filp->private_data;
676 struct event_subsystem *system = dir->subsystem;
547 struct ftrace_event_call *call; 677 struct ftrace_event_call *call;
678 struct ftrace_event_file *file;
679 struct trace_array *tr = dir->tr;
548 char buf[2]; 680 char buf[2];
549 int set = 0; 681 int set = 0;
550 int ret; 682 int ret;
551 683
552 mutex_lock(&event_mutex); 684 mutex_lock(&event_mutex);
553 list_for_each_entry(call, &ftrace_events, list) { 685 list_for_each_entry(file, &tr->events, list) {
686 call = file->event_call;
554 if (!call->name || !call->class || !call->class->reg) 687 if (!call->name || !call->class || !call->class->reg)
555 continue; 688 continue;
556 689
@@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
562 * or if all events are cleared, or if we have 695 * or if all events are cleared, or if we have
563 * a mixture. 696 * a mixture.
564 */ 697 */
565 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); 698 set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
566 699
567 /* 700 /*
568 * If we have a mixture, no need to look further. 701 * If we have a mixture, no need to look further.
@@ -584,7 +717,8 @@ static ssize_t
584system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 717system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
585 loff_t *ppos) 718 loff_t *ppos)
586{ 719{
587 struct event_subsystem *system = filp->private_data; 720 struct ftrace_subsystem_dir *dir = filp->private_data;
721 struct event_subsystem *system = dir->subsystem;
588 const char *name = NULL; 722 const char *name = NULL;
589 unsigned long val; 723 unsigned long val;
590 ssize_t ret; 724 ssize_t ret;
@@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
607 if (system) 741 if (system)
608 name = system->name; 742 name = system->name;
609 743
610 ret = __ftrace_set_clr_event(NULL, name, NULL, val); 744 ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
611 if (ret) 745 if (ret)
612 goto out; 746 goto out;
613 747
@@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems);
845static int subsystem_open(struct inode *inode, struct file *filp) 979static int subsystem_open(struct inode *inode, struct file *filp)
846{ 980{
847 struct event_subsystem *system = NULL; 981 struct event_subsystem *system = NULL;
982 struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
983 struct trace_array *tr;
848 int ret; 984 int ret;
849 985
850 if (!inode->i_private)
851 goto skip_search;
852
853 /* Make sure the system still exists */ 986 /* Make sure the system still exists */
854 mutex_lock(&event_mutex); 987 mutex_lock(&event_mutex);
855 list_for_each_entry(system, &event_subsystems, list) { 988 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
856 if (system == inode->i_private) { 989 list_for_each_entry(dir, &tr->systems, list) {
857 /* Don't open systems with no events */ 990 if (dir == inode->i_private) {
858 if (!system->nr_events) { 991 /* Don't open systems with no events */
859 system = NULL; 992 if (dir->nr_events) {
860 break; 993 __get_system_dir(dir);
994 system = dir->subsystem;
995 }
996 goto exit_loop;
861 } 997 }
862 __get_system(system);
863 break;
864 } 998 }
865 } 999 }
1000 exit_loop:
866 mutex_unlock(&event_mutex); 1001 mutex_unlock(&event_mutex);
867 1002
868 if (system != inode->i_private) 1003 if (!system)
869 return -ENODEV; 1004 return -ENODEV;
870 1005
871 skip_search: 1006 /* Some versions of gcc think dir can be uninitialized here */
1007 WARN_ON(!dir);
1008
872 ret = tracing_open_generic(inode, filp); 1009 ret = tracing_open_generic(inode, filp);
873 if (ret < 0 && system) 1010 if (ret < 0)
874 put_system(system); 1011 put_system(dir);
1012
1013 return ret;
1014}
1015
1016static int system_tr_open(struct inode *inode, struct file *filp)
1017{
1018 struct ftrace_subsystem_dir *dir;
1019 struct trace_array *tr = inode->i_private;
1020 int ret;
1021
1022 /* Make a temporary dir that has no system but points to tr */
1023 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1024 if (!dir)
1025 return -ENOMEM;
1026
1027 dir->tr = tr;
1028
1029 ret = tracing_open_generic(inode, filp);
1030 if (ret < 0)
1031 kfree(dir);
1032
1033 filp->private_data = dir;
875 1034
876 return ret; 1035 return ret;
877} 1036}
878 1037
879static int subsystem_release(struct inode *inode, struct file *file) 1038static int subsystem_release(struct inode *inode, struct file *file)
880{ 1039{
881 struct event_subsystem *system = inode->i_private; 1040 struct ftrace_subsystem_dir *dir = file->private_data;
882 1041
883 if (system) 1042 /*
884 put_system(system); 1043 * If dir->subsystem is NULL, then this is a temporary
1044 * descriptor that was made for a trace_array to enable
1045 * all subsystems.
1046 */
1047 if (dir->subsystem)
1048 put_system(dir);
1049 else
1050 kfree(dir);
885 1051
886 return 0; 1052 return 0;
887} 1053}
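The two-level reference counting above means an open file pins both the ftrace_subsystem_dir and, via __get_system(), the event_subsystem itself. A hypothetical user of a dir outside the open/release path would bracket its use the same way:

/*
 * Sketch only (hypothetical caller): take and drop a reference on
 * a subsystem dir around use of its filter.  put_system() takes
 * event_mutex internally, so it is called outside the lock here.
 */
static void example_use_subsystem_dir(struct ftrace_subsystem_dir *dir)
{
	mutex_lock(&event_mutex);
	__get_system_dir(dir);
	mutex_unlock(&event_mutex);

	/* ... read or update dir->subsystem->filter ... */

	put_system(dir);
}
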
@@ -890,7 +1056,8 @@ static ssize_t
890subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 1056subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
891 loff_t *ppos) 1057 loff_t *ppos)
892{ 1058{
893 struct event_subsystem *system = filp->private_data; 1059 struct ftrace_subsystem_dir *dir = filp->private_data;
1060 struct event_subsystem *system = dir->subsystem;
894 struct trace_seq *s; 1061 struct trace_seq *s;
895 int r; 1062 int r;
896 1063
@@ -915,7 +1082,7 @@ static ssize_t
915subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1082subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
916 loff_t *ppos) 1083 loff_t *ppos)
917{ 1084{
918 struct event_subsystem *system = filp->private_data; 1085 struct ftrace_subsystem_dir *dir = filp->private_data;
919 char *buf; 1086 char *buf;
920 int err; 1087 int err;
921 1088
@@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
932 } 1099 }
933 buf[cnt] = '\0'; 1100 buf[cnt] = '\0';
934 1101
935 err = apply_subsystem_event_filter(system, buf); 1102 err = apply_subsystem_event_filter(dir, buf);
936 free_page((unsigned long) buf); 1103 free_page((unsigned long) buf);
937 if (err < 0) 1104 if (err < 0)
938 return err; 1105 return err;
@@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = {
1041 .release = subsystem_release, 1208 .release = subsystem_release,
1042}; 1209};
1043 1210
1211static const struct file_operations ftrace_tr_enable_fops = {
1212 .open = system_tr_open,
1213 .read = system_enable_read,
1214 .write = system_enable_write,
1215 .llseek = default_llseek,
1216 .release = subsystem_release,
1217};
1218
1044static const struct file_operations ftrace_show_header_fops = { 1219static const struct file_operations ftrace_show_header_fops = {
1045 .open = tracing_open_generic, 1220 .open = tracing_open_generic,
1046 .read = show_header, 1221 .read = show_header,
1047 .llseek = default_llseek, 1222 .llseek = default_llseek,
1048}; 1223};
1049 1224
1050static struct dentry *event_trace_events_dir(void) 1225static int
1226ftrace_event_open(struct inode *inode, struct file *file,
1227 const struct seq_operations *seq_ops)
1051{ 1228{
1052 static struct dentry *d_tracer; 1229 struct seq_file *m;
1053 static struct dentry *d_events; 1230 int ret;
1054
1055 if (d_events)
1056 return d_events;
1057
1058 d_tracer = tracing_init_dentry();
1059 if (!d_tracer)
1060 return NULL;
1061 1231
1062 d_events = debugfs_create_dir("events", d_tracer); 1232 ret = seq_open(file, seq_ops);
1063 if (!d_events) 1233 if (ret < 0)
1064 pr_warning("Could not create debugfs " 1234 return ret;
1065 "'events' directory\n"); 1235 m = file->private_data;
1236 /* copy tr over to seq ops */
1237 m->private = inode->i_private;
1066 1238
1067 return d_events; 1239 return ret;
1068} 1240}
1069 1241
1070static int 1242static int
@@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
1072{ 1244{
1073 const struct seq_operations *seq_ops = &show_event_seq_ops; 1245 const struct seq_operations *seq_ops = &show_event_seq_ops;
1074 1246
1075 return seq_open(file, seq_ops); 1247 return ftrace_event_open(inode, file, seq_ops);
1076} 1248}
1077 1249
1078static int 1250static int
1079ftrace_event_set_open(struct inode *inode, struct file *file) 1251ftrace_event_set_open(struct inode *inode, struct file *file)
1080{ 1252{
1081 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1253 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1254 struct trace_array *tr = inode->i_private;
1082 1255
1083 if ((file->f_mode & FMODE_WRITE) && 1256 if ((file->f_mode & FMODE_WRITE) &&
1084 (file->f_flags & O_TRUNC)) 1257 (file->f_flags & O_TRUNC))
1085 ftrace_clear_events(); 1258 ftrace_clear_events(tr);
1086 1259
1087 return seq_open(file, seq_ops); 1260 return ftrace_event_open(inode, file, seq_ops);
1261}
1262
1263static struct event_subsystem *
1264create_new_subsystem(const char *name)
1265{
1266 struct event_subsystem *system;
1267
1268 /* need to create new entry */
1269 system = kmalloc(sizeof(*system), GFP_KERNEL);
1270 if (!system)
1271 return NULL;
1272
1273 system->ref_count = 1;
1274 system->name = name;
1275
1276 system->filter = NULL;
1277
1278 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
1279 if (!system->filter)
1280 goto out_free;
1281
1282 list_add(&system->list, &event_subsystems);
1283
1284 return system;
1285
1286 out_free:
1287 kfree(system);
1288 return NULL;
1088} 1289}
1089 1290
1090static struct dentry * 1291static struct dentry *
1091event_subsystem_dir(const char *name, struct dentry *d_events) 1292event_subsystem_dir(struct trace_array *tr, const char *name,
1293 struct ftrace_event_file *file, struct dentry *parent)
1092{ 1294{
1295 struct ftrace_subsystem_dir *dir;
1093 struct event_subsystem *system; 1296 struct event_subsystem *system;
1094 struct dentry *entry; 1297 struct dentry *entry;
1095 1298
1096 /* First see if we did not already create this dir */ 1299 /* First see if we did not already create this dir */
1097 list_for_each_entry(system, &event_subsystems, list) { 1300 list_for_each_entry(dir, &tr->systems, list) {
1301 system = dir->subsystem;
1098 if (strcmp(system->name, name) == 0) { 1302 if (strcmp(system->name, name) == 0) {
1099 system->nr_events++; 1303 dir->nr_events++;
1100 return system->entry; 1304 file->system = dir;
1305 return dir->entry;
1101 } 1306 }
1102 } 1307 }
1103 1308
1104 /* need to create new entry */ 1309 /* Now see if the system itself exists. */
1105 system = kmalloc(sizeof(*system), GFP_KERNEL); 1310 list_for_each_entry(system, &event_subsystems, list) {
1106 if (!system) { 1311 if (strcmp(system->name, name) == 0)
1107 pr_warning("No memory to create event subsystem %s\n", 1312 break;
1108 name);
1109 return d_events;
1110 } 1313 }
1314 /* Reset system variable when not found */
1315 if (&system->list == &event_subsystems)
1316 system = NULL;
1111 1317
1112 system->entry = debugfs_create_dir(name, d_events); 1318 dir = kmalloc(sizeof(*dir), GFP_KERNEL);
1113 if (!system->entry) { 1319 if (!dir)
1114 pr_warning("Could not create event subsystem %s\n", 1320 goto out_fail;
1115 name);
1116 kfree(system);
1117 return d_events;
1118 }
1119 1321
1120 system->nr_events = 1; 1322 if (!system) {
1121 system->ref_count = 1; 1323 system = create_new_subsystem(name);
1122 system->name = kstrdup(name, GFP_KERNEL); 1324 if (!system)
1123 if (!system->name) { 1325 goto out_free;
1124 debugfs_remove(system->entry); 1326 } else
1125 kfree(system); 1327 __get_system(system);
1126 return d_events; 1328
1329 dir->entry = debugfs_create_dir(name, parent);
1330 if (!dir->entry) {
1331 pr_warning("Failed to create system directory %s\n", name);
1332 __put_system(system);
1333 goto out_free;
1127 } 1334 }
1128 1335
1129 list_add(&system->list, &event_subsystems); 1336 dir->tr = tr;
1130 1337 dir->ref_count = 1;
1131 system->filter = NULL; 1338 dir->nr_events = 1;
1132 1339 dir->subsystem = system;
1133 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); 1340 file->system = dir;
1134 if (!system->filter) {
1135 pr_warning("Could not allocate filter for subsystem "
1136 "'%s'\n", name);
1137 return system->entry;
1138 }
1139 1341
1140 entry = debugfs_create_file("filter", 0644, system->entry, system, 1342 entry = debugfs_create_file("filter", 0644, dir->entry, dir,
1141 &ftrace_subsystem_filter_fops); 1343 &ftrace_subsystem_filter_fops);
1142 if (!entry) { 1344 if (!entry) {
1143 kfree(system->filter); 1345 kfree(system->filter);
1144 system->filter = NULL; 1346 system->filter = NULL;
1145 pr_warning("Could not create debugfs " 1347 pr_warning("Could not create debugfs '%s/filter' entry\n", name);
1146 "'%s/filter' entry\n", name);
1147 } 1348 }
1148 1349
1149 trace_create_file("enable", 0644, system->entry, system, 1350 trace_create_file("enable", 0644, dir->entry, dir,
1150 &ftrace_system_enable_fops); 1351 &ftrace_system_enable_fops);
1151 1352
1152 return system->entry; 1353 list_add(&dir->list, &tr->systems);
1354
1355 return dir->entry;
1356
1357 out_free:
1358 kfree(dir);
1359 out_fail:
1360 /* Only print this message if the failure was a memory allocation */
1361 if (!dir || !system)
1362 pr_warning("No memory to create event subsystem %s\n",
1363 name);
1364 return NULL;
1153} 1365}
1154 1366
1155static int 1367static int
1156event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, 1368event_create_dir(struct dentry *parent,
1369 struct ftrace_event_file *file,
1157 const struct file_operations *id, 1370 const struct file_operations *id,
1158 const struct file_operations *enable, 1371 const struct file_operations *enable,
1159 const struct file_operations *filter, 1372 const struct file_operations *filter,
1160 const struct file_operations *format) 1373 const struct file_operations *format)
1161{ 1374{
1375 struct ftrace_event_call *call = file->event_call;
1376 struct trace_array *tr = file->tr;
1162 struct list_head *head; 1377 struct list_head *head;
1378 struct dentry *d_events;
1163 int ret; 1379 int ret;
1164 1380
1165 /* 1381 /*
1166 * If the trace point header did not define TRACE_SYSTEM 1382 * If the trace point header did not define TRACE_SYSTEM
1167 * then the system would be called "TRACE_SYSTEM". 1383 * then the system would be called "TRACE_SYSTEM".
1168 */ 1384 */
1169 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 1385 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
1170 d_events = event_subsystem_dir(call->class->system, d_events); 1386 d_events = event_subsystem_dir(tr, call->class->system, file, parent);
1171 1387 if (!d_events)
1172 call->dir = debugfs_create_dir(call->name, d_events); 1388 return -ENOMEM;
1173 if (!call->dir) { 1389 } else
1174 pr_warning("Could not create debugfs " 1390 d_events = parent;
1175 "'%s' directory\n", call->name); 1391
1392 file->dir = debugfs_create_dir(call->name, d_events);
1393 if (!file->dir) {
1394 pr_warning("Could not create debugfs '%s' directory\n",
1395 call->name);
1176 return -1; 1396 return -1;
1177 } 1397 }
1178 1398
1179 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1399 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1180 trace_create_file("enable", 0644, call->dir, call, 1400 trace_create_file("enable", 0644, file->dir, file,
1181 enable); 1401 enable);
1182 1402
1183#ifdef CONFIG_PERF_EVENTS 1403#ifdef CONFIG_PERF_EVENTS
1184 if (call->event.type && call->class->reg) 1404 if (call->event.type && call->class->reg)
1185 trace_create_file("id", 0444, call->dir, call, 1405 trace_create_file("id", 0444, file->dir, call,
1186 id); 1406 id);
1187#endif 1407#endif
1188 1408
@@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1196 if (ret < 0) { 1416 if (ret < 0) {
1197 pr_warning("Could not initialize trace point" 1417 pr_warning("Could not initialize trace point"
1198 " events/%s\n", call->name); 1418 " events/%s\n", call->name);
1199 return ret; 1419 return -1;
1200 } 1420 }
1201 } 1421 }
1202 trace_create_file("filter", 0644, call->dir, call, 1422 trace_create_file("filter", 0644, file->dir, call,
1203 filter); 1423 filter);
1204 1424
1205 trace_create_file("format", 0444, call->dir, call, 1425 trace_create_file("format", 0444, file->dir, call,
1206 format); 1426 format);
1207 1427
1208 return 0; 1428 return 0;
1209} 1429}
1210 1430
1431static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1432{
1433 if (!dir)
1434 return;
1435
1436 if (!--dir->nr_events) {
1437 debugfs_remove_recursive(dir->entry);
1438 list_del(&dir->list);
1439 __put_system_dir(dir);
1440 }
1441}
1442
1443static void remove_event_from_tracers(struct ftrace_event_call *call)
1444{
1445 struct ftrace_event_file *file;
1446 struct trace_array *tr;
1447
1448 do_for_each_event_file_safe(tr, file) {
1449
1450 if (file->event_call != call)
1451 continue;
1452
1453 list_del(&file->list);
1454 debugfs_remove_recursive(file->dir);
1455 remove_subsystem(file->system);
1456 kmem_cache_free(file_cachep, file);
1457
1458 /*
1459 * The do_for_each_event_file_safe() is
1460 * a double loop. After finding the call for this
1461 * trace_array, we use break to jump to the next
1462 * trace_array.
1463 */
1464 break;
1465 } while_for_each_event_file();
1466}
1467
1211static void event_remove(struct ftrace_event_call *call) 1468static void event_remove(struct ftrace_event_call *call)
1212{ 1469{
1213 ftrace_event_enable_disable(call, 0); 1470 struct trace_array *tr;
1471 struct ftrace_event_file *file;
1472
1473 do_for_each_event_file(tr, file) {
1474 if (file->event_call != call)
1475 continue;
1476 ftrace_event_enable_disable(file, 0);
1477 /*
1478 * The do_for_each_event_file() is
1479 * a double loop. After finding the call for this
1480 * trace_array, we use break to jump to the next
1481 * trace_array.
1482 */
1483 break;
1484 } while_for_each_event_file();
1485
1214 if (call->event.funcs) 1486 if (call->event.funcs)
1215 __unregister_ftrace_event(&call->event); 1487 __unregister_ftrace_event(&call->event);
1488 remove_event_from_tracers(call);
1216 list_del(&call->list); 1489 list_del(&call->list);
1217} 1490}
1218 1491
@@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call)
1234} 1507}
1235 1508
1236static int 1509static int
1237__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1510__register_event(struct ftrace_event_call *call, struct module *mod)
1238 const struct file_operations *id,
1239 const struct file_operations *enable,
1240 const struct file_operations *filter,
1241 const struct file_operations *format)
1242{ 1511{
1243 struct dentry *d_events;
1244 int ret; 1512 int ret;
1245 1513
1246 ret = event_init(call); 1514 ret = event_init(call);
1247 if (ret < 0) 1515 if (ret < 0)
1248 return ret; 1516 return ret;
1249 1517
1250 d_events = event_trace_events_dir(); 1518 list_add(&call->list, &ftrace_events);
1251 if (!d_events)
1252 return -ENOENT;
1253
1254 ret = event_create_dir(call, d_events, id, enable, filter, format);
1255 if (!ret)
1256 list_add(&call->list, &ftrace_events);
1257 call->mod = mod; 1519 call->mod = mod;
1258 1520
1259 return ret; 1521 return 0;
1522}
1523
1524/* Add an event to a trace directory */
1525static int
1526__trace_add_new_event(struct ftrace_event_call *call,
1527 struct trace_array *tr,
1528 const struct file_operations *id,
1529 const struct file_operations *enable,
1530 const struct file_operations *filter,
1531 const struct file_operations *format)
1532{
1533 struct ftrace_event_file *file;
1534
1535 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1536 if (!file)
1537 return -ENOMEM;
1538
1539 file->event_call = call;
1540 file->tr = tr;
1541 list_add(&file->list, &tr->events);
1542
1543 return event_create_dir(tr->event_dir, file, id, enable, filter, format);
1260} 1544}
1261 1545
1546/*
1547 * Just create a descriptor for early init. A descriptor is required
1548 * for enabling events at boot. We want to enable events before
1549 * the filesystem is initialized.
1550 */
1551static __init int
1552__trace_early_add_new_event(struct ftrace_event_call *call,
1553 struct trace_array *tr)
1554{
1555 struct ftrace_event_file *file;
1556
1557 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1558 if (!file)
1559 return -ENOMEM;
1560
1561 file->event_call = call;
1562 file->tr = tr;
1563 list_add(&file->list, &tr->events);
1564
1565 return 0;
1566}
1567
1568struct ftrace_module_file_ops;
1569static void __add_event_to_tracers(struct ftrace_event_call *call,
1570 struct ftrace_module_file_ops *file_ops);
1571
1262/* Add an additional event_call dynamically */ 1572/* Add an additional event_call dynamically */
1263int trace_add_event_call(struct ftrace_event_call *call) 1573int trace_add_event_call(struct ftrace_event_call *call)
1264{ 1574{
1265 int ret; 1575 int ret;
1266 mutex_lock(&event_mutex); 1576 mutex_lock(&event_mutex);
1267 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1268 &ftrace_enable_fops,
1269 &ftrace_event_filter_fops,
1270 &ftrace_event_format_fops);
1271 mutex_unlock(&event_mutex);
1272 return ret;
1273}
1274 1577
1275static void remove_subsystem_dir(const char *name) 1578 ret = __register_event(call, NULL);
1276{ 1579 if (ret >= 0)
1277 struct event_subsystem *system; 1580 __add_event_to_tracers(call, NULL);
1278
1279 if (strcmp(name, TRACE_SYSTEM) == 0)
1280 return;
1281 1581
1282 list_for_each_entry(system, &event_subsystems, list) { 1582 mutex_unlock(&event_mutex);
1283 if (strcmp(system->name, name) == 0) { 1583 return ret;
1284 if (!--system->nr_events) {
1285 debugfs_remove_recursive(system->entry);
1286 list_del(&system->list);
1287 __put_system(system);
1288 }
1289 break;
1290 }
1291 }
1292} 1584}
1293 1585
1294/* 1586/*
1295 * Must be called under locking both of event_mutex and trace_event_mutex. 1587 * Must be called under locking both of event_mutex and trace_event_sem.
1296 */ 1588 */
1297static void __trace_remove_event_call(struct ftrace_event_call *call) 1589static void __trace_remove_event_call(struct ftrace_event_call *call)
1298{ 1590{
1299 event_remove(call); 1591 event_remove(call);
1300 trace_destroy_fields(call); 1592 trace_destroy_fields(call);
1301 destroy_preds(call); 1593 destroy_preds(call);
1302 debugfs_remove_recursive(call->dir);
1303 remove_subsystem_dir(call->class->system);
1304} 1594}
1305 1595
1306/* Remove an event_call */ 1596/* Remove an event_call */
1307void trace_remove_event_call(struct ftrace_event_call *call) 1597void trace_remove_event_call(struct ftrace_event_call *call)
1308{ 1598{
1309 mutex_lock(&event_mutex); 1599 mutex_lock(&event_mutex);
1310 down_write(&trace_event_mutex); 1600 down_write(&trace_event_sem);
1311 __trace_remove_event_call(call); 1601 __trace_remove_event_call(call);
1312 up_write(&trace_event_mutex); 1602 up_write(&trace_event_sem);
1313 mutex_unlock(&event_mutex); 1603 mutex_unlock(&event_mutex);
1314} 1604}
1315 1605
@@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops {
1336}; 1626};
1337 1627
1338static struct ftrace_module_file_ops * 1628static struct ftrace_module_file_ops *
1629find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1630{
1631 /*
1632 * As event_calls are added in groups by module,
1633 * when we find one file_ops, we don't need to search for
1634 * each call in that module, as the rest should be the
1635 * same. Only search for a new one if the last one did
1636 * not match.
1637 */
1638 if (file_ops && mod == file_ops->mod)
1639 return file_ops;
1640
1641 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1642 if (file_ops->mod == mod)
1643 return file_ops;
1644 }
1645 return NULL;
1646}
1647
1648static struct ftrace_module_file_ops *
1339trace_create_file_ops(struct module *mod) 1649trace_create_file_ops(struct module *mod)
1340{ 1650{
1341 struct ftrace_module_file_ops *file_ops; 1651 struct ftrace_module_file_ops *file_ops;
@@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod)
1386 return; 1696 return;
1387 1697
1388 for_each_event(call, start, end) { 1698 for_each_event(call, start, end) {
1389 __trace_add_event_call(*call, mod, 1699 __register_event(*call, mod);
1390 &file_ops->id, &file_ops->enable, 1700 __add_event_to_tracers(*call, file_ops);
1391 &file_ops->filter, &file_ops->format);
1392 } 1701 }
1393} 1702}
1394 1703
@@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod)
1396{ 1705{
1397 struct ftrace_module_file_ops *file_ops; 1706 struct ftrace_module_file_ops *file_ops;
1398 struct ftrace_event_call *call, *p; 1707 struct ftrace_event_call *call, *p;
1399 bool found = false; 1708 bool clear_trace = false;
1400 1709
1401 down_write(&trace_event_mutex); 1710 down_write(&trace_event_sem);
1402 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1711 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1403 if (call->mod == mod) { 1712 if (call->mod == mod) {
1404 found = true; 1713 if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
1714 clear_trace = true;
1405 __trace_remove_event_call(call); 1715 __trace_remove_event_call(call);
1406 } 1716 }
1407 } 1717 }
@@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod)
1415 list_del(&file_ops->list); 1725 list_del(&file_ops->list);
1416 kfree(file_ops); 1726 kfree(file_ops);
1417 } 1727 }
1728 up_write(&trace_event_sem);
1418 1729
1419 /* 1730 /*
1420 * It is safest to reset the ring buffer if the module being unloaded 1731 * It is safest to reset the ring buffer if the module being unloaded
1421 * registered any events. 1732 * registered any events that were used. The only worry is if
1733 * a new module gets loaded, and takes on the same id as the events
1734 * of this module. When printing out the buffer, traced events left
1735 * over from this module may be passed to the new module events and
1736 * unexpected results may occur.
1422 */ 1737 */
1423 if (found) 1738 if (clear_trace)
1424 tracing_reset_current_online_cpus(); 1739 tracing_reset_all_online_cpus();
1425 up_write(&trace_event_mutex);
1426} 1740}
1427 1741
1428static int trace_module_notify(struct notifier_block *self, 1742static int trace_module_notify(struct notifier_block *self,
@@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self,
1443 1757
1444 return 0; 1758 return 0;
1445} 1759}
1760
1761static int
1762__trace_add_new_mod_event(struct ftrace_event_call *call,
1763 struct trace_array *tr,
1764 struct ftrace_module_file_ops *file_ops)
1765{
1766 return __trace_add_new_event(call, tr,
1767 &file_ops->id, &file_ops->enable,
1768 &file_ops->filter, &file_ops->format);
1769}
1770
1446#else 1771#else
1447static int trace_module_notify(struct notifier_block *self, 1772static inline struct ftrace_module_file_ops *
1448 unsigned long val, void *data) 1773find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1774{
1775 return NULL;
1776}
1777static inline int trace_module_notify(struct notifier_block *self,
1778 unsigned long val, void *data)
1449{ 1779{
1450 return 0; 1780 return 0;
1451} 1781}
1782static inline int
1783__trace_add_new_mod_event(struct ftrace_event_call *call,
1784 struct trace_array *tr,
1785 struct ftrace_module_file_ops *file_ops)
1786{
1787 return -ENODEV;
1788}
1452#endif /* CONFIG_MODULES */ 1789#endif /* CONFIG_MODULES */
1453 1790
1791/* Create a new event directory structure for a trace directory. */
1792static void
1793__trace_add_event_dirs(struct trace_array *tr)
1794{
1795 struct ftrace_module_file_ops *file_ops = NULL;
1796 struct ftrace_event_call *call;
1797 int ret;
1798
1799 list_for_each_entry(call, &ftrace_events, list) {
1800 if (call->mod) {
1801 /*
1802 * Directories for events by modules need to
1803 * keep module ref counts when opened (as we don't
1804 * want the module to disappear when reading one
1805 * of these files). The file_ops keep account of
1806 * the module ref count.
1807 */
1808 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1809 if (!file_ops)
1810 continue; /* Warn? */
1811 ret = __trace_add_new_mod_event(call, tr, file_ops);
1812 if (ret < 0)
1813 pr_warning("Could not create directory for event %s\n",
1814 call->name);
1815 continue;
1816 }
1817 ret = __trace_add_new_event(call, tr,
1818 &ftrace_event_id_fops,
1819 &ftrace_enable_fops,
1820 &ftrace_event_filter_fops,
1821 &ftrace_event_format_fops);
1822 if (ret < 0)
1823 pr_warning("Could not create directory for event %s\n",
1824 call->name);
1825 }
1826}
1827
1828#ifdef CONFIG_DYNAMIC_FTRACE
1829
1830/* Avoid typos */
1831#define ENABLE_EVENT_STR "enable_event"
1832#define DISABLE_EVENT_STR "disable_event"
1833
1834struct event_probe_data {
1835 struct ftrace_event_file *file;
1836 unsigned long count;
1837 int ref;
1838 bool enable;
1839};
1840
1841static struct ftrace_event_file *
1842find_event_file(struct trace_array *tr, const char *system, const char *event)
1843{
1844 struct ftrace_event_file *file;
1845 struct ftrace_event_call *call;
1846
1847 list_for_each_entry(file, &tr->events, list) {
1848
1849 call = file->event_call;
1850
1851 if (!call->name || !call->class || !call->class->reg)
1852 continue;
1853
1854 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
1855 continue;
1856
1857 if (strcmp(event, call->name) == 0 &&
1858 strcmp(system, call->class->system) == 0)
1859 return file;
1860 }
1861 return NULL;
1862}
1863
1864static void
1865event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1866{
1867 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1868 struct event_probe_data *data = *pdata;
1869
1870 if (!data)
1871 return;
1872
1873 if (data->enable)
1874 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1875 else
1876 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1877}
1878
1879static void
1880event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1881{
1882 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1883 struct event_probe_data *data = *pdata;
1884
1885 if (!data)
1886 return;
1887
1888 if (!data->count)
1889 return;
1890
1891 /* Skip if the event is in a state we want to switch to */
1892 if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1893 return;
1894
1895 if (data->count != -1)
1896 (data->count)--;
1897
1898 event_enable_probe(ip, parent_ip, _data);
1899}
1900
1901static int
1902event_enable_print(struct seq_file *m, unsigned long ip,
1903 struct ftrace_probe_ops *ops, void *_data)
1904{
1905 struct event_probe_data *data = _data;
1906
1907 seq_printf(m, "%ps:", (void *)ip);
1908
1909 seq_printf(m, "%s:%s:%s",
1910 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1911 data->file->event_call->class->system,
1912 data->file->event_call->name);
1913
1914 if (data->count == -1)
1915 seq_printf(m, ":unlimited\n");
1916 else
1917 seq_printf(m, ":count=%ld\n", data->count);
1918
1919 return 0;
1920}
1921
1922static int
1923event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
1924 void **_data)
1925{
1926 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1927 struct event_probe_data *data = *pdata;
1928
1929 data->ref++;
1930 return 0;
1931}
1932
1933static void
1934event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
1935 void **_data)
1936{
1937 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1938 struct event_probe_data *data = *pdata;
1939
1940 if (WARN_ON_ONCE(data->ref <= 0))
1941 return;
1942
1943 data->ref--;
1944 if (!data->ref) {
1945 /* Remove the SOFT_MODE flag */
1946 __ftrace_event_enable_disable(data->file, 0, 1);
1947 module_put(data->file->event_call->mod);
1948 kfree(data);
1949 }
1950 *pdata = NULL;
1951}
1952
1953static struct ftrace_probe_ops event_enable_probe_ops = {
1954 .func = event_enable_probe,
1955 .print = event_enable_print,
1956 .init = event_enable_init,
1957 .free = event_enable_free,
1958};
1959
1960static struct ftrace_probe_ops event_enable_count_probe_ops = {
1961 .func = event_enable_count_probe,
1962 .print = event_enable_print,
1963 .init = event_enable_init,
1964 .free = event_enable_free,
1965};
1966
1967static struct ftrace_probe_ops event_disable_probe_ops = {
1968 .func = event_enable_probe,
1969 .print = event_enable_print,
1970 .init = event_enable_init,
1971 .free = event_enable_free,
1972};
1973
1974static struct ftrace_probe_ops event_disable_count_probe_ops = {
1975 .func = event_enable_count_probe,
1976 .print = event_enable_print,
1977 .init = event_enable_init,
1978 .free = event_enable_free,
1979};
1980
1981static int
1982event_enable_func(struct ftrace_hash *hash,
1983 char *glob, char *cmd, char *param, int enabled)
1984{
1985 struct trace_array *tr = top_trace_array();
1986 struct ftrace_event_file *file;
1987 struct ftrace_probe_ops *ops;
1988 struct event_probe_data *data;
1989 const char *system;
1990 const char *event;
1991 char *number;
1992 bool enable;
1993 int ret;
1994
1995 /* hash funcs only work with set_ftrace_filter */
1996 if (!enabled)
1997 return -EINVAL;
1998
1999 if (!param)
2000 return -EINVAL;
2001
2002 system = strsep(&param, ":");
2003 if (!param)
2004 return -EINVAL;
2005
2006 event = strsep(&param, ":");
2007
2008 mutex_lock(&event_mutex);
2009
2010 ret = -EINVAL;
2011 file = find_event_file(tr, system, event);
2012 if (!file)
2013 goto out;
2014
2015 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
2016
2017 if (enable)
2018 ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
2019 else
2020 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
2021
2022 if (glob[0] == '!') {
2023 unregister_ftrace_function_probe_func(glob+1, ops);
2024 ret = 0;
2025 goto out;
2026 }
2027
2028 ret = -ENOMEM;
2029 data = kzalloc(sizeof(*data), GFP_KERNEL);
2030 if (!data)
2031 goto out;
2032
2033 data->enable = enable;
2034 data->count = -1;
2035 data->file = file;
2036
2037 if (!param)
2038 goto out_reg;
2039
2040 number = strsep(&param, ":");
2041
2042 ret = -EINVAL;
2043 if (!strlen(number))
2044 goto out_free;
2045
2046 /*
2047 * We use the callback data field (which is a pointer)
2048 * as our counter.
2049 */
2050 ret = kstrtoul(number, 0, &data->count);
2051 if (ret)
2052 goto out_free;
2053
2054 out_reg:
2055 /* Don't let event modules unload while probe registered */
2056 ret = try_module_get(file->event_call->mod);
2057 if (!ret)
2058 goto out_free;
2059
2060 ret = __ftrace_event_enable_disable(file, 1, 1);
2061 if (ret < 0)
2062 goto out_put;
2063 ret = register_ftrace_function_probe(glob, ops, data);
2064 if (!ret)
2065 goto out_disable;
2066 out:
2067 mutex_unlock(&event_mutex);
2068 return ret;
2069
2070 out_disable:
2071 __ftrace_event_enable_disable(file, 0, 1);
2072 out_put:
2073 module_put(file->event_call->mod);
2074 out_free:
2075 kfree(data);
2076 goto out;
2077}
2078
2079static struct ftrace_func_command event_enable_cmd = {
2080 .name = ENABLE_EVENT_STR,
2081 .func = event_enable_func,
2082};
2083
2084static struct ftrace_func_command event_disable_cmd = {
2085 .name = DISABLE_EVENT_STR,
2086 .func = event_enable_func,
2087};
2088
2089static __init int register_event_cmds(void)
2090{
2091 int ret;
2092
2093 ret = register_ftrace_command(&event_enable_cmd);
2094 if (WARN_ON(ret < 0))
2095 return ret;
2096 ret = register_ftrace_command(&event_disable_cmd);
2097 if (WARN_ON(ret < 0))
2098 unregister_ftrace_command(&event_enable_cmd);
2099 return ret;
2100}
2101#else
2102static inline int register_event_cmds(void) { return 0; }
2103#endif /* CONFIG_DYNAMIC_FTRACE */
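The enable_event and disable_event commands registered above hook into the function filter the same way as traceon/traceoff. A minimal usage sketch, assuming debugfs is mounted at /sys/kernel/debug/tracing and that the sched:sched_switch event exists on the running kernel:

  # cd /sys/kernel/debug/tracing
  # echo 'schedule:enable_event:sched:sched_switch:1' > set_ftrace_filter
  # echo '!schedule:enable_event:sched:sched_switch' > set_ftrace_filter

The first write arms a probe on schedule() that soft-enables sched:sched_switch; the optional trailing count limits how many times the toggle fires, and the '!' form unregisters the probe, dropping the module and soft-disable references taken in event_enable_func().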
2104
2105/*
2106 * The top level array has already had its ftrace_event_file
2107 * descriptors created in order to allow for early events to
2108 * be recorded. This function is called after the debugfs has been
2109 * initialized, and we now have to create the files associated
2110 * to the events.
2111 */
2112static __init void
2113__trace_early_add_event_dirs(struct trace_array *tr)
2114{
2115 struct ftrace_event_file *file;
2116 int ret;
2117
2118
2119 list_for_each_entry(file, &tr->events, list) {
2120 ret = event_create_dir(tr->event_dir, file,
2121 &ftrace_event_id_fops,
2122 &ftrace_enable_fops,
2123 &ftrace_event_filter_fops,
2124 &ftrace_event_format_fops);
2125 if (ret < 0)
2126 pr_warning("Could not create directory for event %s\n",
2127 file->event_call->name);
2128 }
2129}
2130
2131/*
2132 * For early boot up, the top trace array needs to have
2133 * a list of events that can be enabled. This must be done before
2134 * the filesystem is set up in order to allow events to be traced
2135 * early.
2136 */
2137static __init void
2138__trace_early_add_events(struct trace_array *tr)
2139{
2140 struct ftrace_event_call *call;
2141 int ret;
2142
2143 list_for_each_entry(call, &ftrace_events, list) {
2144 /* Early boot up should not have any modules loaded */
2145 if (WARN_ON_ONCE(call->mod))
2146 continue;
2147
2148 ret = __trace_early_add_new_event(call, tr);
2149 if (ret < 0)
2150 pr_warning("Could not create early event %s\n",
2151 call->name);
2152 }
2153}
2154
2155/* Remove the event directory structure for a trace directory. */
2156static void
2157__trace_remove_event_dirs(struct trace_array *tr)
2158{
2159 struct ftrace_event_file *file, *next;
2160
2161 list_for_each_entry_safe(file, next, &tr->events, list) {
2162 list_del(&file->list);
2163 debugfs_remove_recursive(file->dir);
2164 remove_subsystem(file->system);
2165 kmem_cache_free(file_cachep, file);
2166 }
2167}
2168
2169static void
2170__add_event_to_tracers(struct ftrace_event_call *call,
2171 struct ftrace_module_file_ops *file_ops)
2172{
2173 struct trace_array *tr;
2174
2175 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
2176 if (file_ops)
2177 __trace_add_new_mod_event(call, tr, file_ops);
2178 else
2179 __trace_add_new_event(call, tr,
2180 &ftrace_event_id_fops,
2181 &ftrace_enable_fops,
2182 &ftrace_event_filter_fops,
2183 &ftrace_event_format_fops);
2184 }
2185}
2186
1454static struct notifier_block trace_module_nb = { 2187static struct notifier_block trace_module_nb = {
1455 .notifier_call = trace_module_notify, 2188 .notifier_call = trace_module_notify,
1456 .priority = 0, 2189 .priority = 0,
@@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1464static __init int setup_trace_event(char *str) 2197static __init int setup_trace_event(char *str)
1465{ 2198{
1466 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); 2199 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1467 ring_buffer_expanded = 1; 2200 ring_buffer_expanded = true;
1468 tracing_selftest_disabled = 1; 2201 tracing_selftest_disabled = true;
1469 2202
1470 return 1; 2203 return 1;
1471} 2204}
1472__setup("trace_event=", setup_trace_event); 2205__setup("trace_event=", setup_trace_event);
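The boot-time path above is driven by the trace_event= kernel parameter; a minimal sketch of a command line entry (the event names are illustrative and must exist in the kernel):

  trace_event=sched:sched_switch,sched:sched_wakeup

Events listed here are enabled by event_trace_enable() before debugfs exists, which is why the top trace array needs its ftrace_event_file entries created early.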
1473 2206
2207/* Expects to have event_mutex held when called */
2208static int
2209create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2210{
2211 struct dentry *d_events;
2212 struct dentry *entry;
2213
2214 entry = debugfs_create_file("set_event", 0644, parent,
2215 tr, &ftrace_set_event_fops);
2216 if (!entry) {
2217 pr_warning("Could not create debugfs 'set_event' entry\n");
2218 return -ENOMEM;
2219 }
2220
2221 d_events = debugfs_create_dir("events", parent);
2222 if (!d_events) {
2223 pr_warning("Could not create debugfs 'events' directory\n");
2224 return -ENOMEM;
2225 }
2226
2227 /* ring buffer internal formats */
2228 trace_create_file("header_page", 0444, d_events,
2229 ring_buffer_print_page_header,
2230 &ftrace_show_header_fops);
2231
2232 trace_create_file("header_event", 0444, d_events,
2233 ring_buffer_print_entry_header,
2234 &ftrace_show_header_fops);
2235
2236 trace_create_file("enable", 0644, d_events,
2237 tr, &ftrace_tr_enable_fops);
2238
2239 tr->event_dir = d_events;
2240
2241 return 0;
2242}
2243
2244/**
2245 * event_trace_add_tracer - add an instance of a trace_array to events
2246 * @parent: The parent dentry to place the files/directories for events in
2247 * @tr: The trace array associated with these events
2248 *
2249 * When a new instance is created, it needs to set up its events
2250 * directory, as well as other files associated with events. It also
2251 * creates the event hierarchy in the @parent/events directory.
2252 *
2253 * Returns 0 on success.
2254 */
2255int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
2256{
2257 int ret;
2258
2259 mutex_lock(&event_mutex);
2260
2261 ret = create_event_toplevel_files(parent, tr);
2262 if (ret)
2263 goto out_unlock;
2264
2265 down_write(&trace_event_sem);
2266 __trace_add_event_dirs(tr);
2267 up_write(&trace_event_sem);
2268
2269 out_unlock:
2270 mutex_unlock(&event_mutex);
2271
2272 return ret;
2273}
2274
2275/*
2276 * The top trace array already had its file descriptors created.
2277 * Now the files themselves need to be created.
2278 */
2279static __init int
2280early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2281{
2282 int ret;
2283
2284 mutex_lock(&event_mutex);
2285
2286 ret = create_event_toplevel_files(parent, tr);
2287 if (ret)
2288 goto out_unlock;
2289
2290 down_write(&trace_event_sem);
2291 __trace_early_add_event_dirs(tr);
2292 up_write(&trace_event_sem);
2293
2294 out_unlock:
2295 mutex_unlock(&event_mutex);
2296
2297 return ret;
2298}
2299
2300int event_trace_del_tracer(struct trace_array *tr)
2301{
2302 /* Disable any running events */
2303 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2304
2305 mutex_lock(&event_mutex);
2306
2307 down_write(&trace_event_sem);
2308 __trace_remove_event_dirs(tr);
2309 debugfs_remove_recursive(tr->event_dir);
2310 up_write(&trace_event_sem);
2311
2312 tr->event_dir = NULL;
2313
2314 mutex_unlock(&event_mutex);
2315
2316 return 0;
2317}
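event_trace_add_tracer() and event_trace_del_tracer() back the events directory of each trace instance. A usage sketch, assuming debugfs is mounted at /sys/kernel/debug/tracing and the kernel supports multiple trace instances (the instance name foo is arbitrary):

  # mkdir /sys/kernel/debug/tracing/instances/foo
  # ls /sys/kernel/debug/tracing/instances/foo/events
  # rmdir /sys/kernel/debug/tracing/instances/foo

Creating the instance populates foo/events through event_trace_add_tracer(); removing it goes through event_trace_del_tracer(), which first disables any events still running in that instance.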
2318
2319static __init int event_trace_memsetup(void)
2320{
2321 field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
2322 file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
2323 return 0;
2324}
2325
1474static __init int event_trace_enable(void) 2326static __init int event_trace_enable(void)
1475{ 2327{
2328 struct trace_array *tr = top_trace_array();
1476 struct ftrace_event_call **iter, *call; 2329 struct ftrace_event_call **iter, *call;
1477 char *buf = bootup_event_buf; 2330 char *buf = bootup_event_buf;
1478 char *token; 2331 char *token;
@@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void)
1486 list_add(&call->list, &ftrace_events); 2339 list_add(&call->list, &ftrace_events);
1487 } 2340 }
1488 2341
2342 /*
2343 * We need the top trace array to have a working set of trace
2344 * points at early init, before the debug files and directories
2345 * are created. Create the file entries now, and attach them
2346 * to the actual file dentries later.
2347 */
2348 __trace_early_add_events(tr);
2349
1489 while (true) { 2350 while (true) {
1490 token = strsep(&buf, ","); 2351 token = strsep(&buf, ",");
1491 2352
@@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void)
1494 if (!*token) 2355 if (!*token)
1495 continue; 2356 continue;
1496 2357
1497 ret = ftrace_set_clr_event(token, 1); 2358 ret = ftrace_set_clr_event(tr, token, 1);
1498 if (ret) 2359 if (ret)
1499 pr_warn("Failed to enable trace event: %s\n", token); 2360 pr_warn("Failed to enable trace event: %s\n", token);
1500 } 2361 }
1501 2362
1502 trace_printk_start_comm(); 2363 trace_printk_start_comm();
1503 2364
2365 register_event_cmds();
2366
1504 return 0; 2367 return 0;
1505} 2368}
1506 2369
1507static __init int event_trace_init(void) 2370static __init int event_trace_init(void)
1508{ 2371{
1509 struct ftrace_event_call *call; 2372 struct trace_array *tr;
1510 struct dentry *d_tracer; 2373 struct dentry *d_tracer;
1511 struct dentry *entry; 2374 struct dentry *entry;
1512 struct dentry *d_events;
1513 int ret; 2375 int ret;
1514 2376
2377 tr = top_trace_array();
2378
1515 d_tracer = tracing_init_dentry(); 2379 d_tracer = tracing_init_dentry();
1516 if (!d_tracer) 2380 if (!d_tracer)
1517 return 0; 2381 return 0;
1518 2382
1519 entry = debugfs_create_file("available_events", 0444, d_tracer, 2383 entry = debugfs_create_file("available_events", 0444, d_tracer,
1520 NULL, &ftrace_avail_fops); 2384 tr, &ftrace_avail_fops);
1521 if (!entry) 2385 if (!entry)
1522 pr_warning("Could not create debugfs " 2386 pr_warning("Could not create debugfs "
1523 "'available_events' entry\n"); 2387 "'available_events' entry\n");
1524 2388
1525 entry = debugfs_create_file("set_event", 0644, d_tracer,
1526 NULL, &ftrace_set_event_fops);
1527 if (!entry)
1528 pr_warning("Could not create debugfs "
1529 "'set_event' entry\n");
1530
1531 d_events = event_trace_events_dir();
1532 if (!d_events)
1533 return 0;
1534
1535 /* ring buffer internal formats */
1536 trace_create_file("header_page", 0444, d_events,
1537 ring_buffer_print_page_header,
1538 &ftrace_show_header_fops);
1539
1540 trace_create_file("header_event", 0444, d_events,
1541 ring_buffer_print_entry_header,
1542 &ftrace_show_header_fops);
1543
1544 trace_create_file("enable", 0644, d_events,
1545 NULL, &ftrace_system_enable_fops);
1546
1547 if (trace_define_common_fields()) 2389 if (trace_define_common_fields())
1548 pr_warning("tracing: Failed to allocate common fields"); 2390 pr_warning("tracing: Failed to allocate common fields");
1549 2391
1550 /* 2392 ret = early_event_add_tracer(d_tracer, tr);
1551 * Early initialization already enabled ftrace event. 2393 if (ret)
1552 * Now it's only necessary to create the event directory. 2394 return ret;
1553 */
1554 list_for_each_entry(call, &ftrace_events, list) {
1555
1556 ret = event_create_dir(call, d_events,
1557 &ftrace_event_id_fops,
1558 &ftrace_enable_fops,
1559 &ftrace_event_filter_fops,
1560 &ftrace_event_format_fops);
1561 if (ret < 0)
1562 event_remove(call);
1563 }
1564 2395
1565 ret = register_module_notifier(&trace_module_nb); 2396 ret = register_module_notifier(&trace_module_nb);
1566 if (ret) 2397 if (ret)
@@ -1568,6 +2399,7 @@ static __init int event_trace_init(void)
1568 2399
1569 return 0; 2400 return 0;
1570} 2401}
2402early_initcall(event_trace_memsetup);
1571core_initcall(event_trace_enable); 2403core_initcall(event_trace_enable);
1572fs_initcall(event_trace_init); 2404fs_initcall(event_trace_init);
1573 2405
@@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void)
1627 */ 2459 */
1628static __init void event_trace_self_tests(void) 2460static __init void event_trace_self_tests(void)
1629{ 2461{
2462 struct ftrace_subsystem_dir *dir;
2463 struct ftrace_event_file *file;
1630 struct ftrace_event_call *call; 2464 struct ftrace_event_call *call;
1631 struct event_subsystem *system; 2465 struct event_subsystem *system;
2466 struct trace_array *tr;
1632 int ret; 2467 int ret;
1633 2468
2469 tr = top_trace_array();
2470
1634 pr_info("Running tests on trace events:\n"); 2471 pr_info("Running tests on trace events:\n");
1635 2472
1636 list_for_each_entry(call, &ftrace_events, list) { 2473 list_for_each_entry(file, &tr->events, list) {
2474
2475 call = file->event_call;
1637 2476
1638 /* Only test those that have a probe */ 2477 /* Only test those that have a probe */
1639 if (!call->class || !call->class->probe) 2478 if (!call->class || !call->class->probe)
@@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void)
1657 * If an event is already enabled, someone is using 2496 * If an event is already enabled, someone is using
1658 * it and the self test should not be on. 2497 * it and the self test should not be on.
1659 */ 2498 */
1660 if (call->flags & TRACE_EVENT_FL_ENABLED) { 2499 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
1661 pr_warning("Enabled event during self test!\n"); 2500 pr_warning("Enabled event during self test!\n");
1662 WARN_ON_ONCE(1); 2501 WARN_ON_ONCE(1);
1663 continue; 2502 continue;
1664 } 2503 }
1665 2504
1666 ftrace_event_enable_disable(call, 1); 2505 ftrace_event_enable_disable(file, 1);
1667 event_test_stuff(); 2506 event_test_stuff();
1668 ftrace_event_enable_disable(call, 0); 2507 ftrace_event_enable_disable(file, 0);
1669 2508
1670 pr_cont("OK\n"); 2509 pr_cont("OK\n");
1671 } 2510 }
@@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void)
1674 2513
1675 pr_info("Running tests on trace event systems:\n"); 2514 pr_info("Running tests on trace event systems:\n");
1676 2515
1677 list_for_each_entry(system, &event_subsystems, list) { 2516 list_for_each_entry(dir, &tr->systems, list) {
2517
2518 system = dir->subsystem;
1678 2519
1679 /* the ftrace system is special, skip it */ 2520 /* the ftrace system is special, skip it */
1680 if (strcmp(system->name, "ftrace") == 0) 2521 if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void)
1682 2523
1683 pr_info("Testing event system %s: ", system->name); 2524 pr_info("Testing event system %s: ", system->name);
1684 2525
1685 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); 2526 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
1686 if (WARN_ON_ONCE(ret)) { 2527 if (WARN_ON_ONCE(ret)) {
1687 pr_warning("error enabling system %s\n", 2528 pr_warning("error enabling system %s\n",
1688 system->name); 2529 system->name);
@@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void)
1691 2532
1692 event_test_stuff(); 2533 event_test_stuff();
1693 2534
1694 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 2535 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
1695 if (WARN_ON_ONCE(ret)) { 2536 if (WARN_ON_ONCE(ret)) {
1696 pr_warning("error disabling system %s\n", 2537 pr_warning("error disabling system %s\n",
1697 system->name); 2538 system->name);
@@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void)
1706 pr_info("Running tests on all trace events:\n"); 2547 pr_info("Running tests on all trace events:\n");
1707 pr_info("Testing all events: "); 2548 pr_info("Testing all events: ");
1708 2549
1709 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); 2550 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
1710 if (WARN_ON_ONCE(ret)) { 2551 if (WARN_ON_ONCE(ret)) {
1711 pr_warning("error enabling all events\n"); 2552 pr_warning("error enabling all events\n");
1712 return; 2553 return;
@@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void)
1715 event_test_stuff(); 2556 event_test_stuff();
1716 2557
1717 /* reset sysname */ 2558 /* reset sysname */
1718 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); 2559 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
1719 if (WARN_ON_ONCE(ret)) { 2560 if (WARN_ON_ONCE(ret)) {
1720 pr_warning("error disabling all events\n"); 2561 pr_warning("error disabling all events\n");
1721 return; 2562 return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..a6361178de5a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
658 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
659} 659}
660 660
661static struct ftrace_event_field *
662__find_event_field(struct list_head *head, char *name)
663{
664 struct ftrace_event_field *field;
665
666 list_for_each_entry(field, head, link) {
667 if (!strcmp(field->name, name))
668 return field;
669 }
670
671 return NULL;
672}
673
674static struct ftrace_event_field *
675find_event_field(struct ftrace_event_call *call, char *name)
676{
677 struct ftrace_event_field *field;
678 struct list_head *head;
679
680 field = __find_event_field(&ftrace_common_fields, name);
681 if (field)
682 return field;
683
684 head = trace_get_fields(call);
685 return __find_event_field(head, name);
686}
687
688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 661static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
689{ 662{
690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); 663 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1337 return NULL; 1310 return NULL;
1338 } 1311 }
1339 1312
1340 field = find_event_field(call, operand1); 1313 field = trace_find_event_field(call, operand1);
1341 if (!field) { 1314 if (!field) {
1342 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); 1315 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1343 return NULL; 1316 return NULL;
@@ -1907,16 +1880,17 @@ out_unlock:
1907 return err; 1880 return err;
1908} 1881}
1909 1882
1910int apply_subsystem_event_filter(struct event_subsystem *system, 1883int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1911 char *filter_string) 1884 char *filter_string)
1912{ 1885{
1886 struct event_subsystem *system = dir->subsystem;
1913 struct event_filter *filter; 1887 struct event_filter *filter;
1914 int err = 0; 1888 int err = 0;
1915 1889
1916 mutex_lock(&event_mutex); 1890 mutex_lock(&event_mutex);
1917 1891
1918 /* Make sure the system still has events */ 1892 /* Make sure the system still has events */
1919 if (!system->nr_events) { 1893 if (!dir->nr_events) {
1920 err = -ENODEV; 1894 err = -ENODEV;
1921 goto out_unlock; 1895 goto out_unlock;
1922 } 1896 }
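With apply_subsystem_event_filter() now taking the per-instance ftrace_subsystem_dir, the user-visible filter interface is unchanged. A usage sketch, assuming debugfs is mounted at /sys/kernel/debug/tracing:

  # echo 'common_pid == 0' > /sys/kernel/debug/tracing/events/sched/filter
  # echo 0 > /sys/kernel/debug/tracing/events/sched/filter

The first write applies the predicate to every event in the sched subsystem; writing 0 clears the subsystem filter again.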
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037d..d21a74670088 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \
129 129
130#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
132int \ 132static int __init \
133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
134{ \ 134{ \
135 struct struct_name field; \ 135 struct struct_name field; \
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \ 169 regfn) \
170 \ 170 \
171struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class __refdata event_class_ftrace_##call = { \
172 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
173 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 601152523326..c4d6d7191988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
28static int function_trace_init(struct trace_array *tr) 28static int function_trace_init(struct trace_array *tr)
29{ 29{
30 func_trace = tr; 30 func_trace = tr;
31 tr->cpu = get_cpu(); 31 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 32 put_cpu();
33 33
34 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
44 44
45static void function_trace_start(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
46{ 46{
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(&tr->trace_buffer);
48} 48}
49 49
50/* Our option */ 50/* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
76 goto out; 76 goto out;
77 77
78 cpu = smp_processor_id(); 78 cpu = smp_processor_id();
79 data = tr->data[cpu]; 79 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
80 if (!atomic_read(&data->disabled)) { 80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags); 81 local_save_flags(flags);
82 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
107 */ 107 */
108 local_irq_save(flags); 108 local_irq_save(flags);
109 cpu = raw_smp_processor_id(); 109 cpu = raw_smp_processor_id();
110 data = tr->data[cpu]; 110 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
111 disabled = atomic_inc_return(&data->disabled); 111 disabled = atomic_inc_return(&data->disabled);
112 112
113 if (likely(disabled == 1)) { 113 if (likely(disabled == 1)) {
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =
214}; 214};
215 215
216#ifdef CONFIG_DYNAMIC_FTRACE 216#ifdef CONFIG_DYNAMIC_FTRACE
217static void 217static int update_count(void **data)
218ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
219{ 218{
220 long *count = (long *)data; 219 unsigned long *count = (long *)data;
221
222 if (tracing_is_on())
223 return;
224 220
225 if (!*count) 221 if (!*count)
226 return; 222 return 0;
227 223
228 if (*count != -1) 224 if (*count != -1)
229 (*count)--; 225 (*count)--;
230 226
231 tracing_on(); 227 return 1;
232} 228}
233 229
234static void 230static void
235ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) 231ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
236{ 232{
237 long *count = (long *)data; 233 if (tracing_is_on())
234 return;
235
236 if (update_count(data))
237 tracing_on();
238}
238 239
240static void
241ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
242{
239 if (!tracing_is_on()) 243 if (!tracing_is_on())
240 return; 244 return;
241 245
242 if (!*count) 246 if (update_count(data))
247 tracing_off();
248}
249
250static void
251ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
252{
253 if (tracing_is_on())
243 return; 254 return;
244 255
245 if (*count != -1) 256 tracing_on();
246 (*count)--; 257}
258
259static void
260ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
261{
262 if (!tracing_is_on())
263 return;
247 264
248 tracing_off(); 265 tracing_off();
249} 266}
250 267
251static int 268/*
252ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 269 * Skip 4:
253 struct ftrace_probe_ops *ops, void *data); 270 * ftrace_stacktrace()
271 * function_trace_probe_call()
272 * ftrace_ops_list_func()
273 * ftrace_call()
274 */
275#define STACK_SKIP 4
254 276
255static struct ftrace_probe_ops traceon_probe_ops = { 277static void
256 .func = ftrace_traceon, 278ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
257 .print = ftrace_trace_onoff_print, 279{
258}; 280 trace_dump_stack(STACK_SKIP);
281}
259 282
260static struct ftrace_probe_ops traceoff_probe_ops = { 283static void
261 .func = ftrace_traceoff, 284ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
262 .print = ftrace_trace_onoff_print, 285{
263}; 286 if (!tracing_is_on())
287 return;
288
289 if (update_count(data))
290 trace_dump_stack(STACK_SKIP);
291}
264 292
265static int 293static int
266ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 294ftrace_probe_print(const char *name, struct seq_file *m,
267 struct ftrace_probe_ops *ops, void *data) 295 unsigned long ip, void *data)
268{ 296{
269 long count = (long)data; 297 long count = (long)data;
270 298
271 seq_printf(m, "%ps:", (void *)ip); 299 seq_printf(m, "%ps:%s", (void *)ip, name);
272
273 if (ops == &traceon_probe_ops)
274 seq_printf(m, "traceon");
275 else
276 seq_printf(m, "traceoff");
277 300
278 if (count == -1) 301 if (count == -1)
279 seq_printf(m, ":unlimited\n"); 302 seq_printf(m, ":unlimited\n");
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
284} 307}
285 308
286static int 309static int
287ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) 310ftrace_traceon_print(struct seq_file *m, unsigned long ip,
311 struct ftrace_probe_ops *ops, void *data)
288{ 312{
289 struct ftrace_probe_ops *ops; 313 return ftrace_probe_print("traceon", m, ip, data);
290 314}
291 /* we register both traceon and traceoff to this callback */
292 if (strcmp(cmd, "traceon") == 0)
293 ops = &traceon_probe_ops;
294 else
295 ops = &traceoff_probe_ops;
296 315
297 unregister_ftrace_function_probe_func(glob, ops); 316static int
317ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
318 struct ftrace_probe_ops *ops, void *data)
319{
320 return ftrace_probe_print("traceoff", m, ip, data);
321}
298 322
299 return 0; 323static int
324ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
325 struct ftrace_probe_ops *ops, void *data)
326{
327 return ftrace_probe_print("stacktrace", m, ip, data);
300} 328}
301 329
330static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print,
333};
334
335static struct ftrace_probe_ops traceoff_count_probe_ops = {
336 .func = ftrace_traceoff_count,
337 .print = ftrace_traceoff_print,
338};
339
340static struct ftrace_probe_ops stacktrace_count_probe_ops = {
341 .func = ftrace_stacktrace_count,
342 .print = ftrace_stacktrace_print,
343};
344
345static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon,
347 .print = ftrace_traceon_print,
348};
349
350static struct ftrace_probe_ops traceoff_probe_ops = {
351 .func = ftrace_traceoff,
352 .print = ftrace_traceoff_print,
353};
354
355static struct ftrace_probe_ops stacktrace_probe_ops = {
356 .func = ftrace_stacktrace,
357 .print = ftrace_stacktrace_print,
358};
359
302static int 360static int
303ftrace_trace_onoff_callback(struct ftrace_hash *hash, 361ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
304 char *glob, char *cmd, char *param, int enable) 362 struct ftrace_hash *hash, char *glob,
363 char *cmd, char *param, int enable)
305{ 364{
306 struct ftrace_probe_ops *ops;
307 void *count = (void *)-1; 365 void *count = (void *)-1;
308 char *number; 366 char *number;
309 int ret; 367 int ret;
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
312 if (!enable) 370 if (!enable)
313 return -EINVAL; 371 return -EINVAL;
314 372
315 if (glob[0] == '!') 373 if (glob[0] == '!') {
316 return ftrace_trace_onoff_unreg(glob+1, cmd, param); 374 unregister_ftrace_function_probe_func(glob+1, ops);
317 375 return 0;
318 /* we register both traceon and traceoff to this callback */ 376 }
319 if (strcmp(cmd, "traceon") == 0)
320 ops = &traceon_probe_ops;
321 else
322 ops = &traceoff_probe_ops;
323 377
324 if (!param) 378 if (!param)
325 goto out_reg; 379 goto out_reg;
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
343 return ret < 0 ? ret : 0; 397 return ret < 0 ? ret : 0;
344} 398}
345 399
400static int
401ftrace_trace_onoff_callback(struct ftrace_hash *hash,
402 char *glob, char *cmd, char *param, int enable)
403{
404 struct ftrace_probe_ops *ops;
405
406 /* we register both traceon and traceoff to this callback */
407 if (strcmp(cmd, "traceon") == 0)
408 ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
409 else
410 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
411
412 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
413 param, enable);
414}
415
416static int
417ftrace_stacktrace_callback(struct ftrace_hash *hash,
418 char *glob, char *cmd, char *param, int enable)
419{
420 struct ftrace_probe_ops *ops;
421
422 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
423
424 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
425 param, enable);
426}
427
346static struct ftrace_func_command ftrace_traceon_cmd = { 428static struct ftrace_func_command ftrace_traceon_cmd = {
347 .name = "traceon", 429 .name = "traceon",
348 .func = ftrace_trace_onoff_callback, 430 .func = ftrace_trace_onoff_callback,
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
353 .func = ftrace_trace_onoff_callback, 435 .func = ftrace_trace_onoff_callback,
354}; 436};
355 437
438static struct ftrace_func_command ftrace_stacktrace_cmd = {
439 .name = "stacktrace",
440 .func = ftrace_stacktrace_callback,
441};
442
356static int __init init_func_cmd_traceon(void) 443static int __init init_func_cmd_traceon(void)
357{ 444{
358 int ret; 445 int ret;
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
364 ret = register_ftrace_command(&ftrace_traceon_cmd); 451 ret = register_ftrace_command(&ftrace_traceon_cmd);
365 if (ret) 452 if (ret)
366 unregister_ftrace_command(&ftrace_traceoff_cmd); 453 unregister_ftrace_command(&ftrace_traceoff_cmd);
454
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) {
457 unregister_ftrace_command(&ftrace_traceoff_cmd);
458 unregister_ftrace_command(&ftrace_traceon_cmd);
459 }
367 return ret; 460 return ret;
368} 461}
369#else 462#else
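The new stacktrace command follows the same set_ftrace_filter syntax as traceon/traceoff. A minimal usage sketch, assuming debugfs is mounted at /sys/kernel/debug/tracing (kfree is only an example function):

  # cd /sys/kernel/debug/tracing
  # echo 'kfree:stacktrace:3' > set_ftrace_filter
  # echo '!kfree:stacktrace' > set_ftrace_filter

The first write records a kernel stack trace into the ring buffer the next three times kfree() is hit; the optional count is handled by stacktrace_count_probe_ops above, and the '!' form removes the probe.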
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 39ada66389cc..8388bc99f2ee 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
218{ 218{
219 struct ftrace_event_call *call = &event_funcgraph_entry; 219 struct ftrace_event_call *call = &event_funcgraph_entry;
220 struct ring_buffer_event *event; 220 struct ring_buffer_event *event;
221 struct ring_buffer *buffer = tr->buffer; 221 struct ring_buffer *buffer = tr->trace_buffer.buffer;
222 struct ftrace_graph_ent_entry *entry; 222 struct ftrace_graph_ent_entry *entry;
223 223
224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
265 265
266 local_irq_save(flags); 266 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 267 cpu = raw_smp_processor_id();
268 data = tr->data[cpu]; 268 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
269 disabled = atomic_inc_return(&data->disabled); 269 disabled = atomic_inc_return(&data->disabled);
270 if (likely(disabled == 1)) { 270 if (likely(disabled == 1)) {
271 pc = preempt_count(); 271 pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
323{ 323{
324 struct ftrace_event_call *call = &event_funcgraph_exit; 324 struct ftrace_event_call *call = &event_funcgraph_exit;
325 struct ring_buffer_event *event; 325 struct ring_buffer_event *event;
326 struct ring_buffer *buffer = tr->buffer; 326 struct ring_buffer *buffer = tr->trace_buffer.buffer;
327 struct ftrace_graph_ret_entry *entry; 327 struct ftrace_graph_ret_entry *entry;
328 328
329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
350 350
351 local_irq_save(flags); 351 local_irq_save(flags);
352 cpu = raw_smp_processor_id(); 352 cpu = raw_smp_processor_id();
353 data = tr->data[cpu]; 353 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
354 disabled = atomic_inc_return(&data->disabled); 354 disabled = atomic_inc_return(&data->disabled);
355 if (likely(disabled == 1)) { 355 if (likely(disabled == 1)) {
356 pc = preempt_count(); 356 pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
560 * We need to consume the current entry to see 560 * We need to consume the current entry to see
561 * the next one. 561 * the next one.
562 */ 562 */
563 ring_buffer_consume(iter->tr->buffer, iter->cpu, 563 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
564 NULL, NULL); 564 NULL, NULL);
565 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 565 event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
566 NULL, NULL); 566 NULL, NULL);
567 } 567 }
568 568
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 443b25b43b4f..b19d065a28cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -33,6 +33,7 @@ enum {
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_flags; 35static int save_flags;
36static bool function_enabled;
36 37
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph); 38static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph); 39static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
121 if (!irqs_disabled_flags(*flags)) 122 if (!irqs_disabled_flags(*flags))
122 return 0; 123 return 0;
123 124
124 *data = tr->data[cpu]; 125 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
125 disabled = atomic_inc_return(&(*data)->disabled); 126 disabled = atomic_inc_return(&(*data)->disabled);
126 127
127 if (likely(disabled == 1)) 128 if (likely(disabled == 1))
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
175 per_cpu(tracing_cpu, cpu) = 0; 176 per_cpu(tracing_cpu, cpu) = 0;
176 177
177 tracing_max_latency = 0; 178 tracing_max_latency = 0;
178 tracing_reset_online_cpus(irqsoff_trace); 179 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
179 180
180 return start_irqsoff_tracer(irqsoff_trace, set); 181 return start_irqsoff_tracer(irqsoff_trace, set);
181} 182}
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
380 if (per_cpu(tracing_cpu, cpu)) 381 if (per_cpu(tracing_cpu, cpu))
381 return; 382 return;
382 383
383 data = tr->data[cpu]; 384 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
384 385
385 if (unlikely(!data) || atomic_read(&data->disabled)) 386 if (unlikely(!data) || atomic_read(&data->disabled))
386 return; 387 return;
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
418 if (!tracer_enabled) 419 if (!tracer_enabled)
419 return; 420 return;
420 421
421 data = tr->data[cpu]; 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
422 423
423 if (unlikely(!data) || 424 if (unlikely(!data) ||
424 !data->critical_start || atomic_read(&data->disabled)) 425 !data->critical_start || atomic_read(&data->disabled))
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
528} 529}
529#endif /* CONFIG_PREEMPT_TRACER */ 530#endif /* CONFIG_PREEMPT_TRACER */
530 531
531static int start_irqsoff_tracer(struct trace_array *tr, int graph) 532static int register_irqsoff_function(int graph, int set)
532{ 533{
533 int ret = 0; 534 int ret;
534 535
535 if (!graph) 536 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
536 ret = register_ftrace_function(&trace_ops); 537 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
537 else 538 return 0;
539
540 if (graph)
538 ret = register_ftrace_graph(&irqsoff_graph_return, 541 ret = register_ftrace_graph(&irqsoff_graph_return,
539 &irqsoff_graph_entry); 542 &irqsoff_graph_entry);
543 else
544 ret = register_ftrace_function(&trace_ops);
545
546 if (!ret)
547 function_enabled = true;
548
549 return ret;
550}
551
552static void unregister_irqsoff_function(int graph)
553{
554 if (!function_enabled)
555 return;
556
557 if (graph)
558 unregister_ftrace_graph();
559 else
560 unregister_ftrace_function(&trace_ops);
561
562 function_enabled = false;
563}
564
565static void irqsoff_function_set(int set)
566{
567 if (set)
568 register_irqsoff_function(is_graph(), 1);
569 else
570 unregister_irqsoff_function(is_graph());
571}
572
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
574{
575 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set);
577
578 return trace_keep_overwrite(tracer, mask, set);
579}
580
581static int start_irqsoff_tracer(struct trace_array *tr, int graph)
582{
583 int ret;
584
585 ret = register_irqsoff_function(graph, 0);
540 586
541 if (!ret && tracing_is_enabled()) 587 if (!ret && tracing_is_enabled())
542 tracer_enabled = 1; 588 tracer_enabled = 1;
@@ -550,10 +596,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
550{ 596{
551 tracer_enabled = 0; 597 tracer_enabled = 0;
552 598
553 if (!graph) 599 unregister_irqsoff_function(graph);
554 unregister_ftrace_function(&trace_ops);
555 else
556 unregister_ftrace_graph();
557} 600}
558 601
559static void __irqsoff_tracer_init(struct trace_array *tr) 602static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -561,14 +604,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
561 save_flags = trace_flags; 604 save_flags = trace_flags;
562 605
563 /* non overwrite screws up the latency tracers */ 606 /* non overwrite screws up the latency tracers */
564 set_tracer_flag(TRACE_ITER_OVERWRITE, 1); 607 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
565 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); 608 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
566 609
567 tracing_max_latency = 0; 610 tracing_max_latency = 0;
568 irqsoff_trace = tr; 611 irqsoff_trace = tr;
569 /* make sure that the tracer is visible */ 612 /* make sure that the tracer is visible */
570 smp_wmb(); 613 smp_wmb();
571 tracing_reset_online_cpus(tr); 614 tracing_reset_online_cpus(&tr->trace_buffer);
572 615
573 if (start_irqsoff_tracer(tr, is_graph())) 616 if (start_irqsoff_tracer(tr, is_graph()))
574 printk(KERN_ERR "failed to start irqsoff tracer\n"); 617 printk(KERN_ERR "failed to start irqsoff tracer\n");
@@ -581,8 +624,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
581 624
582 stop_irqsoff_tracer(tr, is_graph()); 625 stop_irqsoff_tracer(tr, is_graph());
583 626
584 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); 627 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
585 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); 628 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
586} 629}
587 630
588static void irqsoff_tracer_start(struct trace_array *tr) 631static void irqsoff_tracer_start(struct trace_array *tr)
@@ -615,7 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
615 .print_line = irqsoff_print_line, 658 .print_line = irqsoff_print_line,
616 .flags = &tracer_flags, 659 .flags = &tracer_flags,
617 .set_flag = irqsoff_set_flag, 660 .set_flag = irqsoff_set_flag,
618 .flag_changed = trace_keep_overwrite, 661 .flag_changed = irqsoff_flag_changed,
619#ifdef CONFIG_FTRACE_SELFTEST 662#ifdef CONFIG_FTRACE_SELFTEST
620 .selftest = trace_selftest_startup_irqsoff, 663 .selftest = trace_selftest_startup_irqsoff,
621#endif 664#endif
@@ -649,7 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
649 .print_line = irqsoff_print_line, 692 .print_line = irqsoff_print_line,
650 .flags = &tracer_flags, 693 .flags = &tracer_flags,
651 .set_flag = irqsoff_set_flag, 694 .set_flag = irqsoff_set_flag,
652 .flag_changed = trace_keep_overwrite, 695 .flag_changed = irqsoff_flag_changed,
653#ifdef CONFIG_FTRACE_SELFTEST 696#ifdef CONFIG_FTRACE_SELFTEST
654 .selftest = trace_selftest_startup_preemptoff, 697 .selftest = trace_selftest_startup_preemptoff,
655#endif 698#endif
@@ -685,7 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
685 .print_line = irqsoff_print_line, 728 .print_line = irqsoff_print_line,
686 .flags = &tracer_flags, 729 .flags = &tracer_flags,
687 .set_flag = irqsoff_set_flag, 730 .set_flag = irqsoff_set_flag,
688 .flag_changed = trace_keep_overwrite, 731 .flag_changed = irqsoff_flag_changed,
689#ifdef CONFIG_FTRACE_SELFTEST 732#ifdef CONFIG_FTRACE_SELFTEST
690 .selftest = trace_selftest_startup_preemptirqsoff, 733 .selftest = trace_selftest_startup_preemptirqsoff,
691#endif 734#endif
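The function_enabled bookkeeping above lets the irqsoff family of tracers react to the function tracing option being flipped while they are running, instead of only at tracer init. A usage sketch, assuming the option is exposed as options/function-trace under the tracing directory:

  # cd /sys/kernel/debug/tracing
  # echo 0 > options/function-trace
  # echo irqsoff > current_tracer

With the option cleared, the latency tracer still times the irqs-off sections but skips per-function tracing, which lowers the overhead added to the measured latencies.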
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b3..bd90e1b06088 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
26 trace_init_global_iter(&iter); 26 trace_init_global_iter(&iter);
27 27
28 for_each_tracing_cpu(cpu) { 28 for_each_tracing_cpu(cpu) {
29 atomic_inc(&iter.tr->data[cpu]->disabled); 29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
30 } 30 }
31 31
32 old_userobj = trace_flags; 32 old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
43 iter.iter_flags |= TRACE_FILE_LAT_FMT; 43 iter.iter_flags |= TRACE_FILE_LAT_FMT;
44 iter.pos = -1; 44 iter.pos = -1;
45 45
46 if (cpu_file == TRACE_PIPE_ALL_CPU) { 46 if (cpu_file == RING_BUFFER_ALL_CPUS) {
47 for_each_tracing_cpu(cpu) { 47 for_each_tracing_cpu(cpu) {
48 iter.buffer_iter[cpu] = 48 iter.buffer_iter[cpu] =
49 ring_buffer_read_prepare(iter.tr->buffer, cpu); 49 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
50 ring_buffer_read_start(iter.buffer_iter[cpu]); 50 ring_buffer_read_start(iter.buffer_iter[cpu]);
51 tracing_iter_reset(&iter, cpu); 51 tracing_iter_reset(&iter, cpu);
52 } 52 }
53 } else { 53 } else {
54 iter.cpu_file = cpu_file; 54 iter.cpu_file = cpu_file;
55 iter.buffer_iter[cpu_file] = 55 iter.buffer_iter[cpu_file] =
56 ring_buffer_read_prepare(iter.tr->buffer, cpu_file); 56 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 57 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 58 tracing_iter_reset(&iter, cpu_file);
59 } 59 }
@@ -83,7 +83,7 @@ out:
83 trace_flags = old_userobj; 83 trace_flags = old_userobj;
84 84
85 for_each_tracing_cpu(cpu) { 85 for_each_tracing_cpu(cpu) {
86 atomic_dec(&iter.tr->data[cpu]->disabled); 86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 87 }
88 88
89 for_each_tracing_cpu(cpu) 89 for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
115 !cpu_online(cpu_file)) 115 !cpu_online(cpu_file))
116 return KDB_BADINT; 116 return KDB_BADINT;
117 } else { 117 } else {
118 cpu_file = TRACE_PIPE_ALL_CPU; 118 cpu_file = RING_BUFFER_ALL_CPUS;
119 } 119 }
120 120
121 kdb_trap_printk++; 121 kdb_trap_printk++;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e5..a5e8f4878bfa 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
31 overrun_detected = false; 31 overrun_detected = false;
32 prev_overruns = 0; 32 prev_overruns = 0;
33 33
34 tracing_reset_online_cpus(tr); 34 tracing_reset_online_cpus(&tr->trace_buffer);
35} 35}
36 36
37static int mmio_trace_init(struct trace_array *tr) 37static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
128static unsigned long count_overruns(struct trace_iterator *iter) 128static unsigned long count_overruns(struct trace_iterator *iter)
129{ 129{
130 unsigned long cnt = atomic_xchg(&dropped_count, 0); 130 unsigned long cnt = atomic_xchg(&dropped_count, 0);
131 unsigned long over = ring_buffer_overruns(iter->tr->buffer); 131 unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
132 132
133 if (over > prev_overruns) 133 if (over > prev_overruns)
134 cnt += over - prev_overruns; 134 cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
309 struct mmiotrace_rw *rw) 309 struct mmiotrace_rw *rw)
310{ 310{
311 struct ftrace_event_call *call = &event_mmiotrace_rw; 311 struct ftrace_event_call *call = &event_mmiotrace_rw;
312 struct ring_buffer *buffer = tr->buffer; 312 struct ring_buffer *buffer = tr->trace_buffer.buffer;
313 struct ring_buffer_event *event; 313 struct ring_buffer_event *event;
314 struct trace_mmiotrace_rw *entry; 314 struct trace_mmiotrace_rw *entry;
315 int pc = preempt_count(); 315 int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
330void mmio_trace_rw(struct mmiotrace_rw *rw) 330void mmio_trace_rw(struct mmiotrace_rw *rw)
331{ 331{
332 struct trace_array *tr = mmio_trace_array; 332 struct trace_array *tr = mmio_trace_array;
333 struct trace_array_cpu *data = tr->data[smp_processor_id()]; 333 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
334 __trace_mmiotrace_rw(tr, data, rw); 334 __trace_mmiotrace_rw(tr, data, rw);
335} 335}
336 336
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
339 struct mmiotrace_map *map) 339 struct mmiotrace_map *map)
340{ 340{
341 struct ftrace_event_call *call = &event_mmiotrace_map; 341 struct ftrace_event_call *call = &event_mmiotrace_map;
342 struct ring_buffer *buffer = tr->buffer; 342 struct ring_buffer *buffer = tr->trace_buffer.buffer;
343 struct ring_buffer_event *event; 343 struct ring_buffer_event *event;
344 struct trace_mmiotrace_map *entry; 344 struct trace_mmiotrace_map *entry;
345 int pc = preempt_count(); 345 int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
363 struct trace_array_cpu *data; 363 struct trace_array_cpu *data;
364 364
365 preempt_disable(); 365 preempt_disable();
366 data = tr->data[smp_processor_id()]; 366 data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
367 __trace_mmiotrace_map(tr, data, map); 367 __trace_mmiotrace_map(tr, data, map);
368 preempt_enable(); 368 preempt_enable();
369} 369}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 697e88d13907..bb922d9ee51b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_sem);
18 18
19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
37 return ret; 37 return ret;
38} 38}
39 39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{
42 struct trace_seq *s = &iter->seq;
43 struct trace_entry *entry = iter->ent;
44 struct bputs_entry *field;
45 int ret;
46
47 trace_assign_type(field, entry);
48
49 ret = trace_seq_puts(s, field->str);
50 if (!ret)
51 return TRACE_TYPE_PARTIAL_LINE;
52
53 return TRACE_TYPE_HANDLED;
54}
55
40enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 56enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41{ 57{
42 struct trace_seq *s = &iter->seq; 58 struct trace_seq *s = &iter->seq;
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
397} 413}
398EXPORT_SYMBOL(ftrace_print_hex_seq); 414EXPORT_SYMBOL(ftrace_print_hex_seq);
399 415
416int ftrace_raw_output_prep(struct trace_iterator *iter,
417 struct trace_event *trace_event)
418{
419 struct ftrace_event_call *event;
420 struct trace_seq *s = &iter->seq;
421 struct trace_seq *p = &iter->tmp_seq;
422 struct trace_entry *entry;
423 int ret;
424
425 event = container_of(trace_event, struct ftrace_event_call, event);
426 entry = iter->ent;
427
428 if (entry->type != event->event.type) {
429 WARN_ON_ONCE(1);
430 return TRACE_TYPE_UNHANDLED;
431 }
432
433 trace_seq_init(p);
434 ret = trace_seq_printf(s, "%s: ", event->name);
435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE;
437
438 return 0;
439}
440EXPORT_SYMBOL(ftrace_raw_output_prep);
441
400#ifdef CONFIG_KRETPROBES 442#ifdef CONFIG_KRETPROBES
401static inline const char *kretprobed(const char *name) 443static inline const char *kretprobed(const char *name)
402{ 444{
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617{ 659{
618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; 660 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; 661 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
620 unsigned long long abs_ts = iter->ts - iter->tr->time_start; 662 unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts; 663 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq; 664 struct trace_seq *s = &iter->seq;
623 665
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)
783 825
784void trace_event_read_lock(void) 826void trace_event_read_lock(void)
785{ 827{
786 down_read(&trace_event_mutex); 828 down_read(&trace_event_sem);
787} 829}
788 830
789void trace_event_read_unlock(void) 831void trace_event_read_unlock(void)
790{ 832{
791 up_read(&trace_event_mutex); 833 up_read(&trace_event_sem);
792} 834}
793 835
794/** 836/**
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)
811 unsigned key; 853 unsigned key;
812 int ret = 0; 854 int ret = 0;
813 855
814 down_write(&trace_event_mutex); 856 down_write(&trace_event_sem);
815 857
816 if (WARN_ON(!event)) 858 if (WARN_ON(!event))
817 goto out; 859 goto out;
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)
866 908
867 ret = event->type; 909 ret = event->type;
868 out: 910 out:
869 up_write(&trace_event_mutex); 911 up_write(&trace_event_sem);
870 912
871 return ret; 913 return ret;
872} 914}
873EXPORT_SYMBOL_GPL(register_ftrace_event); 915EXPORT_SYMBOL_GPL(register_ftrace_event);
874 916
875/* 917/*
876 * Used by module code with the trace_event_mutex held for write. 918 * Used by module code with the trace_event_sem held for write.
877 */ 919 */
878int __unregister_ftrace_event(struct trace_event *event) 920int __unregister_ftrace_event(struct trace_event *event)
879{ 921{
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)
888 */ 930 */
889int unregister_ftrace_event(struct trace_event *event) 931int unregister_ftrace_event(struct trace_event *event)
890{ 932{
891 down_write(&trace_event_mutex); 933 down_write(&trace_event_sem);
892 __unregister_ftrace_event(event); 934 __unregister_ftrace_event(event);
893 up_write(&trace_event_mutex); 935 up_write(&trace_event_sem);
894 936
895 return 0; 937 return 0;
896} 938}
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {
1217 .funcs = &trace_user_stack_funcs, 1259 .funcs = &trace_user_stack_funcs,
1218}; 1260};
1219 1261
1262/* TRACE_BPUTS */
1263static enum print_line_t
1264trace_bputs_print(struct trace_iterator *iter, int flags,
1265 struct trace_event *event)
1266{
1267 struct trace_entry *entry = iter->ent;
1268 struct trace_seq *s = &iter->seq;
1269 struct bputs_entry *field;
1270
1271 trace_assign_type(field, entry);
1272
1273 if (!seq_print_ip_sym(s, field->ip, flags))
1274 goto partial;
1275
1276 if (!trace_seq_puts(s, ": "))
1277 goto partial;
1278
1279 if (!trace_seq_puts(s, field->str))
1280 goto partial;
1281
1282 return TRACE_TYPE_HANDLED;
1283
1284 partial:
1285 return TRACE_TYPE_PARTIAL_LINE;
1286}
1287
1288
1289static enum print_line_t
1290trace_bputs_raw(struct trace_iterator *iter, int flags,
1291 struct trace_event *event)
1292{
1293 struct bputs_entry *field;
1294 struct trace_seq *s = &iter->seq;
1295
1296 trace_assign_type(field, iter->ent);
1297
1298 if (!trace_seq_printf(s, ": %lx : ", field->ip))
1299 goto partial;
1300
1301 if (!trace_seq_puts(s, field->str))
1302 goto partial;
1303
1304 return TRACE_TYPE_HANDLED;
1305
1306 partial:
1307 return TRACE_TYPE_PARTIAL_LINE;
1308}
1309
1310static struct trace_event_functions trace_bputs_funcs = {
1311 .trace = trace_bputs_print,
1312 .raw = trace_bputs_raw,
1313};
1314
1315static struct trace_event trace_bputs_event = {
1316 .type = TRACE_BPUTS,
1317 .funcs = &trace_bputs_funcs,
1318};
1319
1220/* TRACE_BPRINT */ 1320/* TRACE_BPRINT */
1221static enum print_line_t 1321static enum print_line_t
1222trace_bprint_print(struct trace_iterator *iter, int flags, 1322trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {
1329 &trace_wake_event, 1429 &trace_wake_event,
1330 &trace_stack_event, 1430 &trace_stack_event,
1331 &trace_user_stack_event, 1431 &trace_user_stack_event,
1432 &trace_bputs_event,
1332 &trace_bprint_event, 1433 &trace_bprint_event,
1333 &trace_print_event, 1434 &trace_print_event,
1334 NULL 1435 NULL
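The new TRACE_BPUTS handlers above follow the usual print-callback convention: each trace_seq_* writer reports failure when the per-iterator buffer is full, and the handler returns TRACE_TYPE_PARTIAL_LINE so the core can flush and retry. A small standalone model of that convention follows; the enum values, trace_seq size and seq_puts() here are simplified assumptions, not the kernel definitions.

/*
 * Minimal model of the TRACE_BPUTS print convention: writers return 0 when
 * the output buffer is full, and the handler reports TRACE_TYPE_PARTIAL_LINE.
 */
#include <stdio.h>
#include <string.h>

enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, TRACE_TYPE_HANDLED = 1 };

struct trace_seq {
	char buf[32];
	size_t len;
};

/* returns 1 on success, 0 if the string did not fit (like trace_seq_puts) */
static int seq_puts(struct trace_seq *s, const char *str)
{
	size_t n = strlen(str);

	if (s->len + n >= sizeof(s->buf))
		return 0;
	memcpy(s->buf + s->len, str, n);
	s->len += n;
	s->buf[s->len] = '\0';
	return 1;
}

static enum print_line_t print_bputs(struct trace_seq *s,
				     unsigned long ip, const char *str)
{
	char addr[20];

	snprintf(addr, sizeof(addr), "%lx", ip);
	if (!seq_puts(s, addr) || !seq_puts(s, ": ") || !seq_puts(s, str))
		return TRACE_TYPE_PARTIAL_LINE;
	return TRACE_TYPE_HANDLED;
}

int main(void)
{
	struct trace_seq s = { .len = 0 };

	if (print_bputs(&s, 0xc0ffee, "hello from bputs") == TRACE_TYPE_HANDLED)
		printf("%s\n", s.buf);
	else
		printf("partial line, would retry\n");
	return 0;
}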
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492b..127a9d8c8357 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -5,6 +5,8 @@
5#include "trace.h" 5#include "trace.h"
6 6
7extern enum print_line_t 7extern enum print_line_t
8trace_print_bputs_msg_only(struct trace_iterator *iter);
9extern enum print_line_t
8trace_print_bprintk_msg_only(struct trace_iterator *iter); 10trace_print_bprintk_msg_only(struct trace_iterator *iter);
9extern enum print_line_t 11extern enum print_line_t
10trace_print_printk_msg_only(struct trace_iterator *iter); 12trace_print_printk_msg_only(struct trace_iterator *iter);
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 33
32/* used by module unregistering */ 34/* used by module unregistering */
33extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
34extern struct rw_semaphore trace_event_mutex; 36extern struct rw_semaphore trace_event_sem;
35 37
36#define MAX_MEMHEX_BYTES 8 38#define MAX_MEMHEX_BYTES 8
37#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd8..4e98e3b257a3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
28 unsigned long flags, int pc) 28 unsigned long flags, int pc)
29{ 29{
30 struct ftrace_event_call *call = &event_context_switch; 30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer; 31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event; 32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry; 33 struct ctx_switch_entry *entry;
34 34
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
69 pc = preempt_count(); 69 pc = preempt_count();
70 local_irq_save(flags); 70 local_irq_save(flags);
71 cpu = raw_smp_processor_id(); 71 cpu = raw_smp_processor_id();
72 data = ctx_trace->data[cpu]; 72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73 73
74 if (likely(!atomic_read(&data->disabled))) 74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); 75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
86 struct ftrace_event_call *call = &event_wakeup; 86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event; 87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry; 88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->buffer; 89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90 90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, 91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc); 92 sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
123 pc = preempt_count(); 123 pc = preempt_count();
124 local_irq_save(flags); 124 local_irq_save(flags);
125 cpu = raw_smp_processor_id(); 125 cpu = raw_smp_processor_id();
126 data = ctx_trace->data[cpu]; 126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127 127
128 if (likely(!atomic_read(&data->disabled))) 128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current, 129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fde652c9a511..fee77e15d815 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -37,6 +37,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace); 37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
38 38
39static int save_flags; 39static int save_flags;
40static bool function_enabled;
40 41
41#define TRACE_DISPLAY_GRAPH 1 42#define TRACE_DISPLAY_GRAPH 1
42 43
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
89 if (cpu != wakeup_current_cpu) 90 if (cpu != wakeup_current_cpu)
90 goto out_enable; 91 goto out_enable;
91 92
92 *data = tr->data[cpu]; 93 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
93 disabled = atomic_inc_return(&(*data)->disabled); 94 disabled = atomic_inc_return(&(*data)->disabled);
94 if (unlikely(disabled != 1)) 95 if (unlikely(disabled != 1))
95 goto out; 96 goto out;
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
134}; 135};
135#endif /* CONFIG_FUNCTION_TRACER */ 136#endif /* CONFIG_FUNCTION_TRACER */
136 137
137static int start_func_tracer(int graph) 138static int register_wakeup_function(int graph, int set)
138{ 139{
139 int ret; 140 int ret;
140 141
141 if (!graph) 142 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
142 ret = register_ftrace_function(&trace_ops); 143 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
143 else 144 return 0;
145
146 if (graph)
144 ret = register_ftrace_graph(&wakeup_graph_return, 147 ret = register_ftrace_graph(&wakeup_graph_return,
145 &wakeup_graph_entry); 148 &wakeup_graph_entry);
149 else
150 ret = register_ftrace_function(&trace_ops);
151
152 if (!ret)
153 function_enabled = true;
154
155 return ret;
156}
157
158static void unregister_wakeup_function(int graph)
159{
160 if (!function_enabled)
161 return;
162
163 if (graph)
164 unregister_ftrace_graph();
165 else
166 unregister_ftrace_function(&trace_ops);
167
168 function_enabled = false;
169}
170
171static void wakeup_function_set(int set)
172{
173 if (set)
174 register_wakeup_function(is_graph(), 1);
175 else
176 unregister_wakeup_function(is_graph());
177}
178
179static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
180{
181 if (mask & TRACE_ITER_FUNCTION)
182 wakeup_function_set(set);
183
184 return trace_keep_overwrite(tracer, mask, set);
185}
186
187static int start_func_tracer(int graph)
188{
189 int ret;
190
191 ret = register_wakeup_function(graph, 0);
146 192
147 if (!ret && tracing_is_enabled()) 193 if (!ret && tracing_is_enabled())
148 tracer_enabled = 1; 194 tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
156{ 202{
157 tracer_enabled = 0; 203 tracer_enabled = 0;
158 204
159 if (!graph) 205 unregister_wakeup_function(graph);
160 unregister_ftrace_function(&trace_ops);
161 else
162 unregister_ftrace_graph();
163} 206}
164 207
165#ifdef CONFIG_FUNCTION_GRAPH_TRACER 208#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,
353 396
354 /* disable local data, not wakeup_cpu data */ 397 /* disable local data, not wakeup_cpu data */
355 cpu = raw_smp_processor_id(); 398 cpu = raw_smp_processor_id();
356 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 399 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
357 if (likely(disabled != 1)) 400 if (likely(disabled != 1))
358 goto out; 401 goto out;
359 402
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,
365 goto out_unlock; 408 goto out_unlock;
366 409
367 /* The task we are waiting for is waking up */ 410 /* The task we are waiting for is waking up */
368 data = wakeup_trace->data[wakeup_cpu]; 411 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
369 412
370 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 413 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
371 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 414 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +430,7 @@ out_unlock:
387 arch_spin_unlock(&wakeup_lock); 430 arch_spin_unlock(&wakeup_lock);
388 local_irq_restore(flags); 431 local_irq_restore(flags);
389out: 432out:
390 atomic_dec(&wakeup_trace->data[cpu]->disabled); 433 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
391} 434}
392 435
393static void __wakeup_reset(struct trace_array *tr) 436static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)
405{ 448{
406 unsigned long flags; 449 unsigned long flags;
407 450
408 tracing_reset_online_cpus(tr); 451 tracing_reset_online_cpus(&tr->trace_buffer);
409 452
410 local_irq_save(flags); 453 local_irq_save(flags);
411 arch_spin_lock(&wakeup_lock); 454 arch_spin_lock(&wakeup_lock);
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
435 return; 478 return;
436 479
437 pc = preempt_count(); 480 pc = preempt_count();
438 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 481 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
439 if (unlikely(disabled != 1)) 482 if (unlikely(disabled != 1))
440 goto out; 483 goto out;
441 484
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
458 501
459 local_save_flags(flags); 502 local_save_flags(flags);
460 503
461 data = wakeup_trace->data[wakeup_cpu]; 504 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
462 data->preempt_timestamp = ftrace_now(cpu); 505 data->preempt_timestamp = ftrace_now(cpu);
463 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); 506 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
464 507
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472out_locked: 515out_locked:
473 arch_spin_unlock(&wakeup_lock); 516 arch_spin_unlock(&wakeup_lock);
474out: 517out:
475 atomic_dec(&wakeup_trace->data[cpu]->disabled); 518 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
476} 519}
477 520
478static void start_wakeup_tracer(struct trace_array *tr) 521static void start_wakeup_tracer(struct trace_array *tr)
@@ -543,8 +586,8 @@ static int __wakeup_tracer_init(struct trace_array *tr)
543 save_flags = trace_flags; 586 save_flags = trace_flags;
544 587
545 /* non overwrite screws up the latency tracers */ 588 /* non overwrite screws up the latency tracers */
546 set_tracer_flag(TRACE_ITER_OVERWRITE, 1); 589 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
547 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); 590 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
548 591
549 tracing_max_latency = 0; 592 tracing_max_latency = 0;
550 wakeup_trace = tr; 593 wakeup_trace = tr;
@@ -573,8 +616,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
573 /* make sure we put back any tasks we are tracing */ 616 /* make sure we put back any tasks we are tracing */
574 wakeup_reset(tr); 617 wakeup_reset(tr);
575 618
576 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); 619 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
577 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); 620 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
578} 621}
579 622
580static void wakeup_tracer_start(struct trace_array *tr) 623static void wakeup_tracer_start(struct trace_array *tr)
@@ -600,7 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
600 .print_line = wakeup_print_line, 643 .print_line = wakeup_print_line,
601 .flags = &tracer_flags, 644 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag, 645 .set_flag = wakeup_set_flag,
603 .flag_changed = trace_keep_overwrite, 646 .flag_changed = wakeup_flag_changed,
604#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
605 .selftest = trace_selftest_startup_wakeup, 648 .selftest = trace_selftest_startup_wakeup,
606#endif 649#endif
@@ -622,7 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
622 .print_line = wakeup_print_line, 665 .print_line = wakeup_print_line,
623 .flags = &tracer_flags, 666 .flags = &tracer_flags,
624 .set_flag = wakeup_set_flag, 667 .set_flag = wakeup_set_flag,
625 .flag_changed = trace_keep_overwrite, 668 .flag_changed = wakeup_flag_changed,
626#ifdef CONFIG_FTRACE_SELFTEST 669#ifdef CONFIG_FTRACE_SELFTEST
627 .selftest = trace_selftest_startup_wakeup, 670 .selftest = trace_selftest_startup_wakeup,
628#endif 671#endif
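The wakeup tracer changes above make function-callback registration idempotent: register_wakeup_function()/unregister_wakeup_function() remember their state in function_enabled, and the new flag_changed hook toggles them when the "function" trace option flips. The sketch below models that guard in userspace; the flag value and the printf stand-ins for ftrace registration are illustrative assumptions.

/*
 * Sketch of the register/unregister guard: attach the callback at most once,
 * remember it in function_enabled, and react to the option bit changing.
 */
#include <stdbool.h>
#include <stdio.h>

#define TRACE_ITER_FUNCTION 0x1	/* illustrative bit, not the kernel value */

static unsigned int trace_flags;
static bool function_enabled;

static int register_wakeup_function(int graph, int set)
{
	/* only attach if the option is (about to be) set, and only once */
	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
		return 0;

	printf("register %s callback\n", graph ? "graph" : "function");
	function_enabled = true;
	return 0;
}

static void unregister_wakeup_function(int graph)
{
	if (!function_enabled)
		return;

	printf("unregister %s callback\n", graph ? "graph" : "function");
	function_enabled = false;
}

static void wakeup_flag_changed(unsigned int mask, int set)
{
	if (mask & TRACE_ITER_FUNCTION) {
		if (set)
			register_wakeup_function(0, 1);
		else
			unregister_wakeup_function(0);
	}
	if (set)
		trace_flags |= mask;
	else
		trace_flags &= ~mask;
}

int main(void)
{
	wakeup_flag_changed(TRACE_ITER_FUNCTION, 1);	/* attaches once */
	wakeup_flag_changed(TRACE_ITER_FUNCTION, 1);	/* no double register */
	wakeup_flag_changed(TRACE_ITER_FUNCTION, 0);	/* detaches */
	return 0;
}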
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 51c819c12c29..55e2cf66967b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
21 return 0; 21 return 0;
22} 22}
23 23
24static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) 24static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
25{ 25{
26 struct ring_buffer_event *event; 26 struct ring_buffer_event *event;
27 struct trace_entry *entry; 27 struct trace_entry *entry;
28 unsigned int loops = 0; 28 unsigned int loops = 0;
29 29
30 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { 30 while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
31 entry = ring_buffer_event_data(event); 31 entry = ring_buffer_event_data(event);
32 32
33 /* 33 /*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
58 * Test the trace buffer to see if all the elements 58 * Test the trace buffer to see if all the elements
59 * are still sane. 59 * are still sane.
60 */ 60 */
61static int trace_test_buffer(struct trace_array *tr, unsigned long *count) 61static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
62{ 62{
63 unsigned long flags, cnt = 0; 63 unsigned long flags, cnt = 0;
64 int cpu, ret = 0; 64 int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&ftrace_max_lock);
69 69
70 cnt = ring_buffer_entries(tr->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
72 /* 72 /*
73 * The trace_test_buffer_cpu runs a while loop to consume all data. 73 * The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
78 */ 78 */
79 tracing_off(); 79 tracing_off();
80 for_each_possible_cpu(cpu) { 80 for_each_possible_cpu(cpu) {
81 ret = trace_test_buffer_cpu(tr, cpu); 81 ret = trace_test_buffer_cpu(buf, cpu);
82 if (ret) 82 if (ret)
83 break; 83 break;
84 } 84 }
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
355 msleep(100); 355 msleep(100);
356 356
357 /* we should have nothing in the buffer */ 357 /* we should have nothing in the buffer */
358 ret = trace_test_buffer(tr, &count); 358 ret = trace_test_buffer(&tr->trace_buffer, &count);
359 if (ret) 359 if (ret)
360 goto out; 360 goto out;
361 361
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
376 ftrace_enabled = 0; 376 ftrace_enabled = 0;
377 377
378 /* check the trace buffer */ 378 /* check the trace buffer */
379 ret = trace_test_buffer(tr, &count); 379 ret = trace_test_buffer(&tr->trace_buffer, &count);
380 tracing_start(); 380 tracing_start();
381 381
382 /* we should only have one item */ 382 /* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
666 ftrace_enabled = 0; 666 ftrace_enabled = 0;
667 667
668 /* check the trace buffer */ 668 /* check the trace buffer */
669 ret = trace_test_buffer(tr, &count); 669 ret = trace_test_buffer(&tr->trace_buffer, &count);
670 trace->reset(tr); 670 trace->reset(tr);
671 tracing_start(); 671 tracing_start();
672 672
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
703/* Maximum number of functions to trace before diagnosing a hang */ 703/* Maximum number of functions to trace before diagnosing a hang */
704#define GRAPH_MAX_FUNC_TEST 100000000 704#define GRAPH_MAX_FUNC_TEST 100000000
705 705
706static void
707__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
708static unsigned int graph_hang_thresh; 706static unsigned int graph_hang_thresh;
709 707
710/* Wrap the real function entry probe to avoid possible hanging */ 708/* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
714 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { 712 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
715 ftrace_graph_stop(); 713 ftrace_graph_stop();
716 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 714 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
717 if (ftrace_dump_on_oops) 715 if (ftrace_dump_on_oops) {
718 __ftrace_dump(false, DUMP_ALL); 716 ftrace_dump(DUMP_ALL);
717 /* ftrace_dump() disables tracing */
718 tracing_on();
719 }
719 return 0; 720 return 0;
720 } 721 }
721 722
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
737 * Simulate the init() callback but we attach a watchdog callback 738 * Simulate the init() callback but we attach a watchdog callback
738 * to detect and recover from possible hangs 739 * to detect and recover from possible hangs
739 */ 740 */
740 tracing_reset_online_cpus(tr); 741 tracing_reset_online_cpus(&tr->trace_buffer);
741 set_graph_array(tr); 742 set_graph_array(tr);
742 ret = register_ftrace_graph(&trace_graph_return, 743 ret = register_ftrace_graph(&trace_graph_return,
743 &trace_graph_entry_watchdog); 744 &trace_graph_entry_watchdog);
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
760 tracing_stop(); 761 tracing_stop();
761 762
762 /* check the trace buffer */ 763 /* check the trace buffer */
763 ret = trace_test_buffer(tr, &count); 764 ret = trace_test_buffer(&tr->trace_buffer, &count);
764 765
765 trace->reset(tr); 766 trace->reset(tr);
766 tracing_start(); 767 tracing_start();
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
815 /* stop the tracing. */ 816 /* stop the tracing. */
816 tracing_stop(); 817 tracing_stop();
817 /* check both trace buffers */ 818 /* check both trace buffers */
818 ret = trace_test_buffer(tr, NULL); 819 ret = trace_test_buffer(&tr->trace_buffer, NULL);
819 if (!ret) 820 if (!ret)
820 ret = trace_test_buffer(&max_tr, &count); 821 ret = trace_test_buffer(&tr->max_buffer, &count);
821 trace->reset(tr); 822 trace->reset(tr);
822 tracing_start(); 823 tracing_start();
823 824
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
877 /* stop the tracing. */ 878 /* stop the tracing. */
878 tracing_stop(); 879 tracing_stop();
879 /* check both trace buffers */ 880 /* check both trace buffers */
880 ret = trace_test_buffer(tr, NULL); 881 ret = trace_test_buffer(&tr->trace_buffer, NULL);
881 if (!ret) 882 if (!ret)
882 ret = trace_test_buffer(&max_tr, &count); 883 ret = trace_test_buffer(&tr->max_buffer, &count);
883 trace->reset(tr); 884 trace->reset(tr);
884 tracing_start(); 885 tracing_start();
885 886
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
943 /* stop the tracing. */ 944 /* stop the tracing. */
944 tracing_stop(); 945 tracing_stop();
945 /* check both trace buffers */ 946 /* check both trace buffers */
946 ret = trace_test_buffer(tr, NULL); 947 ret = trace_test_buffer(&tr->trace_buffer, NULL);
947 if (ret) 948 if (ret)
948 goto out; 949 goto out;
949 950
950 ret = trace_test_buffer(&max_tr, &count); 951 ret = trace_test_buffer(&tr->max_buffer, &count);
951 if (ret) 952 if (ret)
952 goto out; 953 goto out;
953 954
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 /* stop the tracing. */ 974 /* stop the tracing. */
974 tracing_stop(); 975 tracing_stop();
975 /* check both trace buffers */ 976 /* check both trace buffers */
976 ret = trace_test_buffer(tr, NULL); 977 ret = trace_test_buffer(&tr->trace_buffer, NULL);
977 if (ret) 978 if (ret)
978 goto out; 979 goto out;
979 980
980 ret = trace_test_buffer(&max_tr, &count); 981 ret = trace_test_buffer(&tr->max_buffer, &count);
981 982
982 if (!ret && !count) { 983 if (!ret && !count) {
983 printk(KERN_CONT ".. no entries found .."); 984 printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1084 /* stop the tracing. */ 1085 /* stop the tracing. */
1085 tracing_stop(); 1086 tracing_stop();
1086 /* check both trace buffers */ 1087 /* check both trace buffers */
1087 ret = trace_test_buffer(tr, NULL); 1088 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1088 printk("ret = %d\n", ret); 1089 printk("ret = %d\n", ret);
1089 if (!ret) 1090 if (!ret)
1090 ret = trace_test_buffer(&max_tr, &count); 1091 ret = trace_test_buffer(&tr->max_buffer, &count);
1091 1092
1092 1093
1093 trace->reset(tr); 1094 trace->reset(tr);
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
1126 /* stop the tracing. */ 1127 /* stop the tracing. */
1127 tracing_stop(); 1128 tracing_stop();
1128 /* check the trace buffer */ 1129 /* check the trace buffer */
1129 ret = trace_test_buffer(tr, &count); 1130 ret = trace_test_buffer(&tr->trace_buffer, &count);
1130 trace->reset(tr); 1131 trace->reset(tr);
1131 tracing_start(); 1132 tracing_start();
1132 1133
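The selftest refactor above changes trace_test_buffer() to take a buffer pointer, so one helper can validate both the live buffer (&tr->trace_buffer) and the max-latency snapshot (&tr->max_buffer). A simplified model of that calling pattern follows; the buffer contents and field names here are made up for illustration.

/*
 * Model of the new calling convention: check the live buffer first, then
 * the max-latency buffer, with the same helper.
 */
#include <stdio.h>

struct trace_buffer {
	const char *name;
	unsigned long entries;
};

struct trace_array {
	struct trace_buffer trace_buffer;
	struct trace_buffer max_buffer;
};

static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
{
	/* the real code walks every entry per CPU; here we just report a count */
	if (count)
		*count = buf->entries;
	printf("checked %s: %lu entries\n", buf->name, buf->entries);
	return 0;
}

int main(void)
{
	struct trace_array tr = {
		.trace_buffer = { "trace_buffer", 120 },
		.max_buffer   = { "max_buffer",   7   },
	};
	unsigned long count;

	if (!trace_test_buffer(&tr.trace_buffer, NULL))
		trace_test_buffer(&tr.max_buffer, &count);
	return 0;
}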
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 83a8b5b7bd35..b20428c5efe2 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,24 @@
20 20
21#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
22 22
23#ifdef CC_USING_FENTRY
24# define fentry 1
25#else
26# define fentry 0
27#endif
28
23static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 29static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
24 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; 30 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
25static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; 31static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
26 32
33/*
34 * Reserve one entry for the passed in ip. This will allow
35 * us to remove most or all of the stack size overhead
36 * added by the stack tracer itself.
37 */
27static struct stack_trace max_stack_trace = { 38static struct stack_trace max_stack_trace = {
28 .max_entries = STACK_TRACE_ENTRIES, 39 .max_entries = STACK_TRACE_ENTRIES - 1,
29 .entries = stack_dump_trace, 40 .entries = &stack_dump_trace[1],
30}; 41};
31 42
32static unsigned long max_stack_size; 43static unsigned long max_stack_size;
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
39int stack_tracer_enabled; 50int stack_tracer_enabled;
40static int last_stack_tracer_enabled; 51static int last_stack_tracer_enabled;
41 52
42static inline void check_stack(void) 53static inline void
54check_stack(unsigned long ip, unsigned long *stack)
43{ 55{
44 unsigned long this_size, flags; 56 unsigned long this_size, flags;
45 unsigned long *p, *top, *start; 57 unsigned long *p, *top, *start;
58 static int tracer_frame;
59 int frame_size = ACCESS_ONCE(tracer_frame);
46 int i; 60 int i;
47 61
48 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); 62 this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
49 this_size = THREAD_SIZE - this_size; 63 this_size = THREAD_SIZE - this_size;
64 /* Remove the frame of the tracer */
65 this_size -= frame_size;
50 66
51 if (this_size <= max_stack_size) 67 if (this_size <= max_stack_size)
52 return; 68 return;
53 69
54 /* we do not handle interrupt stacks yet */ 70 /* we do not handle interrupt stacks yet */
55 if (!object_is_on_stack(&this_size)) 71 if (!object_is_on_stack(stack))
56 return; 72 return;
57 73
58 local_irq_save(flags); 74 local_irq_save(flags);
59 arch_spin_lock(&max_stack_lock); 75 arch_spin_lock(&max_stack_lock);
60 76
77 /* In case another CPU set the tracer_frame on us */
78 if (unlikely(!frame_size))
79 this_size -= tracer_frame;
80
61 /* a race could have already updated it */ 81 /* a race could have already updated it */
62 if (this_size <= max_stack_size) 82 if (this_size <= max_stack_size)
63 goto out; 83 goto out;
@@ -70,10 +90,18 @@ static inline void check_stack(void)
70 save_stack_trace(&max_stack_trace); 90 save_stack_trace(&max_stack_trace);
71 91
72 /* 92 /*
93 * Add the passed in ip from the function tracer.
94 * Searching for this on the stack will skip over
95 * most of the overhead from the stack tracer itself.
96 */
97 stack_dump_trace[0] = ip;
98 max_stack_trace.nr_entries++;
99
100 /*
73 * Now find where in the stack these are. 101 * Now find where in the stack these are.
74 */ 102 */
75 i = 0; 103 i = 0;
76 start = &this_size; 104 start = stack;
77 top = (unsigned long *) 105 top = (unsigned long *)
78 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); 106 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
79 107
@@ -97,6 +125,18 @@ static inline void check_stack(void)
97 found = 1; 125 found = 1;
98 /* Start the search from here */ 126 /* Start the search from here */
99 start = p + 1; 127 start = p + 1;
128 /*
129 * We do not want to show the overhead
130 * of the stack tracer stack in the
131 * max stack. If we haven't figured
132 * out what that is, then figure it out
133 * now.
134 */
135 if (unlikely(!tracer_frame) && i == 1) {
136 tracer_frame = (p - stack) *
137 sizeof(unsigned long);
138 max_stack_size -= tracer_frame;
139 }
100 } 140 }
101 } 141 }
102 142
@@ -113,6 +153,7 @@ static void
113stack_trace_call(unsigned long ip, unsigned long parent_ip, 153stack_trace_call(unsigned long ip, unsigned long parent_ip,
114 struct ftrace_ops *op, struct pt_regs *pt_regs) 154 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 155{
156 unsigned long stack;
116 int cpu; 157 int cpu;
117 158
118 preempt_disable_notrace(); 159 preempt_disable_notrace();
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
122 if (per_cpu(trace_active, cpu)++ != 0) 163 if (per_cpu(trace_active, cpu)++ != 0)
123 goto out; 164 goto out;
124 165
125 check_stack(); 166 /*
167 * When fentry is used, the traced function does not get
168 * its stack frame set up, and we lose the parent.
169 * The ip is pretty useless because the function tracer
170 * was called before that function set up its stack frame.
171 * In this case, we use the parent ip.
172 *
173 * By adding the return address of either the parent ip
174 * or the current ip we can disregard most of the stack usage
175 * caused by the stack tracer itself.
176 *
177 * The function tracer always reports the address of where the
178 * mcount call was, but the stack will hold the return address.
179 */
180 if (fentry)
181 ip = parent_ip;
182 else
183 ip += MCOUNT_INSN_SIZE;
184
185 check_stack(ip, &stack);
126 186
127 out: 187 out:
128 per_cpu(trace_active, cpu)--; 188 per_cpu(trace_active, cpu)--;
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void)
371 struct dentry *d_tracer; 431 struct dentry *d_tracer;
372 432
373 d_tracer = tracing_init_dentry(); 433 d_tracer = tracing_init_dentry();
434 if (!d_tracer)
435 return 0;
374 436
375 trace_create_file("stack_max_size", 0644, d_tracer, 437 trace_create_file("stack_max_size", 0644, d_tracer,
376 &max_stack_size, &stack_max_size_fops); 438 &max_stack_size, &stack_max_size_fops);
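The stack tracer changes above teach check_stack() to measure its own overhead once (tracer_frame) and subtract it from every later sample, so the reported maximum reflects the traced code rather than the tracer's frames. The sketch below models only that calibration idea; the parameters and the way the "depth" is obtained are assumptions, not a real walk of a kernel stack.

/*
 * Sketch of the tracer_frame self-calibration: the first deep sample also
 * records how many bytes belong to the tracer itself, and later samples
 * have that overhead removed before comparing against max_stack_size.
 */
#include <stdio.h>

static unsigned long max_stack_size;
static int tracer_frame;	/* bytes of tracer overhead, measured once */

static void check_stack(unsigned long depth_from_top, int words_to_caller)
{
	unsigned long this_size = depth_from_top;

	/* remove the frame of the tracer itself, once it is known */
	this_size -= tracer_frame;

	if (this_size <= max_stack_size)
		return;

	/* first hit: figure out how much of the depth is our own overhead */
	if (!tracer_frame) {
		tracer_frame = words_to_caller * sizeof(unsigned long);
		this_size -= tracer_frame;
	}

	max_stack_size = this_size;
	printf("new max stack: %lu bytes (tracer overhead %d)\n",
	       this_size, tracer_frame);
}

int main(void)
{
	check_stack(1200, 6);	/* calibrates tracer_frame */
	check_stack(1300, 6);	/* later samples already exclude it */
	return 0;
}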
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
307 struct dentry *d_tracing; 307 struct dentry *d_tracing;
308 308
309 d_tracing = tracing_init_dentry(); 309 d_tracing = tracing_init_dentry();
310 if (!d_tracing)
311 return 0;
310 312
311 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 313 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
312 if (!stat_dir) 314 if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7a809e321058..8f2ac73c7a5f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,10 +12,6 @@
12#include "trace.h" 12#include "trace.h"
13 13
14static DEFINE_MUTEX(syscall_trace_lock); 14static DEFINE_MUTEX(syscall_trace_lock);
15static int sys_refcount_enter;
16static int sys_refcount_exit;
17static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
18static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
19 15
20static int syscall_enter_register(struct ftrace_event_call *event, 16static int syscall_enter_register(struct ftrace_event_call *event,
21 enum trace_reg type, void *data); 17 enum trace_reg type, void *data);
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
41 /* 37 /*
42 * Only compare after the "sys" prefix. Archs that use 38 * Only compare after the "sys" prefix. Archs that use
43 * syscall wrappers may have syscalls symbols aliases prefixed 39 * syscall wrappers may have syscalls symbols aliases prefixed
44 * with "SyS" instead of "sys", leading to an unwanted 40 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
45 * mismatch. 41 * mismatch.
46 */ 42 */
47 return !strcmp(sym + 3, name + 3); 43 return !strcmp(sym + 3, name + 3);
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
265 kfree(call->print_fmt); 261 kfree(call->print_fmt);
266} 262}
267 263
268static int syscall_enter_define_fields(struct ftrace_event_call *call) 264static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
269{ 265{
270 struct syscall_trace_enter trace; 266 struct syscall_trace_enter trace;
271 struct syscall_metadata *meta = call->data; 267 struct syscall_metadata *meta = call->data;
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
288 return ret; 284 return ret;
289} 285}
290 286
291static int syscall_exit_define_fields(struct ftrace_event_call *call) 287static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
292{ 288{
293 struct syscall_trace_exit trace; 289 struct syscall_trace_exit trace;
294 int ret; 290 int ret;
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
303 return ret; 299 return ret;
304} 300}
305 301
306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
307{ 303{
304 struct trace_array *tr = data;
308 struct syscall_trace_enter *entry; 305 struct syscall_trace_enter *entry;
309 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
310 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
315 syscall_nr = trace_get_syscall_nr(current, regs); 312 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 313 if (syscall_nr < 0)
317 return; 314 return;
318 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 315 if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
319 return; 316 return;
320 317
321 sys_data = syscall_nr_to_meta(syscall_nr); 318 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
324 321
325 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
326 323
327 event = trace_current_buffer_lock_reserve(&buffer, 324 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer,
328 sys_data->enter_event->event.type, size, 0, 0); 326 sys_data->enter_event->event.type, size, 0, 0);
329 if (!event) 327 if (!event)
330 return; 328 return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
338 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 336 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
339} 337}
340 338
341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
342{ 340{
341 struct trace_array *tr = data;
343 struct syscall_trace_exit *entry; 342 struct syscall_trace_exit *entry;
344 struct syscall_metadata *sys_data; 343 struct syscall_metadata *sys_data;
345 struct ring_buffer_event *event; 344 struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
349 syscall_nr = trace_get_syscall_nr(current, regs); 348 syscall_nr = trace_get_syscall_nr(current, regs);
350 if (syscall_nr < 0) 349 if (syscall_nr < 0)
351 return; 350 return;
352 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 351 if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
353 return; 352 return;
354 353
355 sys_data = syscall_nr_to_meta(syscall_nr); 354 sys_data = syscall_nr_to_meta(syscall_nr);
356 if (!sys_data) 355 if (!sys_data)
357 return; 356 return;
358 357
359 event = trace_current_buffer_lock_reserve(&buffer, 358 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
361 if (!event) 361 if (!event)
362 return; 362 return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
371} 371}
372 372
373static int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_file *file,
374 struct ftrace_event_call *call)
374{ 375{
376 struct trace_array *tr = file->tr;
375 int ret = 0; 377 int ret = 0;
376 int num; 378 int num;
377 379
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
379 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 381 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
380 return -ENOSYS; 382 return -ENOSYS;
381 mutex_lock(&syscall_trace_lock); 383 mutex_lock(&syscall_trace_lock);
382 if (!sys_refcount_enter) 384 if (!tr->sys_refcount_enter)
383 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 385 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
384 if (!ret) { 386 if (!ret) {
385 set_bit(num, enabled_enter_syscalls); 387 set_bit(num, tr->enabled_enter_syscalls);
386 sys_refcount_enter++; 388 tr->sys_refcount_enter++;
387 } 389 }
388 mutex_unlock(&syscall_trace_lock); 390 mutex_unlock(&syscall_trace_lock);
389 return ret; 391 return ret;
390} 392}
391 393
392static void unreg_event_syscall_enter(struct ftrace_event_call *call) 394static void unreg_event_syscall_enter(struct ftrace_event_file *file,
395 struct ftrace_event_call *call)
393{ 396{
397 struct trace_array *tr = file->tr;
394 int num; 398 int num;
395 399
396 num = ((struct syscall_metadata *)call->data)->syscall_nr; 400 num = ((struct syscall_metadata *)call->data)->syscall_nr;
397 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 401 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
398 return; 402 return;
399 mutex_lock(&syscall_trace_lock); 403 mutex_lock(&syscall_trace_lock);
400 sys_refcount_enter--; 404 tr->sys_refcount_enter--;
401 clear_bit(num, enabled_enter_syscalls); 405 clear_bit(num, tr->enabled_enter_syscalls);
402 if (!sys_refcount_enter) 406 if (!tr->sys_refcount_enter)
403 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 407 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
404 mutex_unlock(&syscall_trace_lock); 408 mutex_unlock(&syscall_trace_lock);
405} 409}
406 410
407static int reg_event_syscall_exit(struct ftrace_event_call *call) 411static int reg_event_syscall_exit(struct ftrace_event_file *file,
412 struct ftrace_event_call *call)
408{ 413{
414 struct trace_array *tr = file->tr;
409 int ret = 0; 415 int ret = 0;
410 int num; 416 int num;
411 417
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
413 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 419 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
414 return -ENOSYS; 420 return -ENOSYS;
415 mutex_lock(&syscall_trace_lock); 421 mutex_lock(&syscall_trace_lock);
416 if (!sys_refcount_exit) 422 if (!tr->sys_refcount_exit)
417 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 423 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
418 if (!ret) { 424 if (!ret) {
419 set_bit(num, enabled_exit_syscalls); 425 set_bit(num, tr->enabled_exit_syscalls);
420 sys_refcount_exit++; 426 tr->sys_refcount_exit++;
421 } 427 }
422 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
423 return ret; 429 return ret;
424} 430}
425 431
426static void unreg_event_syscall_exit(struct ftrace_event_call *call) 432static void unreg_event_syscall_exit(struct ftrace_event_file *file,
433 struct ftrace_event_call *call)
427{ 434{
435 struct trace_array *tr = file->tr;
428 int num; 436 int num;
429 437
430 num = ((struct syscall_metadata *)call->data)->syscall_nr; 438 num = ((struct syscall_metadata *)call->data)->syscall_nr;
431 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 439 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
432 return; 440 return;
433 mutex_lock(&syscall_trace_lock); 441 mutex_lock(&syscall_trace_lock);
434 sys_refcount_exit--; 442 tr->sys_refcount_exit--;
435 clear_bit(num, enabled_exit_syscalls); 443 clear_bit(num, tr->enabled_exit_syscalls);
436 if (!sys_refcount_exit) 444 if (!tr->sys_refcount_exit)
437 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 445 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
438 mutex_unlock(&syscall_trace_lock); 446 mutex_unlock(&syscall_trace_lock);
439} 447}
440 448
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
471 .trace = print_syscall_exit, 479 .trace = print_syscall_exit,
472}; 480};
473 481
474struct ftrace_event_class event_class_syscall_enter = { 482struct ftrace_event_class __refdata event_class_syscall_enter = {
475 .system = "syscalls", 483 .system = "syscalls",
476 .reg = syscall_enter_register, 484 .reg = syscall_enter_register,
477 .define_fields = syscall_enter_define_fields, 485 .define_fields = syscall_enter_define_fields,
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
479 .raw_init = init_syscall_trace, 487 .raw_init = init_syscall_trace,
480}; 488};
481 489
482struct ftrace_event_class event_class_syscall_exit = { 490struct ftrace_event_class __refdata event_class_syscall_exit = {
483 .system = "syscalls", 491 .system = "syscalls",
484 .reg = syscall_exit_register, 492 .reg = syscall_exit_register,
485 .define_fields = syscall_exit_define_fields, 493 .define_fields = syscall_exit_define_fields,
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
685static int syscall_enter_register(struct ftrace_event_call *event, 693static int syscall_enter_register(struct ftrace_event_call *event,
686 enum trace_reg type, void *data) 694 enum trace_reg type, void *data)
687{ 695{
696 struct ftrace_event_file *file = data;
697
688 switch (type) { 698 switch (type) {
689 case TRACE_REG_REGISTER: 699 case TRACE_REG_REGISTER:
690 return reg_event_syscall_enter(event); 700 return reg_event_syscall_enter(file, event);
691 case TRACE_REG_UNREGISTER: 701 case TRACE_REG_UNREGISTER:
692 unreg_event_syscall_enter(event); 702 unreg_event_syscall_enter(file, event);
693 return 0; 703 return 0;
694 704
695#ifdef CONFIG_PERF_EVENTS 705#ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
711static int syscall_exit_register(struct ftrace_event_call *event, 721static int syscall_exit_register(struct ftrace_event_call *event,
712 enum trace_reg type, void *data) 722 enum trace_reg type, void *data)
713{ 723{
724 struct ftrace_event_file *file = data;
725
714 switch (type) { 726 switch (type) {
715 case TRACE_REG_REGISTER: 727 case TRACE_REG_REGISTER:
716 return reg_event_syscall_exit(event); 728 return reg_event_syscall_exit(file, event);
717 case TRACE_REG_UNREGISTER: 729 case TRACE_REG_UNREGISTER:
718 unreg_event_syscall_exit(event); 730 unreg_event_syscall_exit(file, event);
719 return 0; 731 return 0;
720 732
721#ifdef CONFIG_PERF_EVENTS 733#ifdef CONFIG_PERF_EVENTS
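The syscall tracing changes above move the enable bitmaps and refcounts from globals into the trace_array, so each tracing instance tracks its own enabled syscalls and only holds the shared tracepoint probe while its refcount is non-zero. Below is a simplified model of that per-instance state; NR_SYSCALLS, the plain array standing in for the bitmap, and the printf stand-ins for probe registration are assumptions.

/*
 * Model of per-trace_array syscall enable state: register the probe on the
 * first enabled event of an instance, drop it when the last one goes away.
 */
#include <stdio.h>

#define NR_SYSCALLS 64

struct trace_array {
	const char *name;
	unsigned char enabled_enter[NR_SYSCALLS];
	int sys_refcount_enter;
};

static int reg_event_syscall_enter(struct trace_array *tr, int num)
{
	if (num < 0 || num >= NR_SYSCALLS)
		return -1;
	if (!tr->sys_refcount_enter)
		printf("%s: register sys_enter probe\n", tr->name);
	tr->enabled_enter[num] = 1;
	tr->sys_refcount_enter++;
	return 0;
}

static void unreg_event_syscall_enter(struct trace_array *tr, int num)
{
	if (num < 0 || num >= NR_SYSCALLS)
		return;
	tr->sys_refcount_enter--;
	tr->enabled_enter[num] = 0;
	if (!tr->sys_refcount_enter)
		printf("%s: unregister sys_enter probe\n", tr->name);
}

int main(void)
{
	struct trace_array top = { .name = "top" };
	struct trace_array inst = { .name = "instance-1" };

	reg_event_syscall_enter(&top, 3);	/* registers the probe for "top" */
	reg_event_syscall_enter(&inst, 3);	/* separate state for the instance */
	unreg_event_syscall_enter(&top, 3);	/* only "top" drops its probe */
	return 0;
}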
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a4592047..29f26540e9c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
112 int nr_probes = 0; 112 int nr_probes = 0;
113 struct tracepoint_func *old, *new; 113 struct tracepoint_func *old, *new;
114 114
115 WARN_ON(!probe); 115 if (WARN_ON(!probe))
116 return ERR_PTR(-EINVAL);
116 117
117 debug_print_probes(entry); 118 debug_print_probes(entry);
118 old = entry->funcs; 119 old = entry->funcs;
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
152 153
153 debug_print_probes(entry); 154 debug_print_probes(entry);
154 /* (N -> M), (N > 1, M >= 0) probes */ 155 /* (N -> M), (N > 1, M >= 0) probes */
155 for (nr_probes = 0; old[nr_probes].func; nr_probes++) { 156 if (probe) {
156 if (!probe || 157 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
157 (old[nr_probes].func == probe && 158 if (old[nr_probes].func == probe &&
158 old[nr_probes].data == data)) 159 old[nr_probes].data == data)
159 nr_del++; 160 nr_del++;
161 }
160 } 162 }
161 163
164 /*
165 * If probe is NULL, then nr_probes = nr_del = 0, and then the
166 * entire entry will be removed.
167 */
162 if (nr_probes - nr_del == 0) { 168 if (nr_probes - nr_del == 0) {
163 /* N -> 0, (N > 1) */ 169 /* N -> 0, (N > 1) */
164 entry->funcs = NULL; 170 entry->funcs = NULL;
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
173 if (new == NULL) 179 if (new == NULL)
174 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
175 for (i = 0; old[i].func; i++) 181 for (i = 0; old[i].func; i++)
176 if (probe && 182 if (old[i].func != probe || old[i].data != data)
177 (old[i].func != probe || old[i].data != data))
178 new[j++] = old[i]; 183 new[j++] = old[i];
179 new[nr_probes - nr_del].func = NULL; 184 new[nr_probes - nr_del].func = NULL;
180 entry->refcount = nr_probes - nr_del; 185 entry->refcount = nr_probes - nr_del;
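The tracepoint change above makes the removal rule explicit: with a NULL probe nothing is counted, nr_probes == nr_del == 0, and the whole probe array is dropped; with a non-NULL probe only entries matching both func and data are filtered out. The standalone sketch below mirrors that logic; the tracepoint_func layout and error handling are simplified assumptions.

/*
 * Sketch of the probe removal rule: NULL probe removes everything,
 * otherwise only (func, data) matches are dropped.
 */
#include <stdio.h>
#include <stdlib.h>

struct tracepoint_func {
	void (*func)(void);
	void *data;
};

/* returns the new array, or NULL when every probe was removed */
static struct tracepoint_func *
remove_probe(struct tracepoint_func *old, void (*probe)(void), void *data)
{
	int nr_probes = 0, nr_del = 0, i, j = 0;
	struct tracepoint_func *new;

	if (probe) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
			if (old[nr_probes].func == probe &&
			    old[nr_probes].data == data)
				nr_del++;
	}

	/* probe == NULL, or every entry matched: remove the whole array */
	if (nr_probes - nr_del == 0)
		return NULL;

	new = calloc(nr_probes - nr_del + 1, sizeof(*new));
	if (!new)
		return NULL;	/* keep the sketch simple on allocation failure */
	for (i = 0; old[i].func; i++)
		if (old[i].func != probe || old[i].data != data)
			new[j++] = old[i];
	return new;
}

static void probe_a(void) { }
static void probe_b(void) { }

int main(void)
{
	struct tracepoint_func funcs[] = {
		{ probe_a, NULL }, { probe_b, NULL }, { NULL, NULL },
	};
	struct tracepoint_func *rest = remove_probe(funcs, probe_a, NULL);

	printf("remaining after removing probe_a: %s\n",
	       rest ? "probe_b" : "none");
	free(rest);
	return 0;
}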