author    Linus Torvalds <torvalds@linux-foundation.org>  2009-09-11 16:24:03 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2009-09-11 16:24:03 -0400
commit    483e3cd6a34ad2d7e41100bc1b98614ac42a4567 (patch)
tree      ef544ccdd1e95991c32fd8b656714583b7398371
parent    774a694f8cd08115d130a290d73c6d8563f26b1b (diff)
parent    d28daf923ac5e4a0d7cecebae56f3e339189366b (diff)
Merge branch 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (105 commits)
  ring-buffer: only enable ring_buffer_swap_cpu when needed
  ring-buffer: check for swapped buffers in start of committing
  tracing: report error in trace if we fail to swap latency buffer
  tracing: add trace_array_printk for internal tracers to use
  tracing: pass around ring buffer instead of tracer
  tracing: make tracing_reset safe for external use
  tracing: use timestamp to determine start of latency traces
  tracing: Remove mentioning of legacy latency_trace file from documentation
  tracing/filters: Defer pred allocation, fix memory leak
  tracing: remove users of tracing_reset
  tracing: disable buffers and synchronize_sched before resetting
  tracing: disable update max tracer while reading trace
  tracing: print out start and stop in latency traces
  ring-buffer: disable all cpu buffers when one finds a problem
  ring-buffer: do not count discarded events
  ring-buffer: remove ring_buffer_event_discard
  ring-buffer: fix ring_buffer_read crossing pages
  ring-buffer: remove unnecessary cpu_relax
  ring-buffer: do not swap buffers during a commit
  ring-buffer: do not reset while in a commit
  ...
-rw-r--r--  Documentation/kernel-parameters.txt | 5
-rw-r--r--  Documentation/trace/events.txt | 9
-rw-r--r--  Documentation/trace/ftrace.txt | 68
-rw-r--r--  Documentation/trace/function-graph-fold.vim | 42
-rw-r--r--  Documentation/trace/ring-buffer-design.txt | 955
-rw-r--r--  arch/s390/Kconfig | 2
-rw-r--r--  arch/s390/defconfig | 2
-rw-r--r--  arch/s390/include/asm/thread_info.h | 4
-rw-r--r--  arch/s390/kernel/entry.S | 2
-rw-r--r--  arch/s390/kernel/entry64.S | 2
-rw-r--r--  arch/s390/kernel/ftrace.c | 36
-rw-r--r--  arch/s390/kernel/ptrace.c | 11
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/configs/i386_defconfig | 2
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 2
-rw-r--r--  arch/x86/include/asm/ftrace.h | 7
-rw-r--r--  arch/x86/include/asm/thread_info.h | 13
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 2
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 6
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 1
-rw-r--r--  arch/x86/kernel/ftrace.c | 51
-rw-r--r--  arch/x86/kernel/ptrace.c | 13
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 8
-rw-r--r--  include/linux/ftrace_event.h | 51
-rw-r--r--  include/linux/module.h | 14
-rw-r--r--  include/linux/perf_counter.h | 2
-rw-r--r--  include/linux/ring_buffer.h | 24
-rw-r--r--  include/linux/syscalls.h | 131
-rw-r--r--  include/linux/tracepoint.h | 29
-rw-r--r--  include/trace/define_trace.h | 7
-rw-r--r--  include/trace/events/module.h | 126
-rw-r--r--  include/trace/events/sched.h | 12
-rw-r--r--  include/trace/events/syscalls.h | 70
-rw-r--r--  include/trace/ftrace.h | 93
-rw-r--r--  include/trace/syscall.h | 48
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 30
-rw-r--r--  kernel/module.c | 11
-rw-r--r--  kernel/trace/Kconfig | 13
-rw-r--r--  kernel/trace/blktrace.c | 12
-rw-r--r--  kernel/trace/ftrace.c | 107
-rw-r--r--  kernel/trace/kmemtrace.c | 149
-rw-r--r--  kernel/trace/ring_buffer.c | 1112
-rw-r--r--  kernel/trace/trace.c | 679
-rw-r--r--  kernel/trace/trace.h | 76
-rw-r--r--  kernel/trace/trace_boot.c | 16
-rw-r--r--  kernel/trace/trace_events.c | 146
-rw-r--r--  kernel/trace/trace_events_filter.c | 261
-rw-r--r--  kernel/trace/trace_export.c | 28
-rw-r--r--  kernel/trace/trace_functions.c | 4
-rw-r--r--  kernel/trace/trace_functions_graph.c | 166
-rw-r--r--  kernel/trace/trace_irqsoff.c | 3
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 10
-rw-r--r--  kernel/trace/trace_power.c | 22
-rw-r--r--  kernel/trace/trace_sched_switch.c | 59
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 7
-rw-r--r--  kernel/trace/trace_selftest.c | 1
-rw-r--r--  kernel/trace/trace_stack.c | 43
-rw-r--r--  kernel/trace/trace_stat.c | 17
-rw-r--r--  kernel/trace/trace_stat.h | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 471
-rw-r--r--  kernel/trace/trace_workqueue.c | 32
-rw-r--r--  kernel/tracepoint.c | 50
-rwxr-xr-x  scripts/recordmcount.pl | 1
-rw-r--r--  tools/perf/util/parse-events.c | 8
65 files changed, 4106 insertions, 1286 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 7936b801fe6a..8e91863190e8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2480,6 +2480,11 @@ and is between 256 and 4096 characters. It is defined in the file
2480 trace_buf_size=nn[KMG] 2480 trace_buf_size=nn[KMG]
2481 [FTRACE] will set tracing buffer size. 2481 [FTRACE] will set tracing buffer size.
2482 2482
2483 trace_event=[event-list]
2484 [FTRACE] Set and start specified trace events in order
2485 to facilitate early boot debugging.
2486 See also Documentation/trace/events.txt
2487
2483 trix= [HW,OSS] MediaTrix AudioTrix Pro 2488 trix= [HW,OSS] MediaTrix AudioTrix Pro
2484 Format: 2489 Format:
2485 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> 2490 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index f157d7594ea7..2bcc8d4dea29 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results:
83 X - there is a mixture of events enabled and disabled 83 X - there is a mixture of events enabled and disabled
84 ? - this file does not affect any event 84 ? - this file does not affect any event
85 85
862.3 Boot option
87---------------
88
 89In order to facilitate early boot debugging, use the boot option:
90
91 trace_event=[event-list]
92
93The format of this boot option is the same as described in section 2.1.
94
863. Defining an event-enabled tracepoint 953. Defining an event-enabled tracepoint
87======================================= 96=======================================
88 97
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index a39b3c749de5..355d0f1f8c50 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -85,26 +85,19 @@ of ftrace. Here is a list of some of the key files:
85 This file holds the output of the trace in a human 85 This file holds the output of the trace in a human
86 readable format (described below). 86 readable format (described below).
87 87
88 latency_trace:
89
90 This file shows the same trace but the information
91 is organized more to display possible latencies
92 in the system (described below).
93
94 trace_pipe: 88 trace_pipe:
95 89
96 The output is the same as the "trace" file but this 90 The output is the same as the "trace" file but this
97 file is meant to be streamed with live tracing. 91 file is meant to be streamed with live tracing.
98 Reads from this file will block until new data 92 Reads from this file will block until new data is
99 is retrieved. Unlike the "trace" and "latency_trace" 93 retrieved. Unlike the "trace" file, this file is a
100 files, this file is a consumer. This means reading 94 consumer. This means reading from this file causes
101 from this file causes sequential reads to display 95 sequential reads to display more current data. Once
102 more current data. Once data is read from this 96 data is read from this file, it is consumed, and
103 file, it is consumed, and will not be read 97 will not be read again with a sequential read. The
104 again with a sequential read. The "trace" and 98 "trace" file is static, and if the tracer is not
105 "latency_trace" files are static, and if the 99 adding more data, they will display the same
106 tracer is not adding more data, they will display 100 information every time they are read.
107 the same information every time they are read.
108 101
109 trace_options: 102 trace_options:
110 103
@@ -117,10 +110,10 @@ of ftrace. Here is a list of some of the key files:
117 Some of the tracers record the max latency. 110 Some of the tracers record the max latency.
118 For example, the time interrupts are disabled. 111 For example, the time interrupts are disabled.
119 This time is saved in this file. The max trace 112 This time is saved in this file. The max trace
120 will also be stored, and displayed by either 113 will also be stored, and displayed by "trace".
121 "trace" or "latency_trace". A new max trace will 114 A new max trace will only be recorded if the
122 only be recorded if the latency is greater than 115 latency is greater than the value in this
123 the value in this file. (in microseconds) 116 file. (in microseconds)
124 117
125 buffer_size_kb: 118 buffer_size_kb:
126 119
@@ -210,7 +203,7 @@ Here is the list of current tracers that may be configured.
210 the trace with the longest max latency. 203 the trace with the longest max latency.
211 See tracing_max_latency. When a new max is recorded, 204 See tracing_max_latency. When a new max is recorded,
212 it replaces the old trace. It is best to view this 205 it replaces the old trace. It is best to view this
213 trace via the latency_trace file. 206 trace with the latency-format option enabled.
214 207
215 "preemptoff" 208 "preemptoff"
216 209
@@ -307,8 +300,8 @@ the lowest priority thread (pid 0).
307Latency trace format 300Latency trace format
308-------------------- 301--------------------
309 302
310For traces that display latency times, the latency_trace file 303When the latency-format option is enabled, the trace file gives
311gives somewhat more information to see why a latency happened. 304somewhat more information to see why a latency happened.
312Here is a typical trace. 305Here is a typical trace.
313 306
314# tracer: irqsoff 307# tracer: irqsoff
@@ -380,9 +373,10 @@ explains which is which.
380 373
381The above is mostly meaningful for kernel developers. 374The above is mostly meaningful for kernel developers.
382 375
383 time: This differs from the trace file output. The trace file output 376 time: When the latency-format option is enabled, the trace file
384 includes an absolute timestamp. The timestamp used by the 377 output includes a timestamp relative to the start of the
385 latency_trace file is relative to the start of the trace. 378 trace. This differs from the output when latency-format
379 is disabled, which includes an absolute timestamp.
386 380
387 delay: This is just to help catch your eye a bit better. And 381 delay: This is just to help catch your eye a bit better. And
388 needs to be fixed to be only relative to the same CPU. 382 needs to be fixed to be only relative to the same CPU.
@@ -440,7 +434,8 @@ Here are the available options:
440 sym-addr: 434 sym-addr:
441 bash-4000 [01] 1477.606694: simple_strtoul <c0339346> 435 bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
442 436
443 verbose - This deals with the latency_trace file. 437 verbose - This deals with the trace file when the
438 latency-format option is enabled.
444 439
445 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ 440 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
446 (+0.000ms): simple_strtoul (strict_strtoul) 441 (+0.000ms): simple_strtoul (strict_strtoul)
@@ -472,7 +467,7 @@ Here are the available options:
472 the app is no longer running 467 the app is no longer running
473 468
474 The lookup is performed when you read 469 The lookup is performed when you read
475 trace,trace_pipe,latency_trace. Example: 470 trace,trace_pipe. Example:
476 471
477 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 472 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
478x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] 473x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
@@ -481,6 +476,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
481 every scheduling event. Will add overhead if 476 every scheduling event. Will add overhead if
482 there's a lot of tasks running at once. 477 there's a lot of tasks running at once.
483 478
479 latency-format - This option changes the trace. When
480 it is enabled, the trace displays
481 additional information about the
482 latencies, as described in "Latency
483 trace format".
484 484
485sched_switch 485sched_switch
486------------ 486------------
@@ -596,12 +596,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is
596an example: 596an example:
597 597
598 # echo irqsoff > current_tracer 598 # echo irqsoff > current_tracer
599 # echo latency-format > trace_options
599 # echo 0 > tracing_max_latency 600 # echo 0 > tracing_max_latency
600 # echo 1 > tracing_enabled 601 # echo 1 > tracing_enabled
601 # ls -ltr 602 # ls -ltr
602 [...] 603 [...]
603 # echo 0 > tracing_enabled 604 # echo 0 > tracing_enabled
604 # cat latency_trace 605 # cat trace
605# tracer: irqsoff 606# tracer: irqsoff
606# 607#
607irqsoff latency trace v1.1.5 on 2.6.26 608irqsoff latency trace v1.1.5 on 2.6.26
@@ -703,12 +704,13 @@ which preemption was disabled. The control of preemptoff tracer
703is much like the irqsoff tracer. 704is much like the irqsoff tracer.
704 705
705 # echo preemptoff > current_tracer 706 # echo preemptoff > current_tracer
707 # echo latency-format > trace_options
706 # echo 0 > tracing_max_latency 708 # echo 0 > tracing_max_latency
707 # echo 1 > tracing_enabled 709 # echo 1 > tracing_enabled
708 # ls -ltr 710 # ls -ltr
709 [...] 711 [...]
710 # echo 0 > tracing_enabled 712 # echo 0 > tracing_enabled
711 # cat latency_trace 713 # cat trace
712# tracer: preemptoff 714# tracer: preemptoff
713# 715#
714preemptoff latency trace v1.1.5 on 2.6.26-rc8 716preemptoff latency trace v1.1.5 on 2.6.26-rc8
@@ -850,12 +852,13 @@ Again, using this trace is much like the irqsoff and preemptoff
850tracers. 852tracers.
851 853
852 # echo preemptirqsoff > current_tracer 854 # echo preemptirqsoff > current_tracer
855 # echo latency-format > trace_options
853 # echo 0 > tracing_max_latency 856 # echo 0 > tracing_max_latency
854 # echo 1 > tracing_enabled 857 # echo 1 > tracing_enabled
855 # ls -ltr 858 # ls -ltr
856 [...] 859 [...]
857 # echo 0 > tracing_enabled 860 # echo 0 > tracing_enabled
858 # cat latency_trace 861 # cat trace
859# tracer: preemptirqsoff 862# tracer: preemptirqsoff
860# 863#
861preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 864preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
@@ -1012,11 +1015,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under
1012'chrt' which changes the priority of the task. 1015'chrt' which changes the priority of the task.
1013 1016
1014 # echo wakeup > current_tracer 1017 # echo wakeup > current_tracer
1018 # echo latency-format > trace_options
1015 # echo 0 > tracing_max_latency 1019 # echo 0 > tracing_max_latency
1016 # echo 1 > tracing_enabled 1020 # echo 1 > tracing_enabled
1017 # chrt -f 5 sleep 1 1021 # chrt -f 5 sleep 1
1018 # echo 0 > tracing_enabled 1022 # echo 0 > tracing_enabled
1019 # cat latency_trace 1023 # cat trace
1020# tracer: wakeup 1024# tracer: wakeup
1021# 1025#
1022wakeup latency trace v1.1.5 on 2.6.26-rc8 1026wakeup latency trace v1.1.5 on 2.6.26-rc8
diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim
new file mode 100644
index 000000000000..0544b504c8b0
--- /dev/null
+++ b/Documentation/trace/function-graph-fold.vim
@@ -0,0 +1,42 @@
1" Enable folding for ftrace function_graph traces.
2"
3" To use, :source this file while viewing a function_graph trace, or use vim's
4" -S option to load from the command-line together with a trace. You can then
5" use the usual vim fold commands, such as "za", to open and close nested
6" functions. While closed, a fold will show the total time taken for a call,
7" as would normally appear on the line with the closing brace. Folded
8" functions will not include finish_task_switch(), so folding should remain
9" relatively sane even through a context switch.
10"
11" Note that this will almost certainly only work well with a
12" single-CPU trace (e.g. trace-cmd report --cpu 1).
13
14function! FunctionGraphFoldExpr(lnum)
15 let line = getline(a:lnum)
16 if line[-1:] == '{'
17 if line =~ 'finish_task_switch() {$'
18 return '>1'
19 endif
20 return 'a1'
21 elseif line[-1:] == '}'
22 return 's1'
23 else
24 return '='
25 endif
26endfunction
27
28function! FunctionGraphFoldText()
29 let s = split(getline(v:foldstart), '|', 1)
30 if getline(v:foldend+1) =~ 'finish_task_switch() {$'
31 let s[2] = ' task switch '
32 else
33 let e = split(getline(v:foldend), '|', 1)
34 let s[2] = e[2]
35 endif
36 return join(s, '|')
37endfunction
38
39setlocal foldexpr=FunctionGraphFoldExpr(v:lnum)
40setlocal foldtext=FunctionGraphFoldText()
41setlocal foldcolumn=12
42setlocal foldmethod=expr
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt
new file mode 100644
index 000000000000..5b1d23d604c5
--- /dev/null
+++ b/Documentation/trace/ring-buffer-design.txt
@@ -0,0 +1,955 @@
1 Lockless Ring Buffer Design
2 ===========================
3
4Copyright 2009 Red Hat Inc.
5 Author: Steven Rostedt <srostedt@redhat.com>
6 License: The GNU Free Documentation License, Version 1.2
7 (dual licensed under the GPL v2)
8Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto,
9 and Frederic Weisbecker.
10
11
12Written for: 2.6.31
13
14Terminology used in this Document
15---------------------------------
16
17tail - where new writes happen in the ring buffer.
18
19head - where new reads happen in the ring buffer.
20
21producer - the task that writes into the ring buffer (same as writer)
22
23writer - same as producer
24
25consumer - the task that reads from the buffer (same as reader)
26
27reader - same as consumer.
28
29reader_page - A page outside the ring buffer used solely (for the most part)
30 by the reader.
31
32head_page - a pointer to the page that the reader will use next
33
34tail_page - a pointer to the page that will be written to next
35
36commit_page - a pointer to the page with the last finished non nested write.
37
38cmpxchg - hardware assisted atomic transaction that performs the following:
39
40 A = B iff previous A == C
41
42 R = cmpxchg(A, C, B) is saying that we replace A with B if and only if
43 current A is equal to C, and we put the old (current) A into R
44
45 R gets the previous A regardless if A is updated with B or not.
46
47 To see if the update was successful a compare of R == C may be used.
48
49The Generic Ring Buffer
50-----------------------
51
52The ring buffer can be used in either an overwrite mode or in
53producer/consumer mode.
54
 55Producer/consumer mode is where, if the producer were to fill up the
 56buffer before the consumer could free up anything, the producer
 57will stop writing to the buffer. This will lose the most recent events.
 58
 59Overwrite mode is where, if the producer were to fill up the buffer
 60before the consumer could free up anything, the producer will
 61overwrite the older data. This will lose the oldest events.
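
As a sketch, the mode is chosen when the buffer is created, assuming the
ring_buffer_alloc() interface and RB_FL_OVERWRITE flag declared in
include/linux/ring_buffer.h (the size here is only an example):

    /* Hedged sketch: select the mode at allocation time.  Passing
     * RB_FL_OVERWRITE selects overwrite mode; passing 0 selects
     * producer/consumer mode. */
    struct ring_buffer *rb;

    rb = ring_buffer_alloc(65536, RB_FL_OVERWRITE);     /* overwrite mode */
    /* rb = ring_buffer_alloc(65536, 0); */             /* producer/consumer */
    if (!rb)
            return -ENOMEM;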
62
63No two writers can write at the same time (on the same per cpu buffer),
 64but a writer may interrupt another writer; it must finish writing
65before the previous writer may continue. This is very important to the
 66algorithm. The writers act like a "stack". The way interrupts work
67enforces this behavior.
68
69
70 writer1 start
71 <preempted> writer2 start
72 <preempted> writer3 start
73 writer3 finishes
74 writer2 finishes
75 writer1 finishes
76
77This is very much like a writer being preempted by an interrupt and
78the interrupt doing a write as well.
79
80Readers can happen at any time. But no two readers may run at the
81same time, nor can a reader preempt/interrupt another reader. A reader
82can not preempt/interrupt a writer, but it may read/consume from the
83buffer at the same time as a writer is writing, but the reader must be
84on another processor to do so. A reader may read on its own processor
85and can be preempted by a writer.
86
87A writer can preempt a reader, but a reader can not preempt a writer.
88But a reader can read the buffer at the same time (on another processor)
89as a writer.
90
91The ring buffer is made up of a list of pages held together by a link list.
92
93At initialization a reader page is allocated for the reader that is not
94part of the ring buffer.
95
96The head_page, tail_page and commit_page are all initialized to point
97to the same page.
98
99The reader page is initialized to have its next pointer pointing to
100the head page, and its previous pointer pointing to a page before
101the head page.
102
103The reader has its own page to use. At start up time, this page is
104allocated but is not attached to the list. When the reader wants
105to read from the buffer, if its page is empty (like it is on start up)
106it will swap its page with the head_page. The old reader page will
107become part of the ring buffer and the head_page will be removed.
108The page after the inserted page (old reader_page) will become the
109new head page.
110
111Once the new page is given to the reader, the reader can do what
112it wants with it, as long as a writer has left that page.
113
114A sample of how the reader page is swapped: Note this does not
115show the head page in the buffer, it is for demonstrating a swap
116only.
117
118 +------+
119 |reader| RING BUFFER
120 |page |
121 +------+
122 +---+ +---+ +---+
123 | |-->| |-->| |
124 | |<--| |<--| |
125 +---+ +---+ +---+
126 ^ | ^ |
127 | +-------------+ |
128 +-----------------+
129
130
131 +------+
132 |reader| RING BUFFER
133 |page |-------------------+
134 +------+ v
135 | +---+ +---+ +---+
136 | | |-->| |-->| |
137 | | |<--| |<--| |<-+
138 | +---+ +---+ +---+ |
139 | ^ | ^ | |
140 | | +-------------+ | |
141 | +-----------------+ |
142 +------------------------------------+
143
144 +------+
145 |reader| RING BUFFER
146 |page |-------------------+
147 +------+ <---------------+ v
148 | ^ +---+ +---+ +---+
149 | | | |-->| |-->| |
150 | | | | | |<--| |<-+
151 | | +---+ +---+ +---+ |
152 | | | ^ | |
153 | | +-------------+ | |
154 | +-----------------------------+ |
155 +------------------------------------+
156
157 +------+
158 |buffer| RING BUFFER
159 |page |-------------------+
160 +------+ <---------------+ v
161 | ^ +---+ +---+ +---+
162 | | | | | |-->| |
163 | | New | | | |<--| |<-+
164 | | Reader +---+ +---+ +---+ |
165 | | page ----^ | |
166 | | | |
167 | +-----------------------------+ |
168 +------------------------------------+
169
170
171
172It is possible that the page swapped is the commit page and the tail page,
173if what is in the ring buffer is less than what is held in a buffer page.
174
175
176 reader page commit page tail page
177 | | |
178 v | |
179 +---+ | |
180 | |<----------+ |
181 | |<------------------------+
182 | |------+
183 +---+ |
184 |
185 v
186 +---+ +---+ +---+ +---+
187<---| |--->| |--->| |--->| |--->
188--->| |<---| |<---| |<---| |<---
189 +---+ +---+ +---+ +---+
190
191This case is still valid for this algorithm.
192When the writer leaves the page, it simply goes into the ring buffer
193since the reader page still points to the next location in the ring
194buffer.
195
196
197The main pointers:
198
199 reader page - The page used solely by the reader and is not part
200 of the ring buffer (may be swapped in)
201
202 head page - the next page in the ring buffer that will be swapped
203 with the reader page.
204
205 tail page - the page where the next write will take place.
206
207 commit page - the page that last finished a write.
208
209The commit page is only updated by the outer most writer in the
210writer stack. A writer that preempts another writer will not move the
211commit page.
212
213When data is written into the ring buffer, a position is reserved
214in the ring buffer and passed back to the writer. When the writer
215is finished writing data into that position, it commits the write.
216
217Another write (or a read) may take place at any time during this
218transaction. If another write happens, it must finish before continuing
219with the previous write.
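
From a writer's point of view, this reserve/commit transaction looks roughly
like the following hedged sketch, built on the ring_buffer_lock_reserve()/
ring_buffer_unlock_commit() interface from include/linux/ring_buffer.h.
"buffer", "struct my_entry" and its field are placeholders, and error/discard
handling is omitted:

    struct ring_buffer_event *event;
    struct my_entry *entry;

    event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
    if (!event)
            return;                         /* no room (producer/consumer mode) */

    entry = ring_buffer_event_data(event);  /* the reserved position */
    entry->value = 42;                      /* fill in the reserved space */

    ring_buffer_unlock_commit(buffer, event);   /* commit the write */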
220
221
222 Write reserve:
223
224 Buffer page
225 +---------+
226 |written |
227 +---------+ <--- given back to writer (current commit)
228 |reserved |
229 +---------+ <--- tail pointer
230 | empty |
231 +---------+
232
233 Write commit:
234
235 Buffer page
236 +---------+
237 |written |
238 +---------+
239 |written |
240 +---------+ <--- next position for write (current commit)
241 | empty |
242 +---------+
243
244
245 If a write happens after the first reserve:
246
247 Buffer page
248 +---------+
249 |written |
250 +---------+ <-- current commit
251 |reserved |
252 +---------+ <--- given back to second writer
253 |reserved |
254 +---------+ <--- tail pointer
255
256 After second writer commits:
257
258
259 Buffer page
260 +---------+
261 |written |
262 +---------+ <--(last full commit)
263 |reserved |
264 +---------+
265 |pending |
266 |commit |
267 +---------+ <--- tail pointer
268
269 When the first writer commits:
270
271 Buffer page
272 +---------+
273 |written |
274 +---------+
275 |written |
276 +---------+
277 |written |
278 +---------+ <--(last full commit and tail pointer)
279
280
281The commit pointer points to the last write location that was
282committed without preempting another write. When a write that
283preempted another write is committed, it only becomes a pending commit
284and will not be a full commit till all writes have been committed.
285
286The commit page points to the page that has the last full commit.
287The tail page points to the page with the last write (before
288committing).
289
290The tail page is always equal to or after the commit page. It may
291be several pages ahead. If the tail page catches up to the commit
292page then no more writes may take place (regardless of the mode
293of the ring buffer: overwrite or producer/consumer).
294
295The order of the pages is:
296
297 head page
298 commit page
299 tail page
300
301Possible scenario:
302 tail page
303 head page commit page |
304 | | |
305 v v v
306 +---+ +---+ +---+ +---+
307<---| |--->| |--->| |--->| |--->
308--->| |<---| |<---| |<---| |<---
309 +---+ +---+ +---+ +---+
310
311There is a special case where the head page is after the commit page
312and possibly the tail page. That is when the commit (and tail) page has been
313swapped with the reader page. This is because the head page is always
314part of the ring buffer, but the reader page is not. Whenever there
315has been less than a full page committed inside the ring buffer,
316and a reader swaps out a page, it will be swapping out the commit page.
317
318
319 reader page commit page tail page
320 | | |
321 v | |
322 +---+ | |
323 | |<----------+ |
324 | |<------------------------+
325 | |------+
326 +---+ |
327 |
328 v
329 +---+ +---+ +---+ +---+
330<---| |--->| |--->| |--->| |--->
331--->| |<---| |<---| |<---| |<---
332 +---+ +---+ +---+ +---+
333 ^
334 |
335 head page
336
337
338In this case, the head page will not move when the tail and commit
339move back into the ring buffer.
340
341The reader can not swap a page into the ring buffer if the commit page
342is still on that page. If the reader meets the last commit (real commit
343not pending or reserved), then there is nothing more to read.
344The buffer is considered empty until another full commit finishes.
345
346When the tail meets the head page, if the buffer is in overwrite mode,
347the head page will be pushed ahead one. If the buffer is in producer/consumer
348mode, the write will fail.
349
350Overwrite mode:
351
352 tail page
353 |
354 v
355 +---+ +---+ +---+ +---+
356<---| |--->| |--->| |--->| |--->
357--->| |<---| |<---| |<---| |<---
358 +---+ +---+ +---+ +---+
359 ^
360 |
361 head page
362
363
364 tail page
365 |
366 v
367 +---+ +---+ +---+ +---+
368<---| |--->| |--->| |--->| |--->
369--->| |<---| |<---| |<---| |<---
370 +---+ +---+ +---+ +---+
371 ^
372 |
373 head page
374
375
376 tail page
377 |
378 v
379 +---+ +---+ +---+ +---+
380<---| |--->| |--->| |--->| |--->
381--->| |<---| |<---| |<---| |<---
382 +---+ +---+ +---+ +---+
383 ^
384 |
385 head page
386
387Note, the reader page will still point to the previous head page.
388But when a swap takes place, it will use the most recent head page.
389
390
391Making the Ring Buffer Lockless:
392--------------------------------
393
394The main idea behind the lockless algorithm is to combine the moving
395of the head_page pointer with the swapping of pages with the reader.
396State flags are placed inside the pointer to the page. To do this,
397each page must be aligned in memory by 4 bytes. This will allow the 2
398least significant bits of the address to be used as flags, since
399they will always be zero for the address. To get the address,
400simply mask out the flags.
401
402 MASK = ~3
403
404 address & MASK
405
406Two flags will be kept by these two bits:
407
408 HEADER - the page being pointed to is a head page
409
410 UPDATE - the page being pointed to is being updated by a writer
411 and was or is about to be a head page.
412
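
As a sketch of how such tagged pointers can be decoded (the names, flag values
and struct layout below are illustrative, not the kernel's actual
implementation in kernel/trace/ring_buffer.c):

    /* Illustrative types only. */
    struct buffer_page {
            unsigned long next;     /* tagged pointer: address | flags */
            unsigned long prev;
            /* ... page data ... */
    };

    #define RB_PAGE_NORMAL  0UL
    #define RB_PAGE_HEAD    1UL     /* HEADER flag: next page is the head page  */
    #define RB_PAGE_UPDATE  2UL     /* UPDATE flag: a writer is moving the head */
    #define RB_FLAG_MASK    3UL

    /* Recover the real page address from a tagged pointer. */
    static struct buffer_page *rb_list_page(unsigned long tagged)
    {
            return (struct buffer_page *)(tagged & ~RB_FLAG_MASK);
    }

    /* Read the state bits carried by the pointer. */
    static unsigned long rb_page_state(unsigned long tagged)
    {
            return tagged & RB_FLAG_MASK;
    }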
413
414 reader page
415 |
416 v
417 +---+
418 | |------+
419 +---+ |
420 |
421 v
422 +---+ +---+ +---+ +---+
423<---| |--->| |-H->| |--->| |--->
424--->| |<---| |<---| |<---| |<---
425 +---+ +---+ +---+ +---+
426
427
428The above pointer "-H->" would have the HEADER flag set. That is,
429the page it points to is the next page to be swapped out by the reader.
430This pointer means the next page is the head page.
431
432When the tail page meets the head pointer, it will use cmpxchg to
433change the pointer to the UPDATE state:
434
435
436 tail page
437 |
438 v
439 +---+ +---+ +---+ +---+
440<---| |--->| |-H->| |--->| |--->
441--->| |<---| |<---| |<---| |<---
442 +---+ +---+ +---+ +---+
443
444 tail page
445 |
446 v
447 +---+ +---+ +---+ +---+
448<---| |--->| |-U->| |--->| |--->
449--->| |<---| |<---| |<---| |<---
450 +---+ +---+ +---+ +---+
451
452"-U->" represents a pointer in the UPDATE state.
453
454Any access to the reader will need to take some sort of lock to serialize
455the readers. But the writers will never take a lock to write to the
456ring buffer. This means we only need to worry about a single reader,
457and writes only preempt in "stack" formation.
458
459When the reader tries to swap the page with the ring buffer, it
460will also use cmpxchg. If the flag bit in the pointer to the
461head page does not have the HEADER flag set, the compare will fail
462and the reader will need to look for the new head page and try again.
463Note, the UPDATE and HEADER flags are never set at the same time.
464
465The reader swaps the reader page as follows:
466
467 +------+
468 |reader| RING BUFFER
469 |page |
470 +------+
471 +---+ +---+ +---+
472 | |--->| |--->| |
473 | |<---| |<---| |
474 +---+ +---+ +---+
475 ^ | ^ |
476 | +---------------+ |
477 +-----H-------------+
478
479The reader sets the reader page's next pointer (with the HEADER flag) to
480the page after the head page.
481
482
483 +------+
484 |reader| RING BUFFER
485 |page |-------H-----------+
486 +------+ v
487 | +---+ +---+ +---+
488 | | |--->| |--->| |
489 | | |<---| |<---| |<-+
490 | +---+ +---+ +---+ |
491 | ^ | ^ | |
492 | | +---------------+ | |
493 | +-----H-------------+ |
494 +--------------------------------------+
495
496It does a cmpxchg with the pointer to the previous head page to make it
497point to the reader page. Note that the new pointer does not have the HEADER
498flag set. This action atomically moves the head page forward.
499
500 +------+
501 |reader| RING BUFFER
502 |page |-------H-----------+
503 +------+ v
504 | ^ +---+ +---+ +---+
505 | | | |-->| |-->| |
506 | | | |<--| |<--| |<-+
507 | | +---+ +---+ +---+ |
508 | | | ^ | |
509 | | +-------------+ | |
510 | +-----------------------------+ |
511 +------------------------------------+
512
513After the new head page is set, the previous pointer of the head page is
514updated to the reader page.
515
516 +------+
517 |reader| RING BUFFER
518 |page |-------H-----------+
519 +------+ <---------------+ v
520 | ^ +---+ +---+ +---+
521 | | | |-->| |-->| |
522 | | | | | |<--| |<-+
523 | | +---+ +---+ +---+ |
524 | | | ^ | |
525 | | +-------------+ | |
526 | +-----------------------------+ |
527 +------------------------------------+
528
529 +------+
530 |buffer| RING BUFFER
531 |page |-------H-----------+ <--- New head page
532 +------+ <---------------+ v
533 | ^ +---+ +---+ +---+
534 | | | | | |-->| |
535 | | New | | | |<--| |<-+
536 | | Reader +---+ +---+ +---+ |
537 | | page ----^ | |
538 | | | |
539 | +-----------------------------+ |
540 +------------------------------------+
541
542Another important point: the page that the reader page points back to
543by its previous pointer (the one that now points to the new head page)
544never points back to the reader page. That is because the reader page is
545not part of the ring buffer. Traversing the ring buffer via the next pointers
546will always stay in the ring buffer. Traversing the ring buffer via the
547prev pointers may not.
548
549Note, the way to determine a reader page is simply by examining the previous
550pointer of the page. If the next pointer of the previous page does not
551point back to the original page, then the original page is a reader page:
552
553
554 +--------+
555 | reader | next +----+
556 | page |-------->| |<====== (buffer page)
557 +--------+ +----+
558 | | ^
559 | v | next
560 prev | +----+
561 +------------->| |
562 +----+
563
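That check could be written roughly as follows (illustrative only, reusing
struct buffer_page and rb_list_page() from the sketch above):

    static int rb_is_reader_page(struct buffer_page *page)
    {
            struct buffer_page *prev = rb_list_page(page->prev);

            /* Every page inside the ring buffer is pointed back to by
             * its previous page's next pointer; the reader page is not. */
            return rb_list_page(prev->next) != page;
    }
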
564The way the head page moves forward:
565
566When the tail page meets the head page and the buffer is in overwrite mode
567and more writes take place, the head page must be moved forward before the
568writer may move the tail page. The way this is done is that the writer
569performs a cmpxchg to convert the pointer to the head page from the HEADER
570flag to the UPDATE flag. Once this is done, the reader will
571not be able to swap the head page from the buffer, nor will it be able to
572move the head page, until the writer is finished with the move.
573
574This eliminates any races that the reader can have on the writer. The reader
575must spin, and this is why the reader can not preempt the writer.
576
577 tail page
578 |
579 v
580 +---+ +---+ +---+ +---+
581<---| |--->| |-H->| |--->| |--->
582--->| |<---| |<---| |<---| |<---
583 +---+ +---+ +---+ +---+
584
585 tail page
586 |
587 v
588 +---+ +---+ +---+ +---+
589<---| |--->| |-U->| |--->| |--->
590--->| |<---| |<---| |<---| |<---
591 +---+ +---+ +---+ +---+
592
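In code, that HEADER -> UPDATE transition is a cmpxchg on the tagged next
pointer, roughly as below (illustrative fragment, reusing the flag names from
the earlier sketch; tail_page and next_page are the pages in the diagram):

    unsigned long old = (unsigned long)next_page | RB_PAGE_HEAD;
    unsigned long new = (unsigned long)next_page | RB_PAGE_UPDATE;

    if (cmpxchg(&tail_page->next, old, new) != old) {
            /* Lost a race with a nested writer: re-read the pointer
             * state and decide again what to do. */
    }
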
593The following page will be made into the new head page.
594
595 tail page
596 |
597 v
598 +---+ +---+ +---+ +---+
599<---| |--->| |-U->| |-H->| |--->
600--->| |<---| |<---| |<---| |<---
601 +---+ +---+ +---+ +---+
602
603After the new head page has been set, we can set the old head page
604pointer back to NORMAL.
605
606 tail page
607 |
608 v
609 +---+ +---+ +---+ +---+
610<---| |--->| |--->| |-H->| |--->
611--->| |<---| |<---| |<---| |<---
612 +---+ +---+ +---+ +---+
613
614After the head page has been moved, the tail page may now move forward.
615
616 tail page
617 |
618 v
619 +---+ +---+ +---+ +---+
620<---| |--->| |--->| |-H->| |--->
621--->| |<---| |<---| |<---| |<---
622 +---+ +---+ +---+ +---+
623
624
625The above are the trivial updates. Now for the more complex scenarios.
626
627
628As stated before, if enough writes preempt the first write, the
629tail page may make it all the way around the buffer and meet the commit
630page. At this time, we must start dropping writes (usually with some kind
631of warning to the user). But what happens if the commit was still on the
632reader page? The commit page is not part of the ring buffer. The tail page
633must account for this.
634
635
636 reader page commit page
637 | |
638 v |
639 +---+ |
640 | |<----------+
641 | |
642 | |------+
643 +---+ |
644 |
645 v
646 +---+ +---+ +---+ +---+
647<---| |--->| |-H->| |--->| |--->
648--->| |<---| |<---| |<---| |<---
649 +---+ +---+ +---+ +---+
650 ^
651 |
652 tail page
653
654If the tail page were to simply push the head page forward, the commit when
655leaving the reader page would not be pointing to the correct page.
656
657The solution to this is to test if the commit page is on the reader page
658before pushing the head page. If it is, then it can be assumed that the
659tail page wrapped the buffer, and we must drop new writes.
660
661This is not a race condition, because the commit page can only be moved
662by the outer most writer (the writer that was preempted).
663This means that the commit will not move while a writer is moving the
664tail page. The reader can not swap the reader page if it is also being
665used as the commit page. The reader can simply check that the commit
666is off the reader page. Once the commit page leaves the reader page
667it will never go back on it unless a reader does another swap with the
668buffer page that is also the commit page.
669
670
671Nested writes
672-------------
673
674In the pushing forward of the tail page we must first push forward
675the head page if the head page is the next page. If the head page
676is not the next page, the tail page is simply updated with a cmpxchg.
677
678Only writers move the tail page. This must be done atomically to protect
679against nested writers.
680
681 temp_page = tail_page
682 next_page = temp_page->next
683 cmpxchg(tail_page, temp_page, next_page)
684
685The above will update the tail page if it is still pointing to the expected
686page. If this fails, a nested write pushed it forward, and the current write
687does not need to push it.
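
Spelled out a little more as C (a sketch only; cpu_buffer and its fields are
placeholders, and the flag bits in the next pointer are handled by
rb_list_page() from the earlier sketch):

    /* Advance the tail page pointer atomically.  A failed cmpxchg means
     * a nested writer already advanced the tail page, which is fine --
     * no retry of the move itself is needed. */
    struct buffer_page *temp_page = cpu_buffer->tail_page;
    struct buffer_page *next_page = rb_list_page(temp_page->next);

    cmpxchg(&cpu_buffer->tail_page, temp_page, next_page);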
688
689
690 temp page
691 |
692 v
693 tail page
694 |
695 v
696 +---+ +---+ +---+ +---+
697<---| |--->| |--->| |--->| |--->
698--->| |<---| |<---| |<---| |<---
699 +---+ +---+ +---+ +---+
700
701Nested write comes in and moves the tail page forward:
702
703 tail page (moved by nested writer)
704 temp page |
705 | |
706 v v
707 +---+ +---+ +---+ +---+
708<---| |--->| |--->| |--->| |--->
709--->| |<---| |<---| |<---| |<---
710 +---+ +---+ +---+ +---+
711
712The above would fail the cmpxchg, but since the tail page has already
713been moved forward, the writer will just try again to reserve storage
714on the new tail page.
715
716But the moving of the head page is a bit more complex.
717
718 tail page
719 |
720 v
721 +---+ +---+ +---+ +---+
722<---| |--->| |-H->| |--->| |--->
723--->| |<---| |<---| |<---| |<---
724 +---+ +---+ +---+ +---+
725
726The write converts the head page pointer to UPDATE.
727
728 tail page
729 |
730 v
731 +---+ +---+ +---+ +---+
732<---| |--->| |-U->| |--->| |--->
733--->| |<---| |<---| |<---| |<---
734 +---+ +---+ +---+ +---+
735
736But if a nested writer preempts here, it will see that the next
737page is a head page, but it is also nested. It will detect that
738it is nested and will save that information. The detection is the
739fact that it sees the UPDATE flag instead of a HEADER or NORMAL
740pointer.
741
742The nested writer will set the new head page pointer.
743
744 tail page
745 |
746 v
747 +---+ +---+ +---+ +---+
748<---| |--->| |-U->| |-H->| |--->
749--->| |<---| |<---| |<---| |<---
750 +---+ +---+ +---+ +---+
751
752But it will not reset the update back to normal. Only the writer
753that converted a pointer from HEAD to UPDATE will convert it back
754to NORMAL.
755
756 tail page
757 |
758 v
759 +---+ +---+ +---+ +---+
760<---| |--->| |-U->| |-H->| |--->
761--->| |<---| |<---| |<---| |<---
762 +---+ +---+ +---+ +---+
763
764After the nested writer finishes, the outer most writer will convert
765the UPDATE pointer to NORMAL.
766
767
768 tail page
769 |
770 v
771 +---+ +---+ +---+ +---+
772<---| |--->| |--->| |-H->| |--->
773--->| |<---| |<---| |<---| |<---
774 +---+ +---+ +---+ +---+
775
776
777It can be even more complex if several nested writes came in and moved
778the tail page ahead several pages:
779
780
781(first writer)
782
783 tail page
784 |
785 v
786 +---+ +---+ +---+ +---+
787<---| |--->| |-H->| |--->| |--->
788--->| |<---| |<---| |<---| |<---
789 +---+ +---+ +---+ +---+
790
791The write converts the head page pointer to UPDATE.
792
793 tail page
794 |
795 v
796 +---+ +---+ +---+ +---+
797<---| |--->| |-U->| |--->| |--->
798--->| |<---| |<---| |<---| |<---
799 +---+ +---+ +---+ +---+
800
801The next writer comes in, sees the update, and sets up the new
802head page.
803
804(second writer)
805
806 tail page
807 |
808 v
809 +---+ +---+ +---+ +---+
810<---| |--->| |-U->| |-H->| |--->
811--->| |<---| |<---| |<---| |<---
812 +---+ +---+ +---+ +---+
813
814The nested writer moves the tail page forward, but does not set the old
815update page to NORMAL because it is not the outer most writer.
816
817 tail page
818 |
819 v
820 +---+ +---+ +---+ +---+
821<---| |--->| |-U->| |-H->| |--->
822--->| |<---| |<---| |<---| |<---
823 +---+ +---+ +---+ +---+
824
825Another writer preempts and sees that the page after the tail page is a head page.
826It changes it from HEAD to UPDATE.
827
828(third writer)
829
830 tail page
831 |
832 v
833 +---+ +---+ +---+ +---+
834<---| |--->| |-U->| |-U->| |--->
835--->| |<---| |<---| |<---| |<---
836 +---+ +---+ +---+ +---+
837
838The writer will move the head page forward:
839
840
841(third writer)
842
843 tail page
844 |
845 v
846 +---+ +---+ +---+ +---+
847<---| |--->| |-U->| |-U->| |-H->
848--->| |<---| |<---| |<---| |<---
849 +---+ +---+ +---+ +---+
850
851But now, since the third writer did change the HEAD flag to UPDATE, it
852will convert it back to NORMAL:
853
854
855(third writer)
856
857 tail page
858 |
859 v
860 +---+ +---+ +---+ +---+
861<---| |--->| |-U->| |--->| |-H->
862--->| |<---| |<---| |<---| |<---
863 +---+ +---+ +---+ +---+
864
865
866Then it will move the tail page, and return to the second writer.
867
868
869(second writer)
870
871 tail page
872 |
873 v
874 +---+ +---+ +---+ +---+
875<---| |--->| |-U->| |--->| |-H->
876--->| |<---| |<---| |<---| |<---
877 +---+ +---+ +---+ +---+
878
879
880The second writer will fail to move the tail page because it was already
881moved, so it will try again and add its data to the new tail page.
882It will return to the first writer.
883
884
885(first writer)
886
887 tail page
888 |
889 v
890 +---+ +---+ +---+ +---+
891<---| |--->| |-U->| |--->| |-H->
892--->| |<---| |<---| |<---| |<---
893 +---+ +---+ +---+ +---+
894
895The first writer can not atomically test if the tail page moved
896while it updates the HEAD page. It will then update the head page to
897what it thinks is the new head page.
898
899
900(first writer)
901
902 tail page
903 |
904 v
905 +---+ +---+ +---+ +---+
906<---| |--->| |-U->| |-H->| |-H->
907--->| |<---| |<---| |<---| |<---
908 +---+ +---+ +---+ +---+
909
910Since the cmpxchg returns the old value of the pointer, the first writer
911will see it succeeded in updating the pointer from NORMAL to HEAD.
912But as we can see, this is not good enough. It must also check to see
913if the tail page is either where it used to be or on the next page:
914
915
916(first writer)
917
918 A B tail page
919 | | |
920 v v v
921 +---+ +---+ +---+ +---+
922<---| |--->| |-U->| |-H->| |-H->
923--->| |<---| |<---| |<---| |<---
924 +---+ +---+ +---+ +---+
925
926If tail page != A and tail page does not equal B, then it must reset the
927pointer back to NORMAL. Since it only needs to worry about
928nested writers, it only needs to check this after setting the HEAD page.
929
930
931(first writer)
932
933 A B tail page
934 | | |
935 v v v
936 +---+ +---+ +---+ +---+
937<---| |--->| |-U->| |--->| |-H->
938--->| |<---| |<---| |<---| |<---
939 +---+ +---+ +---+ +---+
940
941Now the writer can update the head page. This is also why the head page must
942remain in UPDATE and be reset only by the outer most writer. This prevents
943the reader from seeing the incorrect head page.
944
945
946(first writer)
947
948 A B tail page
949 | | |
950 v v v
951 +---+ +---+ +---+ +---+
952<---| |--->| |--->| |--->| |-H->
953--->| |<---| |<---| |<---| |<---
954 +---+ +---+ +---+ +---+
955
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e030e86ff6a3..1c866efd217d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -84,7 +84,7 @@ config S390
84 select HAVE_FUNCTION_TRACER 84 select HAVE_FUNCTION_TRACER
85 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 85 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
86 select HAVE_FTRACE_MCOUNT_RECORD 86 select HAVE_FTRACE_MCOUNT_RECORD
87 select HAVE_FTRACE_SYSCALLS 87 select HAVE_SYSCALL_TRACEPOINTS
88 select HAVE_DYNAMIC_FTRACE 88 select HAVE_DYNAMIC_FTRACE
89 select HAVE_FUNCTION_GRAPH_TRACER 89 select HAVE_FUNCTION_GRAPH_TRACER
90 select HAVE_DEFAULT_NO_SPIN_MUTEXES 90 select HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index fcba206529f3..4e91a2573cc4 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -900,7 +900,7 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
900CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 900CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
901CONFIG_HAVE_DYNAMIC_FTRACE=y 901CONFIG_HAVE_DYNAMIC_FTRACE=y
902CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 902CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
903CONFIG_HAVE_FTRACE_SYSCALLS=y 903CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
904CONFIG_TRACING_SUPPORT=y 904CONFIG_TRACING_SUPPORT=y
905CONFIG_FTRACE=y 905CONFIG_FTRACE=y
906# CONFIG_FUNCTION_TRACER is not set 906# CONFIG_FUNCTION_TRACER is not set
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index ba1cab9fc1f9..07eb61b2fb3a 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -92,7 +92,7 @@ static inline struct thread_info *current_thread_info(void)
92#define TIF_SYSCALL_TRACE 8 /* syscall trace active */ 92#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
93#define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ 93#define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */
94#define TIF_SECCOMP 10 /* secure computing */ 94#define TIF_SECCOMP 10 /* secure computing */
95#define TIF_SYSCALL_FTRACE 11 /* ftrace syscall instrumentation */ 95#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */
96#define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ 96#define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */
97#define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling 97#define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling
98 TIF_NEED_RESCHED */ 98 TIF_NEED_RESCHED */
@@ -111,7 +111,7 @@ static inline struct thread_info *current_thread_info(void)
111#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) 111#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
112#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) 112#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
113#define _TIF_SECCOMP (1<<TIF_SECCOMP) 113#define _TIF_SECCOMP (1<<TIF_SECCOMP)
114#define _TIF_SYSCALL_FTRACE (1<<TIF_SYSCALL_FTRACE) 114#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
115#define _TIF_USEDFPU (1<<TIF_USEDFPU) 115#define _TIF_USEDFPU (1<<TIF_USEDFPU)
116#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) 116#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
117#define _TIF_31BIT (1<<TIF_31BIT) 117#define _TIF_31BIT (1<<TIF_31BIT)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index f78580a74039..f43d2ee54464 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -54,7 +54,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
54_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ 54_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
55 _TIF_MCCK_PENDING) 55 _TIF_MCCK_PENDING)
56_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ 56_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
57 _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) 57 _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
58 58
59STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER 59STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
60STACK_SIZE = 1 << STACK_SHIFT 60STACK_SIZE = 1 << STACK_SHIFT
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 009ca6175db9..a6f7b20df616 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -57,7 +57,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
57_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ 57_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
58 _TIF_MCCK_PENDING) 58 _TIF_MCCK_PENDING)
59_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ 59_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
60 _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) 60 _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
61 61
62#define BASED(name) name-system_call(%r13) 62#define BASED(name) name-system_call(%r13)
63 63
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 3e298e64f0db..57bdcb1e3cdf 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -220,6 +220,29 @@ struct syscall_metadata *syscall_nr_to_meta(int nr)
220 return syscalls_metadata[nr]; 220 return syscalls_metadata[nr];
221} 221}
222 222
223int syscall_name_to_nr(char *name)
224{
225 int i;
226
227 if (!syscalls_metadata)
228 return -1;
229 for (i = 0; i < NR_syscalls; i++)
230 if (syscalls_metadata[i])
231 if (!strcmp(syscalls_metadata[i]->name, name))
232 return i;
233 return -1;
234}
235
236void set_syscall_enter_id(int num, int id)
237{
238 syscalls_metadata[num]->enter_id = id;
239}
240
241void set_syscall_exit_id(int num, int id)
242{
243 syscalls_metadata[num]->exit_id = id;
244}
245
223static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 246static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
224{ 247{
225 struct syscall_metadata *start; 248 struct syscall_metadata *start;
@@ -237,24 +260,19 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
237 return NULL; 260 return NULL;
238} 261}
239 262
240void arch_init_ftrace_syscalls(void) 263static int __init arch_init_ftrace_syscalls(void)
241{ 264{
242 struct syscall_metadata *meta; 265 struct syscall_metadata *meta;
243 int i; 266 int i;
244 static atomic_t refs;
245
246 if (atomic_inc_return(&refs) != 1)
247 goto out;
248 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls, 267 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls,
249 GFP_KERNEL); 268 GFP_KERNEL);
250 if (!syscalls_metadata) 269 if (!syscalls_metadata)
251 goto out; 270 return -ENOMEM;
252 for (i = 0; i < NR_syscalls; i++) { 271 for (i = 0; i < NR_syscalls; i++) {
253 meta = find_syscall_meta((unsigned long)sys_call_table[i]); 272 meta = find_syscall_meta((unsigned long)sys_call_table[i]);
254 syscalls_metadata[i] = meta; 273 syscalls_metadata[i] = meta;
255 } 274 }
256 return; 275 return 0;
257out:
258 atomic_dec(&refs);
259} 276}
277arch_initcall(arch_init_ftrace_syscalls);
260#endif 278#endif
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 43acd73105b7..f3ddd7ac06c5 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -51,6 +51,9 @@
51#include "compat_ptrace.h" 51#include "compat_ptrace.h"
52#endif 52#endif
53 53
54#define CREATE_TRACE_POINTS
55#include <trace/events/syscalls.h>
56
54enum s390_regset { 57enum s390_regset {
55 REGSET_GENERAL, 58 REGSET_GENERAL,
56 REGSET_FP, 59 REGSET_FP,
@@ -661,8 +664,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
661 ret = -1; 664 ret = -1;
662 } 665 }
663 666
664 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 667 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
665 ftrace_syscall_enter(regs); 668 trace_sys_enter(regs, regs->gprs[2]);
666 669
667 if (unlikely(current->audit_context)) 670 if (unlikely(current->audit_context))
668 audit_syscall_entry(is_compat_task() ? 671 audit_syscall_entry(is_compat_task() ?
@@ -679,8 +682,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
679 audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), 682 audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
680 regs->gprs[2]); 683 regs->gprs[2]);
681 684
682 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 685 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
683 ftrace_syscall_exit(regs); 686 trace_sys_exit(regs, regs->gprs[2]);
684 687
685 if (test_thread_flag(TIF_SYSCALL_TRACE)) 688 if (test_thread_flag(TIF_SYSCALL_TRACE))
686 tracehook_report_syscall_exit(regs, 0); 689 tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d9c18aa17eb..fc20fdc0f7f2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,7 +38,7 @@ config X86
38 select HAVE_FUNCTION_GRAPH_FP_TEST 38 select HAVE_FUNCTION_GRAPH_FP_TEST
39 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 39 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
40 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE 40 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
41 select HAVE_FTRACE_SYSCALLS 41 select HAVE_SYSCALL_TRACEPOINTS
42 select HAVE_KVM 42 select HAVE_KVM
43 select HAVE_ARCH_KGDB 43 select HAVE_ARCH_KGDB
44 select HAVE_ARCH_TRACEHOOK 44 select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index edb992ebef92..d28fad19654a 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2355CONFIG_HAVE_DYNAMIC_FTRACE=y 2355CONFIG_HAVE_DYNAMIC_FTRACE=y
2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2357CONFIG_HAVE_HW_BRANCH_TRACER=y 2357CONFIG_HAVE_HW_BRANCH_TRACER=y
2358CONFIG_HAVE_FTRACE_SYSCALLS=y 2358CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
2359CONFIG_RING_BUFFER=y 2359CONFIG_RING_BUFFER=y
2360CONFIG_TRACING=y 2360CONFIG_TRACING=y
2361CONFIG_TRACING_SUPPORT=y 2361CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index cee1dd2e69b2..6c86acd847a4 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2329CONFIG_HAVE_DYNAMIC_FTRACE=y 2329CONFIG_HAVE_DYNAMIC_FTRACE=y
2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2331CONFIG_HAVE_HW_BRANCH_TRACER=y 2331CONFIG_HAVE_HW_BRANCH_TRACER=y
2332CONFIG_HAVE_FTRACE_SYSCALLS=y 2332CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
2333CONFIG_RING_BUFFER=y 2333CONFIG_RING_BUFFER=y
2334CONFIG_TRACING=y 2334CONFIG_TRACING=y
2335CONFIG_TRACING_SUPPORT=y 2335CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index bd2c6511c887..db24c2278be0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,13 +28,6 @@
28 28
29#endif 29#endif
30 30
31/* FIXME: I don't want to stay hardcoded */
32#ifdef CONFIG_X86_64
33# define FTRACE_SYSCALL_MAX 296
34#else
35# define FTRACE_SYSCALL_MAX 333
36#endif
37
38#ifdef CONFIG_FUNCTION_TRACER 31#ifdef CONFIG_FUNCTION_TRACER
39#define MCOUNT_ADDR ((long)(mcount)) 32#define MCOUNT_ADDR ((long)(mcount))
40#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fad7d40b75f8..6f7786aea4fc 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,7 +95,7 @@ struct thread_info {
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR		26	/* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
-#define TIF_SYSCALL_FTRACE	28	/* for ftrace syscall instrumentation */
+#define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -118,17 +118,17 @@ struct thread_info {
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
-#define _TIF_SYSCALL_FTRACE	(1 << TIF_SYSCALL_FTRACE)
+#define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \
-	 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
 
 /* work to do in syscall_trace_leave() */
 #define _TIF_WORK_SYSCALL_EXIT	\
 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
-	 _TIF_SYSCALL_FTRACE)
+	 _TIF_SYSCALL_TRACEPOINT)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
@@ -137,7 +137,8 @@ struct thread_info {
 	 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
 
 /* work to do on any return to user space */
-#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
+#define _TIF_ALLWORK_MASK \
+	((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT)
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK \
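
The hunks above only rename a thread-info bit and fold it into the syscall-entry and syscall-exit work masks. As a stand-alone illustration of that flag/mask pattern (names mirror the diff; this is a user-space sketch, not kernel code), consider:

#include <stdio.h>

#define TIF_SYSCALL_TRACE      0
#define TIF_SYSCALL_TRACEPOINT 28

#define _TIF_SYSCALL_TRACE      (1UL << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_TRACEPOINT (1UL << TIF_SYSCALL_TRACEPOINT)

/* Entry work mask: ptrace-style tracing or the tracepoint bit. */
#define _TIF_WORK_SYSCALL_ENTRY (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)

int main(void)
{
	unsigned long flags = _TIF_SYSCALL_TRACEPOINT;

	/* The slow entry path is taken only when a work bit is set. */
	if (flags & _TIF_WORK_SYSCALL_ENTRY)
		printf("slow syscall entry path: tracepoint bit set\n");
	return 0;
}
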
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 732a30706153..8deaada61bc8 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -345,6 +345,8 @@
 
 #ifdef __KERNEL__
 
+#define NR_syscalls 337
+
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 900e1617e672..b9f3c60de5f7 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
 #endif	/* __NO_STUBS */
 
 #ifdef __KERNEL__
+
+#ifndef COMPILE_OFFSETS
+#include <asm/asm-offsets.h>
+#define NR_syscalls (__NR_syscall_max + 1)
+#endif
+
 /*
  * "Conditional" syscalls
  *
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e129..4a6aeedcd965 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -3,6 +3,7 @@
  * This code generates raw asm output which is post-processed to extract
  * and format the required data.
  */
+#define COMPILE_OFFSETS
 
 #include <linux/crypto.h>
 #include <linux/sched.h>
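
The idea behind the two hunks above is to size syscall tables from a generated maximum rather than a hardcoded constant: asm-offsets emits __NR_syscall_max, and unistd_64.h derives NR_syscalls from it, with COMPILE_OFFSETS breaking the include cycle while the offsets file itself is built. A minimal user-space sketch of "derive the table size from a max index defined elsewhere" (the 298 value is a stand-in, not taken from the patch):

#include <stdio.h>

#define __NR_syscall_max 298            /* stand-in for the generated constant */
#define NR_syscalls (__NR_syscall_max + 1)

static const char *names[NR_syscalls];  /* table sized by the derived count */

int main(void)
{
	names[0] = "read";
	printf("table holds %d entries\n", NR_syscalls);
	return 0;
}
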
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9fe..9dbb527e1652 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 	unsigned long return_hooker = (unsigned long)
 				&return_to_handler;
 
-	/* Nmi's are currently unsupported */
-	if (unlikely(in_nmi()))
-		return;
-
 	if (unlikely(atomic_read(&current->tracing_graph_pause)))
 		return;
 
@@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
 
 struct syscall_metadata *syscall_nr_to_meta(int nr)
 {
-	if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
+	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
 		return NULL;
 
 	return syscalls_metadata[nr];
 }
 
-void arch_init_ftrace_syscalls(void)
+int syscall_name_to_nr(char *name)
+{
+	int i;
+
+	if (!syscalls_metadata)
+		return -1;
+
+	for (i = 0; i < NR_syscalls; i++) {
+		if (syscalls_metadata[i]) {
+			if (!strcmp(syscalls_metadata[i]->name, name))
+				return i;
+		}
+	}
+	return -1;
+}
+
+void set_syscall_enter_id(int num, int id)
+{
+	syscalls_metadata[num]->enter_id = id;
+}
+
+void set_syscall_exit_id(int num, int id)
+{
+	syscalls_metadata[num]->exit_id = id;
+}
+
+static int __init arch_init_ftrace_syscalls(void)
 {
 	int i;
 	struct syscall_metadata *meta;
 	unsigned long **psys_syscall_table = &sys_call_table;
-	static atomic_t refs;
-
-	if (atomic_inc_return(&refs) != 1)
-		goto end;
 
 	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
-					FTRACE_SYSCALL_MAX, GFP_KERNEL);
+					NR_syscalls, GFP_KERNEL);
 	if (!syscalls_metadata) {
 		WARN_ON(1);
-		return;
+		return -ENOMEM;
 	}
 
-	for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
+	for (i = 0; i < NR_syscalls; i++) {
 		meta = find_syscall_meta(psys_syscall_table[i]);
 		syscalls_metadata[i] = meta;
 	}
-	return;
-
-	/* Paranoid: avoid overflow */
-end:
-	atomic_dec(&refs);
+	return 0;
 }
+arch_initcall(arch_init_ftrace_syscalls);
 #endif
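
The hunk above replaces lazy, refcounted metadata setup with a single arch_initcall that fills an NR_syscalls-sized array, and adds a linear name-to-number lookup. A compact user-space model of that nr-indexed metadata table and search (struct and function names mirror the diff, but the code is a stand-alone sketch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_syscalls 4

struct syscall_metadata {
	const char *name;
	int enter_id;
	int exit_id;
};

static struct syscall_metadata **syscalls_metadata;

static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;
	return syscalls_metadata[nr];
}

static int syscall_name_to_nr(const char *name)
{
	int i;

	for (i = 0; i < NR_syscalls; i++)
		if (syscalls_metadata && syscalls_metadata[i] &&
		    !strcmp(syscalls_metadata[i]->name, name))
			return i;
	return -1;
}

int main(void)
{
	static struct syscall_metadata meta_read = { .name = "sys_read" };

	syscalls_metadata = calloc(NR_syscalls, sizeof(*syscalls_metadata));
	syscalls_metadata[0] = &meta_read;

	printf("sys_read -> nr %d\n", syscall_name_to_nr("sys_read"));
	printf("nr 0 -> %s\n", syscall_nr_to_meta(0)->name);
	free(syscalls_metadata);
	return 0;
}
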
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 09ecbde91c13..8d7d5c9c1be3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -35,10 +35,11 @@
 #include <asm/proto.h>
 #include <asm/ds.h>
 
-#include <trace/syscall.h>
-
 #include "tls.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
 enum x86_regset {
 	REGSET_GENERAL,
 	REGSET_FP,
@@ -1497,8 +1498,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
 	    tracehook_report_syscall_entry(regs))
 		ret = -1L;
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
-		ftrace_syscall_enter(regs);
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->orig_ax);
 
 	if (unlikely(current->audit_context)) {
 		if (IS_IA32)
@@ -1523,8 +1524,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
 	if (unlikely(current->audit_context))
 		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
-		ftrace_syscall_exit(regs);
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_exit(regs, regs->ax);
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, 0);
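
What matters in the hunks above is the placement: the entry hook sees the syscall number (orig_ax), the exit hook sees the return value (ax), and both fire only when the per-task flag is set. A minimal user-space model of that control flow (stand-in names; not the kernel path itself):

#include <stdio.h>

static int syscall_tracepoint_enabled = 1;  /* models TIF_SYSCALL_TRACEPOINT */

static void trace_sys_enter_model(long nr)  { printf("sys_enter: NR %ld\n", nr); }
static void trace_sys_exit_model(long ret)  { printf("sys_exit: ret %ld\n", ret); }

static long do_syscall(long nr)
{
	long ret;

	if (syscall_tracepoint_enabled)     /* entry hook sees the number */
		trace_sys_enter_model(nr);

	ret = 42;                           /* stand-in for the real handler */

	if (syscall_tracepoint_enabled)     /* exit hook sees the return value */
		trace_sys_exit_model(ret);
	return ret;
}

int main(void)
{
	do_syscall(0);
	return 0;
}
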
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..45e00eb09c3a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,9 +18,9 @@
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
 
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
-		unsigned long prot, unsigned long flags,
-		unsigned long fd, unsigned long off)
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, off)
 {
 	long error;
 	struct file *file;
@@ -226,7 +226,7 @@ bottomup:
 }
 
 
-asmlinkage long sys_uname(struct new_utsname __user *name)
+SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
 {
 	int err;
 	down_read(&uts_sem);
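
The point of converting these handlers to SYSCALL_DEFINEx is that the macro receives the argument list as type/name pairs, so it can both build the prototype and stringify the pieces for the syscall metadata. A toy model of why the comma-separated form is useful (this is an illustrative macro, not the kernel's SYSCALL_DEFINE implementation):

#include <stdio.h>

#define MY_SYSCALL_DEFINE1(sname, t1, a1)				\
	static const char *args_##sname[] = { #t1 " " #a1 };		\
	long my_sys_##sname(t1 a1)

MY_SYSCALL_DEFINE1(uname, const char *, name)
{
	printf("uname(%s), arg described as \"%s\"\n", name, args_uname[0]);
	return 0;
}

int main(void)
{
	return (int)my_sys_uname("utsname buffer");
}
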
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index a81170de7f6b..23f7179bf74e 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -93,16 +93,22 @@ void tracing_generic_entry_update(struct trace_entry *entry,
93 unsigned long flags, 93 unsigned long flags,
94 int pc); 94 int pc);
95struct ring_buffer_event * 95struct ring_buffer_event *
96trace_current_buffer_lock_reserve(int type, unsigned long len, 96trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
97 int type, unsigned long len,
97 unsigned long flags, int pc); 98 unsigned long flags, int pc);
98void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 99void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
100 struct ring_buffer_event *event,
99 unsigned long flags, int pc); 101 unsigned long flags, int pc);
100void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 102void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
103 struct ring_buffer_event *event,
101 unsigned long flags, int pc); 104 unsigned long flags, int pc);
102void trace_current_buffer_discard_commit(struct ring_buffer_event *event); 105void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
106 struct ring_buffer_event *event);
103 107
104void tracing_record_cmdline(struct task_struct *tsk); 108void tracing_record_cmdline(struct task_struct *tsk);
105 109
110struct event_filter;
111
106struct ftrace_event_call { 112struct ftrace_event_call {
107 struct list_head list; 113 struct list_head list;
108 char *name; 114 char *name;
@@ -110,16 +116,18 @@ struct ftrace_event_call {
110 struct dentry *dir; 116 struct dentry *dir;
111 struct trace_event *event; 117 struct trace_event *event;
112 int enabled; 118 int enabled;
113 int (*regfunc)(void); 119 int (*regfunc)(void *);
114 void (*unregfunc)(void); 120 void (*unregfunc)(void *);
115 int id; 121 int id;
116 int (*raw_init)(void); 122 int (*raw_init)(void);
117 int (*show_format)(struct trace_seq *s); 123 int (*show_format)(struct ftrace_event_call *call,
118 int (*define_fields)(void); 124 struct trace_seq *s);
125 int (*define_fields)(struct ftrace_event_call *);
119 struct list_head fields; 126 struct list_head fields;
120 int filter_active; 127 int filter_active;
121 void *filter; 128 struct event_filter *filter;
122 void *mod; 129 void *mod;
130 void *data;
123 131
124 atomic_t profile_count; 132 atomic_t profile_count;
125 int (*profile_enable)(struct ftrace_event_call *); 133 int (*profile_enable)(struct ftrace_event_call *);
@@ -129,15 +137,25 @@ struct ftrace_event_call {
129#define MAX_FILTER_PRED 32 137#define MAX_FILTER_PRED 32
130#define MAX_FILTER_STR_VAL 128 138#define MAX_FILTER_STR_VAL 128
131 139
132extern int init_preds(struct ftrace_event_call *call);
133extern void destroy_preds(struct ftrace_event_call *call); 140extern void destroy_preds(struct ftrace_event_call *call);
134extern int filter_match_preds(struct ftrace_event_call *call, void *rec); 141extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
135extern int filter_current_check_discard(struct ftrace_event_call *call, 142extern int filter_current_check_discard(struct ring_buffer *buffer,
143 struct ftrace_event_call *call,
136 void *rec, 144 void *rec,
137 struct ring_buffer_event *event); 145 struct ring_buffer_event *event);
138 146
139extern int trace_define_field(struct ftrace_event_call *call, char *type, 147enum {
140 char *name, int offset, int size, int is_signed); 148 FILTER_OTHER = 0,
149 FILTER_STATIC_STRING,
150 FILTER_DYN_STRING,
151 FILTER_PTR_STRING,
152};
153
154extern int trace_define_field(struct ftrace_event_call *call,
155 const char *type, const char *name,
156 int offset, int size, int is_signed,
157 int filter_type);
158extern int trace_define_common_fields(struct ftrace_event_call *call);
141 159
142#define is_signed_type(type) (((type)(-1)) < 0) 160#define is_signed_type(type) (((type)(-1)) < 0)
143 161
@@ -162,11 +180,4 @@ do { \
162 __trace_printk(ip, fmt, ##args); \ 180 __trace_printk(ip, fmt, ##args); \
163} while (0) 181} while (0)
164 182
165#define __common_field(type, item, is_signed) \
166 ret = trace_define_field(event_call, #type, "common_" #item, \
167 offsetof(typeof(field.ent), item), \
168 sizeof(field.ent.item), is_signed); \
169 if (ret) \
170 return ret;
171
172#endif /* _LINUX_FTRACE_EVENT_H */ 183#endif /* _LINUX_FTRACE_EVENT_H */
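
The ftrace_event.h hunks above extend trace_define_field() with a filter type and hand the ring buffer to the commit helpers explicitly. A small user-space sketch of the field-description idea behind define_fields callbacks: one record per field with type, name, offset, size, signedness and filter class (the record layout here is invented for the example):

#include <stddef.h>
#include <stdio.h>

enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_PTR_STRING };

struct field_desc {
	const char *type;
	const char *name;
	int offset;
	int size;
	int is_signed;
	int filter_type;
};

struct sample_record {
	int pid;
	char comm[16];
};

/* One entry per field, in the spirit of trace_define_field(). */
static const struct field_desc sample_fields[] = {
	{ "int",      "pid",  offsetof(struct sample_record, pid),
	  sizeof(int), 1, FILTER_OTHER },
	{ "char[16]", "comm", offsetof(struct sample_record, comm),
	  16,          0, FILTER_STATIC_STRING },
};

int main(void)
{
	for (size_t i = 0; i < sizeof(sample_fields) / sizeof(sample_fields[0]); i++)
		printf("field:%s %s; offset:%d; size:%d; filter:%d\n",
		       sample_fields[i].type, sample_fields[i].name,
		       sample_fields[i].offset, sample_fields[i].size,
		       sample_fields[i].filter_type);
	return 0;
}
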
diff --git a/include/linux/module.h b/include/linux/module.h
index 098bdb7bfacf..f8f92d015efe 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,10 +17,12 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/marker.h> 18#include <linux/marker.h>
19#include <linux/tracepoint.h> 19#include <linux/tracepoint.h>
20#include <asm/local.h>
21 20
21#include <asm/local.h>
22#include <asm/module.h> 22#include <asm/module.h>
23 23
24#include <trace/events/module.h>
25
24/* Not Yet Implemented */ 26/* Not Yet Implemented */
25#define MODULE_SUPPORTED_DEVICE(name) 27#define MODULE_SUPPORTED_DEVICE(name)
26 28
@@ -462,7 +464,10 @@ static inline local_t *__module_ref_addr(struct module *mod, int cpu)
462static inline void __module_get(struct module *module) 464static inline void __module_get(struct module *module)
463{ 465{
464 if (module) { 466 if (module) {
465 local_inc(__module_ref_addr(module, get_cpu())); 467 unsigned int cpu = get_cpu();
468 local_inc(__module_ref_addr(module, cpu));
469 trace_module_get(module, _THIS_IP_,
470 local_read(__module_ref_addr(module, cpu)));
466 put_cpu(); 471 put_cpu();
467 } 472 }
468} 473}
@@ -473,8 +478,11 @@ static inline int try_module_get(struct module *module)
473 478
474 if (module) { 479 if (module) {
475 unsigned int cpu = get_cpu(); 480 unsigned int cpu = get_cpu();
476 if (likely(module_is_live(module))) 481 if (likely(module_is_live(module))) {
477 local_inc(__module_ref_addr(module, cpu)); 482 local_inc(__module_ref_addr(module, cpu));
483 trace_module_get(module, _THIS_IP_,
484 local_read(__module_ref_addr(module, cpu)));
485 }
478 else 486 else
479 ret = 0; 487 ret = 0;
480 put_cpu(); 488 put_cpu();
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e022b847c90d..972f90d7a32f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -766,6 +766,8 @@ extern int sysctl_perf_counter_mlock;
 extern int sysctl_perf_counter_sample_rate;
 
 extern void perf_counter_init(void);
+extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
+				 void *record, int entry_size);
 
 #ifndef perf_misc_flags
 #define perf_misc_flags(regs)	(user_mode(regs) ? PERF_EVENT_MISC_USER : \
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 29f8599e6bea..5fcc31ed5771 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -75,20 +75,6 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
 }
 
 /*
- * ring_buffer_event_discard can discard any event in the ring buffer.
- *   it is up to the caller to protect against a reader from
- *   consuming it or a writer from wrapping and replacing it.
- *
- * No external protection is needed if this is called before
- * the event is commited. But in that case it would be better to
- * use ring_buffer_discard_commit.
- *
- * Note, if an event that has not been committed is discarded
- * with ring_buffer_event_discard, it must still be committed.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event);
-
-/*
  * ring_buffer_discard_commit will remove an event that has not
  *   ben committed yet. If this is used, then ring_buffer_unlock_commit
  *   must not be called on the discarded event. This function
@@ -154,8 +140,17 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer);
 void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
 void ring_buffer_reset(struct ring_buffer *buffer);
 
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
 int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 			 struct ring_buffer *buffer_b, int cpu);
+#else
+static inline int
+ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+		     struct ring_buffer *buffer_b, int cpu)
+{
+	return -ENODEV;
+}
+#endif
 
 int ring_buffer_empty(struct ring_buffer *buffer);
 int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
@@ -170,7 +165,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
-unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
 
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
 void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
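
With ring_buffer_swap_cpu() now compiled in only when CONFIG_RING_BUFFER_ALLOW_SWAP is set, callers have to cope with a stub that returns -ENODEV. A user-space sketch of that "optional feature returns -ENODEV" pattern (function names are stand-ins for the example):

#include <errno.h>
#include <stdio.h>

/* Flip this to model CONFIG_RING_BUFFER_ALLOW_SWAP being enabled. */
/* #define ALLOW_SWAP 1 */

#ifdef ALLOW_SWAP
static int swap_cpu_buffer(int cpu)
{
	printf("swapped per-cpu buffer %d\n", cpu);
	return 0;
}
#else
static inline int swap_cpu_buffer(int cpu)
{
	(void)cpu;
	return -ENODEV;	/* feature compiled out, as in the stub above */
}
#endif

int main(void)
{
	int ret = swap_cpu_buffer(0);

	if (ret == -ENODEV)
		fprintf(stderr, "buffer swap unavailable; report it in the trace\n");
	return 0;
}
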
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 80de7003d8c2..a8e37821cc60 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -64,6 +64,7 @@ struct perf_counter_attr;
 #include <linux/sem.h>
 #include <asm/siginfo.h>
 #include <asm/signal.h>
+#include <linux/unistd.h>
 #include <linux/quota.h>
 #include <linux/key.h>
 #include <trace/syscall.h>
@@ -97,6 +98,53 @@ struct perf_counter_attr;
97#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) 98#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
98#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) 99#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
99 100
101#ifdef CONFIG_EVENT_PROFILE
102#define TRACE_SYS_ENTER_PROFILE(sname) \
103static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call) \
104{ \
105 int ret = 0; \
106 if (!atomic_inc_return(&event_enter_##sname.profile_count)) \
107 ret = reg_prof_syscall_enter("sys"#sname); \
108 return ret; \
109} \
110 \
111static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\
112{ \
113 if (atomic_add_negative(-1, &event_enter_##sname.profile_count)) \
114 unreg_prof_syscall_enter("sys"#sname); \
115}
116
117#define TRACE_SYS_EXIT_PROFILE(sname) \
118static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call) \
119{ \
120 int ret = 0; \
121 if (!atomic_inc_return(&event_exit_##sname.profile_count)) \
122 ret = reg_prof_syscall_exit("sys"#sname); \
123 return ret; \
124} \
125 \
126static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
127{ \
128 if (atomic_add_negative(-1, &event_exit_##sname.profile_count)) \
129 unreg_prof_syscall_exit("sys"#sname); \
130}
131
132#define TRACE_SYS_ENTER_PROFILE_INIT(sname) \
133 .profile_count = ATOMIC_INIT(-1), \
134 .profile_enable = prof_sysenter_enable_##sname, \
135 .profile_disable = prof_sysenter_disable_##sname,
136
137#define TRACE_SYS_EXIT_PROFILE_INIT(sname) \
138 .profile_count = ATOMIC_INIT(-1), \
139 .profile_enable = prof_sysexit_enable_##sname, \
140 .profile_disable = prof_sysexit_disable_##sname,
141#else
142#define TRACE_SYS_ENTER_PROFILE(sname)
143#define TRACE_SYS_ENTER_PROFILE_INIT(sname)
144#define TRACE_SYS_EXIT_PROFILE(sname)
145#define TRACE_SYS_EXIT_PROFILE_INIT(sname)
146#endif
147
100#ifdef CONFIG_FTRACE_SYSCALLS 148#ifdef CONFIG_FTRACE_SYSCALLS
101#define __SC_STR_ADECL1(t, a) #a 149#define __SC_STR_ADECL1(t, a) #a
102#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) 150#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
@@ -112,7 +160,81 @@ struct perf_counter_attr;
112#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) 160#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
113#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) 161#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
114 162
163#define SYSCALL_TRACE_ENTER_EVENT(sname) \
164 static struct ftrace_event_call event_enter_##sname; \
165 struct trace_event enter_syscall_print_##sname = { \
166 .trace = print_syscall_enter, \
167 }; \
168 static int init_enter_##sname(void) \
169 { \
170 int num, id; \
171 num = syscall_name_to_nr("sys"#sname); \
172 if (num < 0) \
173 return -ENOSYS; \
174 id = register_ftrace_event(&enter_syscall_print_##sname);\
175 if (!id) \
176 return -ENODEV; \
177 event_enter_##sname.id = id; \
178 set_syscall_enter_id(num, id); \
179 INIT_LIST_HEAD(&event_enter_##sname.fields); \
180 return 0; \
181 } \
182 TRACE_SYS_ENTER_PROFILE(sname); \
183 static struct ftrace_event_call __used \
184 __attribute__((__aligned__(4))) \
185 __attribute__((section("_ftrace_events"))) \
186 event_enter_##sname = { \
187 .name = "sys_enter"#sname, \
188 .system = "syscalls", \
189 .event = &event_syscall_enter, \
190 .raw_init = init_enter_##sname, \
191 .show_format = syscall_enter_format, \
192 .define_fields = syscall_enter_define_fields, \
193 .regfunc = reg_event_syscall_enter, \
194 .unregfunc = unreg_event_syscall_enter, \
195 .data = "sys"#sname, \
196 TRACE_SYS_ENTER_PROFILE_INIT(sname) \
197 }
198
199#define SYSCALL_TRACE_EXIT_EVENT(sname) \
200 static struct ftrace_event_call event_exit_##sname; \
201 struct trace_event exit_syscall_print_##sname = { \
202 .trace = print_syscall_exit, \
203 }; \
204 static int init_exit_##sname(void) \
205 { \
206 int num, id; \
207 num = syscall_name_to_nr("sys"#sname); \
208 if (num < 0) \
209 return -ENOSYS; \
210 id = register_ftrace_event(&exit_syscall_print_##sname);\
211 if (!id) \
212 return -ENODEV; \
213 event_exit_##sname.id = id; \
214 set_syscall_exit_id(num, id); \
215 INIT_LIST_HEAD(&event_exit_##sname.fields); \
216 return 0; \
217 } \
218 TRACE_SYS_EXIT_PROFILE(sname); \
219 static struct ftrace_event_call __used \
220 __attribute__((__aligned__(4))) \
221 __attribute__((section("_ftrace_events"))) \
222 event_exit_##sname = { \
223 .name = "sys_exit"#sname, \
224 .system = "syscalls", \
225 .event = &event_syscall_exit, \
226 .raw_init = init_exit_##sname, \
227 .show_format = syscall_exit_format, \
228 .define_fields = syscall_exit_define_fields, \
229 .regfunc = reg_event_syscall_exit, \
230 .unregfunc = unreg_event_syscall_exit, \
231 .data = "sys"#sname, \
232 TRACE_SYS_EXIT_PROFILE_INIT(sname) \
233 }
234
115#define SYSCALL_METADATA(sname, nb) \ 235#define SYSCALL_METADATA(sname, nb) \
236 SYSCALL_TRACE_ENTER_EVENT(sname); \
237 SYSCALL_TRACE_EXIT_EVENT(sname); \
116 static const struct syscall_metadata __used \ 238 static const struct syscall_metadata __used \
117 __attribute__((__aligned__(4))) \ 239 __attribute__((__aligned__(4))) \
118 __attribute__((section("__syscalls_metadata"))) \ 240 __attribute__((section("__syscalls_metadata"))) \
@@ -121,18 +243,23 @@ struct perf_counter_attr;
 	.nb_args 	= nb,				\
 	.types		= types_##sname,		\
 	.args		= args_##sname,			\
-	}
+	.enter_event	= &event_enter_##sname,		\
+	.exit_event	= &event_exit_##sname,		\
+	};
 
 #define SYSCALL_DEFINE0(sname)					\
+	SYSCALL_TRACE_ENTER_EVENT(_##sname);			\
+	SYSCALL_TRACE_EXIT_EVENT(_##sname);			\
 	static const struct syscall_metadata __used		\
 	__attribute__((__aligned__(4)))				\
 	__attribute__((section("__syscalls_metadata")))		\
 	__syscall_meta_##sname = {				\
 	.name 		= "sys_"#sname,				\
 	.nb_args 	= 0,					\
+	.enter_event	= &event_enter__##sname,		\
+	.exit_event	= &event_exit__##sname,			\
 	};							\
 	asmlinkage long sys_##sname(void)
-
 #else
 #define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
 #endif
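
The SYSCALL_TRACE_ENTER_EVENT()/SYSCALL_TRACE_EXIT_EVENT() macros added in this file generate, per syscall, an init routine that resolves the syscall name to a number, registers a trace event for the print format, and records the resulting id. A user-space sketch of just that control flow, with stand-in helpers in place of syscall_name_to_nr(), register_ftrace_event() and set_syscall_enter_id():

#include <stdio.h>
#include <string.h>

static int name_to_nr(const char *name)
{
	return strcmp(name, "sys_read") == 0 ? 0 : -1;
}

static int register_event(void)
{
	static int next_id = 100;	/* stand-in event-id allocator */
	return next_id++;
}

static int enter_id_for_nr[8];

static int init_enter_event(const char *name)
{
	int num, id;

	num = name_to_nr(name);
	if (num < 0)
		return -1;		/* no such syscall: skip the event */
	id = register_event();
	if (!id)
		return -1;
	enter_id_for_nr[num] = id;	/* as set_syscall_enter_id() does */
	return 0;
}

int main(void)
{
	if (init_enter_event("sys_read") == 0)
		printf("sys_enter event id for nr 0: %d\n", enter_id_for_nr[0]);
	return 0;
}
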
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index b9dc4ca0246f..63a3f7a80580 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -23,6 +23,8 @@ struct tracepoint;
 struct tracepoint {
 	const char *name;		/* Tracepoint name */
 	int state;			/* State. */
+	void (*regfunc)(void);
+	void (*unregfunc)(void);
 	void **funcs;
 } __attribute__((aligned(32)));		/*
 					 * Aligned on 32 bytes because it is
@@ -78,12 +80,16 @@ struct tracepoint {
 		return tracepoint_probe_unregister(#name, (void *)probe);\
 	}
 
-#define DEFINE_TRACE(name)					\
+
+#define DEFINE_TRACE_FN(name, reg, unreg)			\
 	static const char __tpstrtab_##name[]			\
 	__attribute__((section("__tracepoints_strings"))) = #name; \
 	struct tracepoint __tracepoint_##name			\
 	__attribute__((section("__tracepoints"), aligned(32))) = \
-	{ __tpstrtab_##name, 0, NULL }
+	{ __tpstrtab_##name, 0, reg, unreg, NULL }
+
+#define DEFINE_TRACE(name)					\
+	DEFINE_TRACE_FN(name, NULL, NULL);
 
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)				\
 	EXPORT_SYMBOL_GPL(__tracepoint_##name)
@@ -108,6 +114,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
 		return -ENOSYS;					\
 	}
 
+#define DEFINE_TRACE_FN(name, reg, unreg)
 #define DEFINE_TRACE(name)
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)
@@ -158,6 +165,15 @@ static inline void tracepoint_synchronize_unregister(void)
 
 #define PARAMS(args...) args
 
+#endif /* _LINUX_TRACEPOINT_H */
+
+/*
+ * Note: we keep the TRACE_EVENT outside the include file ifdef protection.
+ * This is due to the way trace events work. If a file includes two
+ * trace event headers under one "CREATE_TRACE_POINTS" the first include
+ * will override the TRACE_EVENT and break the second include.
+ */
+
 #ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
@@ -259,10 +275,15 @@ static inline void tracepoint_synchronize_unregister(void)
  * can also by used by generic instrumentation like SystemTap), and
  * it is also used to expose a structured trace record in
  * /sys/kernel/debug/tracing/events/.
+ *
+ * A set of (un)registration functions can be passed to the variant
+ * TRACE_EVENT_FN to perform any (un)registration work.
  */
 
 #define TRACE_EVENT(name, proto, args, struct, assign, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-#endif
+#define TRACE_EVENT_FN(name, proto, args, struct,		\
+		assign, print, reg, unreg)			\
+	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 
-#endif
+#endif /* ifdef TRACE_EVENT (see note above) */
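
The new regfunc/unregfunc pointers let a tracepoint run setup work when its first probe attaches and teardown when the last one detaches (this is how the syscall tracepoints flip the per-task flag on demand). A user-space model of that hand-off, with invented struct and function names:

#include <stdio.h>

struct model_tracepoint {
	const char *name;
	int state;			/* number of attached probes */
	void (*regfunc)(void);		/* run when the first probe attaches */
	void (*unregfunc)(void);	/* run when the last probe detaches */
};

static void syscall_regfunc_model(void)   { printf("enable per-task syscall flag\n"); }
static void syscall_unregfunc_model(void) { printf("disable per-task syscall flag\n"); }

static struct model_tracepoint tp_sys_enter = {
	"sys_enter", 0, syscall_regfunc_model, syscall_unregfunc_model
};

static void probe_register(struct model_tracepoint *tp)
{
	if (tp->state++ == 0 && tp->regfunc)
		tp->regfunc();
}

static void probe_unregister(struct model_tracepoint *tp)
{
	if (--tp->state == 0 && tp->unregfunc)
		tp->unregfunc();
}

int main(void)
{
	probe_register(&tp_sys_enter);
	probe_unregister(&tp_sys_enter);
	return 0;
}
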
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index f7a7ae1e8f90..2a4b3bf74033 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -26,6 +26,11 @@
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
 	DEFINE_TRACE(name)
 
+#undef TRACE_EVENT_FN
+#define TRACE_EVENT_FN(name, proto, args, tstruct,		\
+		assign, print, reg, unreg)			\
+	DEFINE_TRACE_FN(name, reg, unreg)
+
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)	\
 	DEFINE_TRACE(name)
@@ -56,6 +61,8 @@
 #include <trace/ftrace.h>
 #endif
 
+#undef TRACE_EVENT
+#undef TRACE_EVENT_FN
 #undef TRACE_HEADER_MULTI_READ
 
 /* Only undef what we defined in this file */
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
new file mode 100644
index 000000000000..84160fb18478
--- /dev/null
+++ b/include/trace/events/module.h
@@ -0,0 +1,126 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM module
3
4#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_MODULE_H
6
7#include <linux/tracepoint.h>
8
9#ifdef CONFIG_MODULES
10
11struct module;
12
13#define show_module_flags(flags) __print_flags(flags, "", \
14 { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \
15 { (1UL << TAINT_FORCED_MODULE), "F" }, \
16 { (1UL << TAINT_CRAP), "C" })
17
18TRACE_EVENT(module_load,
19
20 TP_PROTO(struct module *mod),
21
22 TP_ARGS(mod),
23
24 TP_STRUCT__entry(
25 __field( unsigned int, taints )
26 __string( name, mod->name )
27 ),
28
29 TP_fast_assign(
30 __entry->taints = mod->taints;
31 __assign_str(name, mod->name);
32 ),
33
34 TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
35);
36
37TRACE_EVENT(module_free,
38
39 TP_PROTO(struct module *mod),
40
41 TP_ARGS(mod),
42
43 TP_STRUCT__entry(
44 __string( name, mod->name )
45 ),
46
47 TP_fast_assign(
48 __assign_str(name, mod->name);
49 ),
50
51 TP_printk("%s", __get_str(name))
52);
53
54TRACE_EVENT(module_get,
55
56 TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
57
58 TP_ARGS(mod, ip, refcnt),
59
60 TP_STRUCT__entry(
61 __field( unsigned long, ip )
62 __field( int, refcnt )
63 __string( name, mod->name )
64 ),
65
66 TP_fast_assign(
67 __entry->ip = ip;
68 __entry->refcnt = refcnt;
69 __assign_str(name, mod->name);
70 ),
71
72 TP_printk("%s call_site=%pf refcnt=%d",
73 __get_str(name), (void *)__entry->ip, __entry->refcnt)
74);
75
76TRACE_EVENT(module_put,
77
78 TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
79
80 TP_ARGS(mod, ip, refcnt),
81
82 TP_STRUCT__entry(
83 __field( unsigned long, ip )
84 __field( int, refcnt )
85 __string( name, mod->name )
86 ),
87
88 TP_fast_assign(
89 __entry->ip = ip;
90 __entry->refcnt = refcnt;
91 __assign_str(name, mod->name);
92 ),
93
94 TP_printk("%s call_site=%pf refcnt=%d",
95 __get_str(name), (void *)__entry->ip, __entry->refcnt)
96);
97
98TRACE_EVENT(module_request,
99
100 TP_PROTO(char *name, bool wait, unsigned long ip),
101
102 TP_ARGS(name, wait, ip),
103
104 TP_STRUCT__entry(
105 __field( bool, wait )
106 __field( unsigned long, ip )
107 __string( name, name )
108 ),
109
110 TP_fast_assign(
111 __entry->wait = wait;
112 __entry->ip = ip;
113 __assign_str(name, name);
114 ),
115
116 TP_printk("%s wait=%d call_site=%pf",
117 __get_str(name), (int)__entry->wait, (void *)__entry->ip)
118);
119
120#endif /* CONFIG_MODULES */
121
122#endif /* _TRACE_MODULE_H */
123
124/* This part must be outside protection */
125#include <trace/define_trace.h>
126
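
TRACE_EVENT also generates register/unregister helpers for each of the module events above, so other kernel code can attach probes. A sketch of how a loadable module might hook module_load, assuming the register_trace_module_load()/unregister_trace_module_load() helpers that TRACE_EVENT generates in this tree (illustrative only, not compiled here):

#include <linux/module.h>
#include <linux/kernel.h>
#include <trace/events/module.h>

static void probe_module_load(struct module *mod)
{
	pr_info("module loaded: %s\n", mod->name);
}

static int __init probe_init(void)
{
	return register_trace_module_load(probe_module_load);
}

static void __exit probe_exit(void)
{
	unregister_trace_module_load(probe_module_load);
	tracepoint_synchronize_unregister();
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");
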
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index a4c369ec328f..b48f1ad7c946 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -94,6 +94,7 @@ TRACE_EVENT(sched_wakeup,
 		__field( pid_t, pid )
 		__field( int, prio )
 		__field( int, success )
+		__field( int, cpu )
 	),
 
 	TP_fast_assign(
@@ -101,11 +102,12 @@ TRACE_EVENT(sched_wakeup,
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
 		__entry->success	= success;
+		__entry->cpu		= task_cpu(p);
 	),
 
-	TP_printk("task %s:%d [%d] success=%d",
+	TP_printk("task %s:%d [%d] success=%d [%03d]",
 		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
+		  __entry->success, __entry->cpu)
 );
 
 /*
@@ -125,6 +127,7 @@ TRACE_EVENT(sched_wakeup_new,
 		__field( pid_t, pid )
 		__field( int, prio )
 		__field( int, success )
+		__field( int, cpu )
 	),
 
 	TP_fast_assign(
@@ -132,11 +135,12 @@ TRACE_EVENT(sched_wakeup_new,
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
 		__entry->success	= success;
+		__entry->cpu		= task_cpu(p);
 	),
 
-	TP_printk("task %s:%d [%d] success=%d",
+	TP_printk("task %s:%d [%d] success=%d [%03d]",
 		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
+		  __entry->success, __entry->cpu)
 );
 
 /*
diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h
new file mode 100644
index 000000000000..397dff2dbd5a
--- /dev/null
+++ b/include/trace/events/syscalls.h
@@ -0,0 +1,70 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM syscalls
3
4#if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_EVENTS_SYSCALLS_H
6
7#include <linux/tracepoint.h>
8
9#include <asm/ptrace.h>
10#include <asm/syscall.h>
11
12
13#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
14
15extern void syscall_regfunc(void);
16extern void syscall_unregfunc(void);
17
18TRACE_EVENT_FN(sys_enter,
19
20 TP_PROTO(struct pt_regs *regs, long id),
21
22 TP_ARGS(regs, id),
23
24 TP_STRUCT__entry(
25 __field( long, id )
26 __array( unsigned long, args, 6 )
27 ),
28
29 TP_fast_assign(
30 __entry->id = id;
31 syscall_get_arguments(current, regs, 0, 6, __entry->args);
32 ),
33
34 TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)",
35 __entry->id,
36 __entry->args[0], __entry->args[1], __entry->args[2],
37 __entry->args[3], __entry->args[4], __entry->args[5]),
38
39 syscall_regfunc, syscall_unregfunc
40);
41
42TRACE_EVENT_FN(sys_exit,
43
44 TP_PROTO(struct pt_regs *regs, long ret),
45
46 TP_ARGS(regs, ret),
47
48 TP_STRUCT__entry(
49 __field( long, id )
50 __field( long, ret )
51 ),
52
53 TP_fast_assign(
54 __entry->id = syscall_get_nr(current, regs);
55 __entry->ret = ret;
56 ),
57
58 TP_printk("NR %ld = %ld",
59 __entry->id, __entry->ret),
60
61 syscall_regfunc, syscall_unregfunc
62);
63
64#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
65
66#endif /* _TRACE_EVENTS_SYSCALLS_H */
67
68/* This part must be outside protection */
69#include <trace/define_trace.h>
70
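
Because sys_enter/sys_exit are defined with TRACE_EVENT_FN, attaching the first probe also runs syscall_regfunc() and sets the per-task flag. A sketch of a kernel module attaching a probe to sys_enter under that assumption (probe signature follows the TP_PROTO above; this is illustrative, not compiled here):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <asm/syscall.h>
#include <trace/events/syscalls.h>

static void probe_sys_enter(struct pt_regs *regs, long id)
{
	unsigned long arg0[1];

	/* Pull the first argument the same way the event's fast-assign does. */
	syscall_get_arguments(current, regs, 0, 1, arg0);
	pr_debug("syscall %ld, first arg %lx\n", id, arg0[0]);
}

static int __init probe_init(void)
{
	return register_trace_sys_enter(probe_sys_enter);
}

static void __exit probe_exit(void)
{
	unregister_trace_sys_enter(probe_sys_enter);
	tracepoint_synchronize_unregister();
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");
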
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index f64fbaae781a..308bafd93325 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -21,11 +21,14 @@
21#undef __field 21#undef __field
22#define __field(type, item) type item; 22#define __field(type, item) type item;
23 23
24#undef __field_ext
25#define __field_ext(type, item, filter_type) type item;
26
24#undef __array 27#undef __array
25#define __array(type, item, len) type item[len]; 28#define __array(type, item, len) type item[len];
26 29
27#undef __dynamic_array 30#undef __dynamic_array
28#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; 31#define __dynamic_array(type, item, len) u32 __data_loc_##item;
29 32
30#undef __string 33#undef __string
31#define __string(item, src) __dynamic_array(char, item, -1) 34#define __string(item, src) __dynamic_array(char, item, -1)
@@ -42,6 +45,16 @@
42 }; \ 45 }; \
43 static struct ftrace_event_call event_##name 46 static struct ftrace_event_call event_##name
44 47
48#undef __cpparg
49#define __cpparg(arg...) arg
50
51/* Callbacks are meaningless to ftrace. */
52#undef TRACE_EVENT_FN
53#define TRACE_EVENT_FN(name, proto, args, tstruct, \
54 assign, print, reg, unreg) \
55 TRACE_EVENT(name, __cpparg(proto), __cpparg(args), \
56 __cpparg(tstruct), __cpparg(assign), __cpparg(print)) \
57
45#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 58#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
46 59
47 60
@@ -51,23 +64,27 @@
51 * Include the following: 64 * Include the following:
52 * 65 *
53 * struct ftrace_data_offsets_<call> { 66 * struct ftrace_data_offsets_<call> {
54 * int <item1>; 67 * u32 <item1>;
55 * int <item2>; 68 * u32 <item2>;
56 * [...] 69 * [...]
57 * }; 70 * };
58 * 71 *
59 * The __dynamic_array() macro will create each int <item>, this is 72 * The __dynamic_array() macro will create each u32 <item>, this is
60 * to keep the offset of each array from the beginning of the event. 73 * to keep the offset of each array from the beginning of the event.
74 * The size of an array is also encoded, in the higher 16 bits of <item>.
61 */ 75 */
62 76
63#undef __field 77#undef __field
64#define __field(type, item); 78#define __field(type, item)
79
80#undef __field_ext
81#define __field_ext(type, item, filter_type)
65 82
66#undef __array 83#undef __array
67#define __array(type, item, len) 84#define __array(type, item, len)
68 85
69#undef __dynamic_array 86#undef __dynamic_array
70#define __dynamic_array(type, item, len) int item; 87#define __dynamic_array(type, item, len) u32 item;
71 88
72#undef __string 89#undef __string
73#define __string(item, src) __dynamic_array(char, item, -1) 90#define __string(item, src) __dynamic_array(char, item, -1)
@@ -109,6 +126,9 @@
109 if (!ret) \ 126 if (!ret) \
110 return 0; 127 return 0;
111 128
129#undef __field_ext
130#define __field_ext(type, item, filter_type) __field(type, item)
131
112#undef __array 132#undef __array
113#define __array(type, item, len) \ 133#define __array(type, item, len) \
114 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 134 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
@@ -120,7 +140,7 @@
120 140
121#undef __dynamic_array 141#undef __dynamic_array
122#define __dynamic_array(type, item, len) \ 142#define __dynamic_array(type, item, len) \
123 ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ 143 ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\
124 "offset:%u;\tsize:%u;\n", \ 144 "offset:%u;\tsize:%u;\n", \
125 (unsigned int)offsetof(typeof(field), \ 145 (unsigned int)offsetof(typeof(field), \
126 __data_loc_##item), \ 146 __data_loc_##item), \
@@ -150,7 +170,8 @@
150#undef TRACE_EVENT 170#undef TRACE_EVENT
151#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ 171#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
152static int \ 172static int \
153ftrace_format_##call(struct trace_seq *s) \ 173ftrace_format_##call(struct ftrace_event_call *unused, \
174 struct trace_seq *s) \
154{ \ 175{ \
155 struct ftrace_raw_##call field __attribute__((unused)); \ 176 struct ftrace_raw_##call field __attribute__((unused)); \
156 int ret = 0; \ 177 int ret = 0; \
@@ -210,7 +231,7 @@ ftrace_format_##call(struct trace_seq *s) \
210 231
211#undef __get_dynamic_array 232#undef __get_dynamic_array
212#define __get_dynamic_array(field) \ 233#define __get_dynamic_array(field) \
213 ((void *)__entry + __entry->__data_loc_##field) 234 ((void *)__entry + (__entry->__data_loc_##field & 0xffff))
214 235
215#undef __get_str 236#undef __get_str
216#define __get_str(field) (char *)__get_dynamic_array(field) 237#define __get_str(field) (char *)__get_dynamic_array(field)
@@ -263,28 +284,33 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
263 284
264#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 285#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
265 286
266#undef __field 287#undef __field_ext
267#define __field(type, item) \ 288#define __field_ext(type, item, filter_type) \
268 ret = trace_define_field(event_call, #type, #item, \ 289 ret = trace_define_field(event_call, #type, #item, \
269 offsetof(typeof(field), item), \ 290 offsetof(typeof(field), item), \
270 sizeof(field.item), is_signed_type(type)); \ 291 sizeof(field.item), \
292 is_signed_type(type), filter_type); \
271 if (ret) \ 293 if (ret) \
272 return ret; 294 return ret;
273 295
296#undef __field
297#define __field(type, item) __field_ext(type, item, FILTER_OTHER)
298
274#undef __array 299#undef __array
275#define __array(type, item, len) \ 300#define __array(type, item, len) \
276 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 301 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
277 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 302 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
278 offsetof(typeof(field), item), \ 303 offsetof(typeof(field), item), \
279 sizeof(field.item), 0); \ 304 sizeof(field.item), 0, FILTER_OTHER); \
280 if (ret) \ 305 if (ret) \
281 return ret; 306 return ret;
282 307
283#undef __dynamic_array 308#undef __dynamic_array
284#define __dynamic_array(type, item, len) \ 309#define __dynamic_array(type, item, len) \
285 ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\ 310 ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \
286 offsetof(typeof(field), __data_loc_##item), \ 311 offsetof(typeof(field), __data_loc_##item), \
287 sizeof(field.__data_loc_##item), 0); 312 sizeof(field.__data_loc_##item), 0, \
313 FILTER_OTHER);
288 314
289#undef __string 315#undef __string
290#define __string(item, src) __dynamic_array(char, item, -1) 316#define __string(item, src) __dynamic_array(char, item, -1)
@@ -292,17 +318,14 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
292#undef TRACE_EVENT 318#undef TRACE_EVENT
293#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ 319#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
294int \ 320int \
295ftrace_define_fields_##call(void) \ 321ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
296{ \ 322{ \
297 struct ftrace_raw_##call field; \ 323 struct ftrace_raw_##call field; \
298 struct ftrace_event_call *event_call = &event_##call; \
299 int ret; \ 324 int ret; \
300 \ 325 \
301 __common_field(int, type, 1); \ 326 ret = trace_define_common_fields(event_call); \
302 __common_field(unsigned char, flags, 0); \ 327 if (ret) \
303 __common_field(unsigned char, preempt_count, 0); \ 328 return ret; \
304 __common_field(int, pid, 1); \
305 __common_field(int, tgid, 1); \
306 \ 329 \
307 tstruct; \ 330 tstruct; \
308 \ 331 \
@@ -321,6 +344,9 @@ ftrace_define_fields_##call(void) \
321#undef __field 344#undef __field
322#define __field(type, item) 345#define __field(type, item)
323 346
347#undef __field_ext
348#define __field_ext(type, item, filter_type)
349
324#undef __array 350#undef __array
325#define __array(type, item, len) 351#define __array(type, item, len)
326 352
@@ -328,6 +354,7 @@ ftrace_define_fields_##call(void) \
328#define __dynamic_array(type, item, len) \ 354#define __dynamic_array(type, item, len) \
329 __data_offsets->item = __data_size + \ 355 __data_offsets->item = __data_size + \
330 offsetof(typeof(*entry), __data); \ 356 offsetof(typeof(*entry), __data); \
357 __data_offsets->item |= (len * sizeof(type)) << 16; \
331 __data_size += (len) * sizeof(type); 358 __data_size += (len) * sizeof(type);
332 359
333#undef __string 360#undef __string
@@ -433,13 +460,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
433 * { 460 * {
434 * struct ring_buffer_event *event; 461 * struct ring_buffer_event *event;
435 * struct ftrace_raw_<call> *entry; <-- defined in stage 1 462 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
463 * struct ring_buffer *buffer;
436 * unsigned long irq_flags; 464 * unsigned long irq_flags;
437 * int pc; 465 * int pc;
438 * 466 *
439 * local_save_flags(irq_flags); 467 * local_save_flags(irq_flags);
440 * pc = preempt_count(); 468 * pc = preempt_count();
441 * 469 *
442 * event = trace_current_buffer_lock_reserve(event_<call>.id, 470 * event = trace_current_buffer_lock_reserve(&buffer,
471 * event_<call>.id,
443 * sizeof(struct ftrace_raw_<call>), 472 * sizeof(struct ftrace_raw_<call>),
444 * irq_flags, pc); 473 * irq_flags, pc);
445 * if (!event) 474 * if (!event)
@@ -449,7 +478,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
449 * <assign>; <-- Here we assign the entries by the __field and 478 * <assign>; <-- Here we assign the entries by the __field and
450 * __array macros. 479 * __array macros.
451 * 480 *
452 * trace_current_buffer_unlock_commit(event, irq_flags, pc); 481 * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
453 * } 482 * }
454 * 483 *
455 * static int ftrace_raw_reg_event_<call>(void) 484 * static int ftrace_raw_reg_event_<call>(void)
@@ -541,6 +570,7 @@ static void ftrace_raw_event_##call(proto) \
541 struct ftrace_event_call *event_call = &event_##call; \ 570 struct ftrace_event_call *event_call = &event_##call; \
542 struct ring_buffer_event *event; \ 571 struct ring_buffer_event *event; \
543 struct ftrace_raw_##call *entry; \ 572 struct ftrace_raw_##call *entry; \
573 struct ring_buffer *buffer; \
544 unsigned long irq_flags; \ 574 unsigned long irq_flags; \
545 int __data_size; \ 575 int __data_size; \
546 int pc; \ 576 int pc; \
@@ -550,7 +580,8 @@ static void ftrace_raw_event_##call(proto) \
550 \ 580 \
551 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 581 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
552 \ 582 \
553 event = trace_current_buffer_lock_reserve(event_##call.id, \ 583 event = trace_current_buffer_lock_reserve(&buffer, \
584 event_##call.id, \
554 sizeof(*entry) + __data_size, \ 585 sizeof(*entry) + __data_size, \
555 irq_flags, pc); \ 586 irq_flags, pc); \
556 if (!event) \ 587 if (!event) \
@@ -562,11 +593,12 @@ static void ftrace_raw_event_##call(proto) \
562 \ 593 \
563 { assign; } \ 594 { assign; } \
564 \ 595 \
565 if (!filter_current_check_discard(event_call, entry, event)) \ 596 if (!filter_current_check_discard(buffer, event_call, entry, event)) \
566 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ 597 trace_nowake_buffer_unlock_commit(buffer, \
598 event, irq_flags, pc); \
567} \ 599} \
568 \ 600 \
569static int ftrace_raw_reg_event_##call(void) \ 601static int ftrace_raw_reg_event_##call(void *ptr) \
570{ \ 602{ \
571 int ret; \ 603 int ret; \
572 \ 604 \
@@ -577,7 +609,7 @@ static int ftrace_raw_reg_event_##call(void) \
577 return ret; \ 609 return ret; \
578} \ 610} \
579 \ 611 \
580static void ftrace_raw_unreg_event_##call(void) \ 612static void ftrace_raw_unreg_event_##call(void *ptr) \
581{ \ 613{ \
582 unregister_trace_##call(ftrace_raw_event_##call); \ 614 unregister_trace_##call(ftrace_raw_event_##call); \
583} \ 615} \
@@ -595,7 +627,6 @@ static int ftrace_raw_init_event_##call(void) \
595 return -ENODEV; \ 627 return -ENODEV; \
596 event_##call.id = id; \ 628 event_##call.id = id; \
597 INIT_LIST_HEAD(&event_##call.fields); \ 629 INIT_LIST_HEAD(&event_##call.fields); \
598 init_preds(&event_##call); \
599 return 0; \ 630 return 0; \
600} \ 631} \
601 \ 632 \
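
The trace/ftrace.h hunks above widen __data_loc_##item to a u32 and pack the dynamic array's length into its upper 16 bits, with the offset in the lower 16 (which is why __get_dynamic_array() now masks with 0xffff). A user-space model of that encoding (helper names are invented for the example):

#include <stdint.h>
#include <stdio.h>

/* Pack a dynamic array's offset (low 16 bits) and length (high 16 bits). */
static uint32_t data_loc_encode(uint16_t offset, uint16_t len)
{
	return ((uint32_t)len << 16) | offset;
}

static uint16_t data_loc_offset(uint32_t loc) { return loc & 0xffff; }
static uint16_t data_loc_len(uint32_t loc)    { return loc >> 16; }

int main(void)
{
	uint32_t loc = data_loc_encode(24, 7);	/* 7 bytes starting at offset 24 */

	printf("offset=%u len=%u\n", data_loc_offset(loc), data_loc_len(loc));
	return 0;
}
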
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 8cfe515cbc47..5dc283ba5ae0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -1,8 +1,13 @@
 #ifndef _TRACE_SYSCALL_H
 #define _TRACE_SYSCALL_H
 
+#include <linux/tracepoint.h>
+#include <linux/unistd.h>
+#include <linux/ftrace_event.h>
+
 #include <asm/ptrace.h>
 
+
 /*
  * A syscall entry in the ftrace syscalls array.
  *
@@ -10,26 +15,49 @@
  * @nb_args: number of parameters it takes
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
+ * @enter_id: associated ftrace enter event id
+ * @exit_id: associated ftrace exit event id
+ * @enter_event: associated syscall_enter trace event
+ * @exit_event: associated syscall_exit trace event
  */
 struct syscall_metadata {
 	const char	*name;
 	int		nb_args;
 	const char	**types;
 	const char	**args;
+	int		enter_id;
+	int		exit_id;
+
+	struct ftrace_event_call *enter_event;
+	struct ftrace_event_call *exit_event;
 };
 
 #ifdef CONFIG_FTRACE_SYSCALLS
-extern void arch_init_ftrace_syscalls(void);
 extern struct syscall_metadata *syscall_nr_to_meta(int nr);
-extern void start_ftrace_syscalls(void);
-extern void stop_ftrace_syscalls(void);
-extern void ftrace_syscall_enter(struct pt_regs *regs);
-extern void ftrace_syscall_exit(struct pt_regs *regs);
-#else
-static inline void start_ftrace_syscalls(void) { }
-static inline void stop_ftrace_syscalls(void) { }
-static inline void ftrace_syscall_enter(struct pt_regs *regs) { }
-static inline void ftrace_syscall_exit(struct pt_regs *regs) { }
+extern int syscall_name_to_nr(char *name);
+void set_syscall_enter_id(int num, int id);
+void set_syscall_exit_id(int num, int id);
+extern struct trace_event event_syscall_enter;
+extern struct trace_event event_syscall_exit;
+extern int reg_event_syscall_enter(void *ptr);
+extern void unreg_event_syscall_enter(void *ptr);
+extern int reg_event_syscall_exit(void *ptr);
+extern void unreg_event_syscall_exit(void *ptr);
+extern int syscall_enter_format(struct ftrace_event_call *call,
+				struct trace_seq *s);
+extern int syscall_exit_format(struct ftrace_event_call *call,
+				struct trace_seq *s);
+extern int syscall_enter_define_fields(struct ftrace_event_call *call);
+extern int syscall_exit_define_fields(struct ftrace_event_call *call);
+enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
+enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
+#endif
+#ifdef CONFIG_EVENT_PROFILE
+int reg_prof_syscall_enter(char *name);
+void unreg_prof_syscall_enter(char *name);
+int reg_prof_syscall_exit(char *name);
+void unreg_prof_syscall_exit(char *name);
+
 #endif
 
 #endif /* _TRACE_SYSCALL_H */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4e8cae2e9148..9fcb53a11f87 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,8 @@
 #include <linux/suspend.h>
 #include <asm/uaccess.h>
 
+#include <trace/events/module.h>
+
 extern int max_threads;
 
 static struct workqueue_struct *khelper_wq;
@@ -112,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
 		return -ENOMEM;
 	}
 
+	trace_module_request(module_name, wait, _RET_IP_);
+
 	ret = call_usermodehelper(modprobe_path, argv, envp,
 			wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
 	atomic_dec(&kmod_concurrent);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0540948e29ab..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 234
241 /* Ensure no-one is preempted on the garbage slots */ 235 /* Ensure no-one is preempted on the garbage slots */
242 if (check_safety()) 236 if (check_safety())
243 return -EAGAIN; 237 return -EAGAIN;
244 238
245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
246 int i; 240 int i;
247 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
248 continue; 242 continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
260void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
261{ 255{
262 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
263 struct hlist_node *pos;
264 257
265 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
266 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
267 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
268 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
269 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
270 if (dirty) { 263 if (dirty) {
271 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
272 kip->ngarbage++; 265 kip->ngarbage++;
273 } else { 266 } else
274 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
275 }
276 break; 268 break;
277 } 269 }
278 } 270 }
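The kprobes hunks above swap the singly linked hlist for a plain list_head, largely because the hlist iterators of this era need an extra struct hlist_node cursor while list_for_each_entry() walks the entries directly. A minimal sketch of the resulting idiom, with illustrative names rather than the kprobes structures:

	#include <linux/list.h>

	struct ex_page {			/* illustrative stand-in for kprobe_insn_page */
		struct list_head list;
		int nused;
	};

	static LIST_HEAD(ex_pages);

	static struct ex_page *ex_find_free(void)
	{
		struct ex_page *p;

		/*
		 * The old idiom, hlist_for_each_entry(p, pos, &head, node),
		 * also needed a struct hlist_node *pos cursor; here the loop
		 * variable is the entry itself.
		 */
		list_for_each_entry(p, &ex_pages, list) {
			if (p->nused < 16)
				return p;
		}
		return NULL;
	}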
diff --git a/kernel/module.c b/kernel/module.c
index 2d537186191f..46580edff0cb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,11 @@
55#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h> 56#include <linux/kmemleak.h>
57 57
58#define CREATE_TRACE_POINTS
59#include <trace/events/module.h>
60
61EXPORT_TRACEPOINT_SYMBOL(module_get);
62
58#if 0 63#if 0
59#define DEBUGP printk 64#define DEBUGP printk
60#else 65#else
@@ -942,6 +947,8 @@ void module_put(struct module *module)
942 if (module) { 947 if (module) {
943 unsigned int cpu = get_cpu(); 948 unsigned int cpu = get_cpu();
944 local_dec(__module_ref_addr(module, cpu)); 949 local_dec(__module_ref_addr(module, cpu));
950 trace_module_put(module, _RET_IP_,
951 local_read(__module_ref_addr(module, cpu)));
945 /* Maybe they're waiting for us to drop reference? */ 952 /* Maybe they're waiting for us to drop reference? */
946 if (unlikely(!module_is_live(module))) 953 if (unlikely(!module_is_live(module)))
947 wake_up_process(module->waiter); 954 wake_up_process(module->waiter);
@@ -1497,6 +1504,8 @@ static int __unlink_module(void *_mod)
1497/* Free a module, remove from lists, etc (must hold module_mutex). */ 1504/* Free a module, remove from lists, etc (must hold module_mutex). */
1498static void free_module(struct module *mod) 1505static void free_module(struct module *mod)
1499{ 1506{
1507 trace_module_free(mod);
1508
1500 /* Delete from various lists */ 1509 /* Delete from various lists */
1501 stop_machine(__unlink_module, mod, NULL); 1510 stop_machine(__unlink_module, mod, NULL);
1502 remove_notes_attrs(mod); 1511 remove_notes_attrs(mod);
@@ -2364,6 +2373,8 @@ static noinline struct module *load_module(void __user *umod,
2364 /* Get rid of temporary copy */ 2373 /* Get rid of temporary copy */
2365 vfree(hdr); 2374 vfree(hdr);
2366 2375
2376 trace_module_load(mod);
2377
2367 /* Done! */ 2378 /* Done! */
2368 return mod; 2379 return mod;
2369 2380
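The module.c changes above only fire tracepoints; their definitions live in the new include/trace/events/module.h added elsewhere in this merge. As a hedged sketch of the general shape such a header takes (the event name and fields below are hypothetical, not the real module events):

	#undef TRACE_SYSTEM
	#define TRACE_SYSTEM module

	#if !defined(_TRACE_MODULE_EXAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
	#define _TRACE_MODULE_EXAMPLE_H

	#include <linux/tracepoint.h>

	TRACE_EVENT(module_example,		/* hypothetical event */

		TP_PROTO(struct module *mod),

		TP_ARGS(mod),

		TP_STRUCT__entry(
			__array(char, name, 16)	/* arbitrary length for the sketch */
		),

		TP_fast_assign(
			strlcpy(__entry->name, mod->name, 16);
		),

		TP_printk("%s", __entry->name)
	);

	#endif /* _TRACE_MODULE_EXAMPLE_H */

	/* This part must stay outside the guard */
	#include <trace/define_trace.h>

Defining CREATE_TRACE_POINTS before the include, as module.c does above, is what instantiates the tracepoint bodies; every other user just includes the header.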
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 019f380fd764..1ea0d1234f4a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -41,7 +41,7 @@ config HAVE_FTRACE_MCOUNT_RECORD
41config HAVE_HW_BRANCH_TRACER 41config HAVE_HW_BRANCH_TRACER
42 bool 42 bool
43 43
44config HAVE_FTRACE_SYSCALLS 44config HAVE_SYSCALL_TRACEPOINTS
45 bool 45 bool
46 46
47config TRACER_MAX_TRACE 47config TRACER_MAX_TRACE
@@ -60,9 +60,14 @@ config EVENT_TRACING
60 bool 60 bool
61 61
62config CONTEXT_SWITCH_TRACER 62config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool 63 bool
65 64
65config RING_BUFFER_ALLOW_SWAP
66 bool
67 help
68 Allow the use of ring_buffer_swap_cpu.
69 Adds a very slight overhead to tracing when enabled.
70
66# All tracer options should select GENERIC_TRACER. For those options that are 71# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING. 72# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the 73# This allows those options to appear when no other tracer is selected. But the
@@ -147,6 +152,7 @@ config IRQSOFF_TRACER
147 select TRACE_IRQFLAGS 152 select TRACE_IRQFLAGS
148 select GENERIC_TRACER 153 select GENERIC_TRACER
149 select TRACER_MAX_TRACE 154 select TRACER_MAX_TRACE
155 select RING_BUFFER_ALLOW_SWAP
150 help 156 help
151 This option measures the time spent in irqs-off critical 157 This option measures the time spent in irqs-off critical
152 sections, with microsecond accuracy. 158 sections, with microsecond accuracy.
@@ -168,6 +174,7 @@ config PREEMPT_TRACER
168 depends on PREEMPT 174 depends on PREEMPT
169 select GENERIC_TRACER 175 select GENERIC_TRACER
170 select TRACER_MAX_TRACE 176 select TRACER_MAX_TRACE
177 select RING_BUFFER_ALLOW_SWAP
171 help 178 help
172 This option measures the time spent in preemption off critical 179 This option measures the time spent in preemption off critical
173 sections, with microsecond accuracy. 180 sections, with microsecond accuracy.
@@ -211,7 +218,7 @@ config ENABLE_DEFAULT_TRACERS
211 218
212config FTRACE_SYSCALLS 219config FTRACE_SYSCALLS
213 bool "Trace syscalls" 220 bool "Trace syscalls"
214 depends on HAVE_FTRACE_SYSCALLS 221 depends on HAVE_SYSCALL_TRACEPOINTS
215 select GENERIC_TRACER 222 select GENERIC_TRACER
216 select KALLSYMS 223 select KALLSYMS
217 help 224 help
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7a34cb563fec..3eb159c277c8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -65,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
65{ 65{
66 struct blk_io_trace *t; 66 struct blk_io_trace *t;
67 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL;
68 int pc = 0; 69 int pc = 0;
69 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
70 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
71 72
72 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer;
73 pc = preempt_count(); 75 pc = preempt_count();
74 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
75 sizeof(*t) + len, 77 sizeof(*t) + len,
76 0, pc); 78 0, pc);
77 if (!event) 79 if (!event)
@@ -96,7 +98,7 @@ record_it:
96 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
97 99
98 if (blk_tracer) 100 if (blk_tracer)
99 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
100 } 102 }
101} 103}
102 104
@@ -179,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
179{ 181{
180 struct task_struct *tsk = current; 182 struct task_struct *tsk = current;
181 struct ring_buffer_event *event = NULL; 183 struct ring_buffer_event *event = NULL;
184 struct ring_buffer *buffer = NULL;
182 struct blk_io_trace *t; 185 struct blk_io_trace *t;
183 unsigned long flags = 0; 186 unsigned long flags = 0;
184 unsigned long *sequence; 187 unsigned long *sequence;
@@ -204,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
204 if (blk_tracer) { 207 if (blk_tracer) {
205 tracing_record_cmdline(current); 208 tracing_record_cmdline(current);
206 209
210 buffer = blk_tr->buffer;
207 pc = preempt_count(); 211 pc = preempt_count();
208 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 212 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
209 sizeof(*t) + pdu_len, 213 sizeof(*t) + pdu_len,
210 0, pc); 214 0, pc);
211 if (!event) 215 if (!event)
@@ -252,7 +256,7 @@ record_it:
252 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 256 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
253 257
254 if (blk_tracer) { 258 if (blk_tracer) {
255 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 259 trace_buffer_unlock_commit(buffer, event, 0, pc);
256 return; 260 return;
257 } 261 }
258 } 262 }
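Both blktrace paths now resolve blk_tr->buffer once and hand the struct ring_buffer, not the trace_array, to the reserve/commit helpers. Condensed to its core, the write path in the hunks above follows a reserve, fill, commit shape (a sketch with error handling trimmed, not a drop-in):

	struct ring_buffer *buffer = blk_tr->buffer;
	struct ring_buffer_event *event;
	struct blk_io_trace *t;
	int pc = preempt_count();

	event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
					  sizeof(*t) + len, 0, pc);
	if (!event)
		return;				/* no room, or recording disabled */

	t = ring_buffer_event_data(event);	/* fill the reserved slot */
	/* ... populate *t, then copy the payload after it ... */

	trace_buffer_unlock_commit(buffer, event, 0, pc);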
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 25edd5cc5935..8c804e24f96f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1016,71 +1016,35 @@ static int
1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1017{ 1017{
1018 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1019 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1020 1020
1021 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1022 1022
1023 ip = rec->ip;
1024
1025 /* 1023 /*
1026 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1027 * it is not enabled then do nothing. 1025 * then disable it.
1028 * 1026 *
1029 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1030 * it is enabled then disable it.
1031 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1032 */ 1031 */
1033 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1034 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1035 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1036 else 1035 }
1037 return 0;
1038
1039 } else if (ftrace_filtered && enable) {
1040 /*
1041 * Filtering is on:
1042 */
1043
1044 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1045
1046 /* Record is filtered and enabled, do nothing */
1047 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1048 return 0;
1049
1050 /* Record is not filtered or enabled, do nothing */
1051 if (!fl)
1052 return 0;
1053
1054 /* Record is not filtered but enabled, disable it */
1055 if (fl == FTRACE_FL_ENABLED)
1056 rec->flags &= ~FTRACE_FL_ENABLED;
1057 else
1058 /* Otherwise record is filtered but not enabled, enable it */
1059 rec->flags |= FTRACE_FL_ENABLED;
1060 } else {
1061 /* Disable or not filtered */
1062
1063 if (enable) {
1064 /* if record is enabled, do nothing */
1065 if (rec->flags & FTRACE_FL_ENABLED)
1066 return 0;
1067
1068 rec->flags |= FTRACE_FL_ENABLED;
1069
1070 } else {
1071 1036
1072 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1073 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1074 return 0; 1039 return 0;
1075 1040
1076 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1077 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1078 } 1044 }
1079 1045
1080 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1081 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1082 else
1083 return ftrace_make_nop(NULL, rec, ftrace_addr);
1084} 1048}
1085 1049
1086static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
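The rewritten __ftrace_replace_code() collapses the old nest of special cases into one pattern: compute the desired enable flag, compare it with the current one, and patch the call site only on a real transition. The same pattern in a self-contained sketch (illustrative names, not kernel code):

	#include <stdbool.h>

	#define FL_ENABLED	0x1
	#define FL_FILTER	0x2
	#define FL_NOTRACE	0x4

	/* Returns true if the caller must repatch the call site. */
	static bool update_record(unsigned long *flags, bool enable, bool filtering)
	{
		unsigned long want = 0;

		if (enable && !(*flags & FL_NOTRACE)) {
			if (!filtering || (*flags & FL_FILTER))
				want = FL_ENABLED;
		}

		if ((*flags & FL_ENABLED) == want)
			return false;		/* state unchanged, nothing to do */

		*flags ^= FL_ENABLED;		/* flip to the desired state */
		return true;			/* patch to a call or to a nop */
	}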
@@ -1375,7 +1339,6 @@ struct ftrace_iterator {
1375 unsigned flags; 1339 unsigned flags;
1376 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1340 unsigned char buffer[FTRACE_BUFF_MAX+1];
1377 unsigned buffer_idx; 1341 unsigned buffer_idx;
1378 unsigned filtered;
1379}; 1342};
1380 1343
1381static void * 1344static void *
@@ -1438,18 +1401,13 @@ static int t_hash_show(struct seq_file *m, void *v)
1438{ 1401{
1439 struct ftrace_func_probe *rec; 1402 struct ftrace_func_probe *rec;
1440 struct hlist_node *hnd = v; 1403 struct hlist_node *hnd = v;
1441 char str[KSYM_SYMBOL_LEN];
1442 1404
1443 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1405 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1444 1406
1445 if (rec->ops->print) 1407 if (rec->ops->print)
1446 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1447 1409
1448 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
1449 seq_printf(m, "%s:", str);
1450
1451 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1452 seq_printf(m, "%s", str);
1453 1411
1454 if (rec->data) 1412 if (rec->data)
1455 seq_printf(m, ":%p", rec->data); 1413 seq_printf(m, ":%p", rec->data);
@@ -1547,7 +1505,6 @@ static int t_show(struct seq_file *m, void *v)
1547{ 1505{
1548 struct ftrace_iterator *iter = m->private; 1506 struct ftrace_iterator *iter = m->private;
1549 struct dyn_ftrace *rec = v; 1507 struct dyn_ftrace *rec = v;
1550 char str[KSYM_SYMBOL_LEN];
1551 1508
1552 if (iter->flags & FTRACE_ITER_HASH) 1509 if (iter->flags & FTRACE_ITER_HASH)
1553 return t_hash_show(m, v); 1510 return t_hash_show(m, v);
@@ -1560,9 +1517,7 @@ static int t_show(struct seq_file *m, void *v)
1560 if (!rec) 1517 if (!rec)
1561 return 0; 1518 return 0;
1562 1519
1563 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1520 seq_printf(m, "%pf\n", (void *)rec->ip);
1564
1565 seq_printf(m, "%s\n", str);
1566 1521
1567 return 0; 1522 return 0;
1568} 1523}
@@ -1601,17 +1556,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1601 return ret; 1556 return ret;
1602} 1557}
1603 1558
1604int ftrace_avail_release(struct inode *inode, struct file *file)
1605{
1606 struct seq_file *m = (struct seq_file *)file->private_data;
1607 struct ftrace_iterator *iter = m->private;
1608
1609 seq_release(inode, file);
1610 kfree(iter);
1611
1612 return 0;
1613}
1614
1615static int 1559static int
1616ftrace_failures_open(struct inode *inode, struct file *file) 1560ftrace_failures_open(struct inode *inode, struct file *file)
1617{ 1561{
@@ -2317,7 +2261,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2317 } 2261 }
2318 2262
2319 if (isspace(ch)) { 2263 if (isspace(ch)) {
2320 iter->filtered++;
2321 iter->buffer[iter->buffer_idx] = 0; 2264 iter->buffer[iter->buffer_idx] = 0;
2322 ret = ftrace_process_regex(iter->buffer, 2265 ret = ftrace_process_regex(iter->buffer,
2323 iter->buffer_idx, enable); 2266 iter->buffer_idx, enable);
@@ -2448,7 +2391,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2448 iter = file->private_data; 2391 iter = file->private_data;
2449 2392
2450 if (iter->buffer_idx) { 2393 if (iter->buffer_idx) {
2451 iter->filtered++;
2452 iter->buffer[iter->buffer_idx] = 0; 2394 iter->buffer[iter->buffer_idx] = 0;
2453 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
2454 } 2396 }
@@ -2479,14 +2421,14 @@ static const struct file_operations ftrace_avail_fops = {
2479 .open = ftrace_avail_open, 2421 .open = ftrace_avail_open,
2480 .read = seq_read, 2422 .read = seq_read,
2481 .llseek = seq_lseek, 2423 .llseek = seq_lseek,
2482 .release = ftrace_avail_release, 2424 .release = seq_release_private,
2483}; 2425};
2484 2426
2485static const struct file_operations ftrace_failures_fops = { 2427static const struct file_operations ftrace_failures_fops = {
2486 .open = ftrace_failures_open, 2428 .open = ftrace_failures_open,
2487 .read = seq_read, 2429 .read = seq_read,
2488 .llseek = seq_lseek, 2430 .llseek = seq_lseek,
2489 .release = ftrace_avail_release, 2431 .release = seq_release_private,
2490}; 2432};
2491 2433
2492static const struct file_operations ftrace_filter_fops = { 2434static const struct file_operations ftrace_filter_fops = {
@@ -2548,7 +2490,6 @@ static void g_stop(struct seq_file *m, void *p)
2548static int g_show(struct seq_file *m, void *v) 2490static int g_show(struct seq_file *m, void *v)
2549{ 2491{
2550 unsigned long *ptr = v; 2492 unsigned long *ptr = v;
2551 char str[KSYM_SYMBOL_LEN];
2552 2493
2553 if (!ptr) 2494 if (!ptr)
2554 return 0; 2495 return 0;
@@ -2558,9 +2499,7 @@ static int g_show(struct seq_file *m, void *v)
2558 return 0; 2499 return 0;
2559 } 2500 }
2560 2501
2561 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2502 seq_printf(m, "%pf\n", v);
2562
2563 seq_printf(m, "%s\n", str);
2564 2503
2565 return 0; 2504 return 0;
2566} 2505}
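The ftrace.c printing hunks replace the kallsyms_lookup() plus "%s" pattern with the %pf vsprintf extension, which resolves a function address to its symbol name in one step. The idiom, as used above:

	/* before: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, "%s\n", str); */
	seq_printf(m, "%pf\n", (void *)rec->ip);	/* prints the symbol name, e.g. "schedule" */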
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 1edaa9516e81..81b1645c8549 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void)
183 183
184static int kmem_trace_init(struct trace_array *tr) 184static int kmem_trace_init(struct trace_array *tr)
185{ 185{
186 int cpu;
187 kmemtrace_array = tr; 186 kmemtrace_array = tr;
188 187
189 for_each_cpu(cpu, cpu_possible_mask) 188 tracing_reset_online_cpus(tr);
190 tracing_reset(tr, cpu);
191 189
192 kmemtrace_start_probes(); 190 kmemtrace_start_probes();
193 191
@@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc {
239}; 237};
240 238
241static enum print_line_t 239static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 241{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
248 286
249 ev = trace_seq_reserve(s, sizeof(*ev)); 287 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 288 if (!ev)
@@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 309}
272 310
273static enum print_line_t 311static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 313{
277 struct trace_seq *s = &iter->seq; 314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 316 struct kmemtrace_user_event *ev;
279 317
318 trace_assign_type(entry, iter->ent);
319
280 ev = trace_seq_reserve(s, sizeof(*ev)); 320 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 321 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 322 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 334
295/* The two other following provide a more minimalistic output */ 335/* The two other following provide a more minimalistic output */
296static enum print_line_t 336static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 338{
339 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 340 struct trace_seq *s = &iter->seq;
301 int ret; 341 int ret;
302 342
343 trace_assign_type(entry, iter->ent);
344
303 /* Alloc entry */ 345 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 346 ret = trace_seq_printf(s, " + ");
305 if (!ret) 347 if (!ret)
@@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 387 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 388 return TRACE_TYPE_PARTIAL_LINE;
347 389
348 /* Node */ 390 /* Node and call site */
349 ret = trace_seq_printf(s, "%4d ", entry->node); 391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 392 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 393 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 394 return TRACE_TYPE_PARTIAL_LINE;
357 395
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
362} 397}
363 398
364static enum print_line_t 399static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 400kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 401{
402 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 403 struct trace_seq *s = &iter->seq;
369 int ret; 404 int ret;
370 405
406 trace_assign_type(entry, iter->ent);
407
371 /* Free entry */ 408 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 409 ret = trace_seq_printf(s, " - ");
373 if (!ret) 410 if (!ret)
@@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 438 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 439 return TRACE_TYPE_PARTIAL_LINE;
403 440
404 /* Skip node */ 441 /* Skip node and print call site */
405 ret = trace_seq_printf(s, " "); 442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 443 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
408 445
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 446 return TRACE_TYPE_HANDLED;
418} 447}
419 448
@@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 450{
422 struct trace_entry *entry = iter->ent; 451 struct trace_entry *entry = iter->ent;
423 452
424 switch (entry->type) { 453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 454 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
445 default: 461 default:
446 return TRACE_TYPE_UNHANDLED; 462 return TRACE_TYPE_UNHANDLED;
447 } 463 }
448} 464}
449 465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
450static struct tracer kmem_tracer __read_mostly = { 478static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 479 .name = "kmemtrace",
452 .init = kmem_trace_init, 480 .init = kmem_trace_init,
@@ -463,6 +491,21 @@ void kmemtrace_init(void)
463 491
464static int __init init_kmem_tracer(void) 492static int __init init_kmem_tracer(void)
465{ 493{
466 return register_tracer(&kmem_tracer); 494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (!register_tracer(&kmem_tracer)) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
467} 510}
468device_initcall(init_kmem_tracer); 511device_initcall(init_kmem_tracer);
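kmemtrace now hands its text and binary output handlers to the event layer via register_ftrace_event() instead of switching on the entry type in print_line. The handler protocol is visible in the hunks above: keep appending with trace_seq_printf() and bail out with TRACE_TYPE_PARTIAL_LINE as soon as the seq buffer fills. A stripped-down sketch with an illustrative event:

	static enum print_line_t ex_trace(struct trace_iterator *iter, int flags)
	{
		struct trace_seq *s = &iter->seq;

		/* trace_seq_printf() returns 0 once the per-iterator buffer is full */
		if (!trace_seq_printf(s, "example event\n"))
			return TRACE_TYPE_PARTIAL_LINE;

		return TRACE_TYPE_HANDLED;
	}

	static struct trace_event ex_event = {
		.type	= 100,		/* illustrative type id */
		.trace	= ex_trace,
	};

	/* register_ftrace_event(&ex_event) returns 0 on failure, which is
	 * exactly what init_kmem_tracer() above checks for. */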
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a330513d96ce..454e74e718cf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -218,17 +218,12 @@ enum {
218 218
219static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
220{ 220{
221 return event->type_len == RINGBUF_TYPE_PADDING 221 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
222 && event->time_delta == 0;
223}
224
225static inline int rb_discarded_event(struct ring_buffer_event *event)
226{
227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
228} 222}
229 223
230static void rb_event_set_padding(struct ring_buffer_event *event) 224static void rb_event_set_padding(struct ring_buffer_event *event)
231{ 225{
226 /* padding has a NULL time_delta */
232 event->type_len = RINGBUF_TYPE_PADDING; 227 event->type_len = RINGBUF_TYPE_PADDING;
233 event->time_delta = 0; 228 event->time_delta = 0;
234} 229}
@@ -322,6 +317,14 @@ struct buffer_data_page {
322 unsigned char data[]; /* data of buffer page */ 317 unsigned char data[]; /* data of buffer page */
323}; 318};
324 319
320/*
321 * Note, the buffer_page list must be first. The buffer pages
322 * are allocated in cache lines, which means that each buffer
323 * page will be at the beginning of a cache line, and thus
324 * the least significant bits will be zero. We use this to
325 * add flags in the list struct pointers, to make the ring buffer
326 * lockless.
327 */
325struct buffer_page { 328struct buffer_page {
326 struct list_head list; /* list of buffer pages */ 329 struct list_head list; /* list of buffer pages */
327 local_t write; /* index for next write */ 330 local_t write; /* index for next write */
@@ -330,6 +333,21 @@ struct buffer_page {
330 struct buffer_data_page *page; /* Actual data page */ 333 struct buffer_data_page *page; /* Actual data page */
331}; 334};
332 335
336/*
337 * The buffer page counters, write and entries, must be reset
338 * atomically when crossing page boundaries. To synchronize this
339 * update, two counters are inserted into the number. One is
340 * the actual counter for the write position or count on the page.
341 *
342 * The other is a counter of updaters. Before an update happens
343 * the update partition of the counter is incremented. This will
344 * allow the updater to update the counter atomically.
345 *
346 * The counter is 20 bits, and the state data is 12.
347 */
348#define RB_WRITE_MASK 0xfffff
349#define RB_WRITE_INTCNT (1 << 20)
350
333static void rb_init_page(struct buffer_data_page *bpage) 351static void rb_init_page(struct buffer_data_page *bpage)
334{ 352{
335 local_set(&bpage->commit, 0); 353 local_set(&bpage->commit, 0);
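RB_WRITE_MASK splits each page counter into a 20-bit value plus an updater count in the bits above it, so nested writers can detect each other without a lock. A userspace-flavoured sketch of the encoding (illustrative names, a GCC builtin standing in for local_add_return()):

	#define EX_WRITE_MASK	0xfffffUL
	#define EX_WRITE_INTCNT	(1UL << 20)

	static unsigned long announce_update(unsigned long *write)
	{
		/* atomically bump the updater count and fetch the prior value */
		unsigned long old = __sync_fetch_and_add(write, EX_WRITE_INTCNT);

		/* (old >> 20) says how many updaters were already in flight */
		return old & EX_WRITE_MASK;	/* the write index seen on entry */
	}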
@@ -403,21 +421,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
403struct ring_buffer_per_cpu { 421struct ring_buffer_per_cpu {
404 int cpu; 422 int cpu;
405 struct ring_buffer *buffer; 423 struct ring_buffer *buffer;
406 spinlock_t reader_lock; /* serialize readers */ 424 spinlock_t reader_lock; /* serialize readers */
407 raw_spinlock_t lock; 425 raw_spinlock_t lock;
408 struct lock_class_key lock_key; 426 struct lock_class_key lock_key;
409 struct list_head pages; 427 struct list_head *pages;
410 struct buffer_page *head_page; /* read from head */ 428 struct buffer_page *head_page; /* read from head */
411 struct buffer_page *tail_page; /* write to tail */ 429 struct buffer_page *tail_page; /* write to tail */
412 struct buffer_page *commit_page; /* committed pages */ 430 struct buffer_page *commit_page; /* committed pages */
413 struct buffer_page *reader_page; 431 struct buffer_page *reader_page;
414 unsigned long nmi_dropped; 432 local_t commit_overrun;
415 unsigned long commit_overrun; 433 local_t overrun;
416 unsigned long overrun;
417 unsigned long read;
418 local_t entries; 434 local_t entries;
419 local_t committing; 435 local_t committing;
420 local_t commits; 436 local_t commits;
437 unsigned long read;
421 u64 write_stamp; 438 u64 write_stamp;
422 u64 read_stamp; 439 u64 read_stamp;
423 atomic_t record_disabled; 440 atomic_t record_disabled;
@@ -450,14 +467,19 @@ struct ring_buffer_iter {
450}; 467};
451 468
452/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 469/* buffer may be either ring_buffer or ring_buffer_per_cpu */
453#define RB_WARN_ON(buffer, cond) \ 470#define RB_WARN_ON(b, cond) \
454 ({ \ 471 ({ \
455 int _____ret = unlikely(cond); \ 472 int _____ret = unlikely(cond); \
456 if (_____ret) { \ 473 if (_____ret) { \
457 atomic_inc(&buffer->record_disabled); \ 474 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
458 WARN_ON(1); \ 475 struct ring_buffer_per_cpu *__b = \
459 } \ 476 (void *)b; \
460 _____ret; \ 477 atomic_inc(&__b->buffer->record_disabled); \
478 } else \
479 atomic_inc(&b->record_disabled); \
480 WARN_ON(1); \
481 } \
482 _____ret; \
461 }) 483 })
462 484
463/* Up this if you want to test the TIME_EXTENTS and normalization */ 485/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -489,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
489} 511}
490EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
491 513
514/*
515 * Making the ring buffer lockless makes things tricky.
516 * Although writes only happen on the CPU that they are on,
517 * and they only need to worry about interrupts. Reads can
518 * happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head
543 * page on another writer. Thus we use the second LSB to catch
544 * that too. Thus:
545 *
546 * head->list->prev->next bit 1 bit 0
547 * ------- -------
548 * Normal page 0 0
549 * Points to head page 0 1
550 * New head page 1 0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 * +----+ +-----+ +-----+
555 * | |------>| T |---X--->| N |
556 * | |<------| | | |
557 * +----+ +-----+ +-----+
558 * ^ ^ |
559 * | +-----+ | |
560 * +----------| R |----------+ |
561 * | |<-----------+
562 * +-----+
563 *
564 * Key: ---X--> HEAD flag set in pointer
565 * T Tail page
566 * R Reader page
567 * N Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
578 * You can see, it is legitimate for the previous pointer of
579 * the head (or any page) not to point back to itself. But only
580 * temporarially.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
593/*
594 * rb_list_head - remove any bit
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
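	/*
	 * Illustrative sketch (assumed helpers, not part of the kernel code):
	 * the tag trick works because buffer pages are cache-line aligned, so
	 * the two low bits of a ->next pointer are always zero and are free to
	 * carry RB_PAGE_HEAD or RB_PAGE_UPDATE, which rb_list_head() above
	 * strips off again.
	 */
	static unsigned long rb_list_flag(struct list_head *next)
	{
		return (unsigned long)next & RB_FLAG_MASK;	/* NORMAL, HEAD or UPDATE */
	}

	static struct list_head *rb_list_tag(struct list_head *next,
					     unsigned long flag)
	{
		return (struct list_head *)
			(((unsigned long)next & ~RB_FLAG_MASK) | flag);
	}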
602
603/*
604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
611static int inline
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
628 * The unique thing about the reader page is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = (unsigned long)cmpxchg(&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the header behind
768 * where we started, and we miss in one loop.
769 * A second loop should grab the header, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, &new->list);
798
799 return ret == val;
800}
801
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without messing
820 * with possible erasing of data brought in by interrupts
821 * that have moved the tail page and are currently on it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
846 * not come in and change it. In which case, we
847 * do not want to modify it.
848 *
849 * We add (void) to let the compiler know that we do not care
850 * about the return value of these functions. We use the
851 * cmpxchg to only update if an interrupt did not already
852 * do it for us. If the cmpxchg fails, we don't care.
853 */
854 (void)local_cmpxchg(&next_page->write, old_write, val);
855 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
856
857 /*
858 * No need to worry about races with clearing out the commit.
859 * it only can increment when a commit takes place. But that
860 * only happens in the outer most nested commit.
861 */
862 local_set(&next_page->page->commit, 0);
863
864 old_tail = cmpxchg(&cpu_buffer->tail_page,
865 tail_page, next_page);
866
867 if (old_tail == tail_page)
868 ret = 1;
869 }
870
871 return ret;
872}
873
874static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
875 struct buffer_page *bpage)
876{
877 unsigned long val = (unsigned long)bpage;
878
879 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
880 return 1;
881
882 return 0;
883}
884
885/**
886 * rb_check_list - make sure a pointer to a list has the last bits zero
887 */
888static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
889 struct list_head *list)
890{
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
892 return 1;
893 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
894 return 1;
895 return 0;
896}
897
492/** 898/**
493 * check_pages - integrity check of buffer pages 899 * check_pages - integrity check of buffer pages
494 * @cpu_buffer: CPU buffer with pages to test 900 * @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
498 */ 904 */
499static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 905static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
500{ 906{
501 struct list_head *head = &cpu_buffer->pages; 907 struct list_head *head = cpu_buffer->pages;
502 struct buffer_page *bpage, *tmp; 908 struct buffer_page *bpage, *tmp;
503 909
910 rb_head_page_deactivate(cpu_buffer);
911
504 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 912 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
505 return -1; 913 return -1;
506 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 914 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
507 return -1; 915 return -1;
508 916
917 if (rb_check_list(cpu_buffer, head))
918 return -1;
919
509 list_for_each_entry_safe(bpage, tmp, head, list) { 920 list_for_each_entry_safe(bpage, tmp, head, list) {
510 if (RB_WARN_ON(cpu_buffer, 921 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.next->prev != &bpage->list)) 922 bpage->list.next->prev != &bpage->list))
@@ -513,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
513 if (RB_WARN_ON(cpu_buffer, 924 if (RB_WARN_ON(cpu_buffer,
514 bpage->list.prev->next != &bpage->list)) 925 bpage->list.prev->next != &bpage->list))
515 return -1; 926 return -1;
927 if (rb_check_list(cpu_buffer, &bpage->list))
928 return -1;
516 } 929 }
517 930
931 rb_head_page_activate(cpu_buffer);
932
518 return 0; 933 return 0;
519} 934}
520 935
521static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 936static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
522 unsigned nr_pages) 937 unsigned nr_pages)
523{ 938{
524 struct list_head *head = &cpu_buffer->pages;
525 struct buffer_page *bpage, *tmp; 939 struct buffer_page *bpage, *tmp;
526 unsigned long addr; 940 unsigned long addr;
527 LIST_HEAD(pages); 941 LIST_HEAD(pages);
528 unsigned i; 942 unsigned i;
529 943
944 WARN_ON(!nr_pages);
945
530 for (i = 0; i < nr_pages; i++) { 946 for (i = 0; i < nr_pages; i++) {
531 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 947 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
532 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 948 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
533 if (!bpage) 949 if (!bpage)
534 goto free_pages; 950 goto free_pages;
951
952 rb_check_bpage(cpu_buffer, bpage);
953
535 list_add(&bpage->list, &pages); 954 list_add(&bpage->list, &pages);
536 955
537 addr = __get_free_page(GFP_KERNEL); 956 addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
541 rb_init_page(bpage->page); 960 rb_init_page(bpage->page);
542 } 961 }
543 962
544 list_splice(&pages, head); 963 /*
964 * The ring buffer page list is a circular list that does not
965 * start and end with a list head. All page list items point to
966 * other pages.
967 */
968 cpu_buffer->pages = pages.next;
969 list_del(&pages);
545 970
546 rb_check_pages(cpu_buffer); 971 rb_check_pages(cpu_buffer);
547 972
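The allocation hunk above makes the page list a headless ring: pages are gathered on a stack-local LIST_HEAD, then the anchor itself is unlinked so every node left in the circle is a real buffer_page. The same idiom in isolation, with illustrative names:

	LIST_HEAD(tmp);			/* temporary anchor on the stack */
	struct list_head *ring;

	/* ... list_add(&page->list, &tmp) for each allocated page ... */

	ring = tmp.next;		/* keep a pointer to one real node */
	list_del(&tmp);			/* unlink the anchor; the ring stays circular */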
@@ -573,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
573 spin_lock_init(&cpu_buffer->reader_lock); 998 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 999 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1000 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
576 INIT_LIST_HEAD(&cpu_buffer->pages);
577 1001
578 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1002 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
579 GFP_KERNEL, cpu_to_node(cpu)); 1003 GFP_KERNEL, cpu_to_node(cpu));
580 if (!bpage) 1004 if (!bpage)
581 goto fail_free_buffer; 1005 goto fail_free_buffer;
582 1006
1007 rb_check_bpage(cpu_buffer, bpage);
1008
583 cpu_buffer->reader_page = bpage; 1009 cpu_buffer->reader_page = bpage;
584 addr = __get_free_page(GFP_KERNEL); 1010 addr = __get_free_page(GFP_KERNEL);
585 if (!addr) 1011 if (!addr)
@@ -594,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
594 goto fail_free_reader; 1020 goto fail_free_reader;
595 1021
596 cpu_buffer->head_page 1022 cpu_buffer->head_page
597 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1023 = list_entry(cpu_buffer->pages, struct buffer_page, list);
598 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1024 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
599 1025
1026 rb_head_page_activate(cpu_buffer);
1027
600 return cpu_buffer; 1028 return cpu_buffer;
601 1029
602 fail_free_reader: 1030 fail_free_reader:
@@ -609,15 +1037,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
609 1037
610static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1038static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
611{ 1039{
612 struct list_head *head = &cpu_buffer->pages; 1040 struct list_head *head = cpu_buffer->pages;
613 struct buffer_page *bpage, *tmp; 1041 struct buffer_page *bpage, *tmp;
614 1042
615 free_buffer_page(cpu_buffer->reader_page); 1043 free_buffer_page(cpu_buffer->reader_page);
616 1044
617 list_for_each_entry_safe(bpage, tmp, head, list) { 1045 rb_head_page_deactivate(cpu_buffer);
618 list_del_init(&bpage->list); 1046
1047 if (head) {
1048 list_for_each_entry_safe(bpage, tmp, head, list) {
1049 list_del_init(&bpage->list);
1050 free_buffer_page(bpage);
1051 }
1052 bpage = list_entry(head, struct buffer_page, list);
619 free_buffer_page(bpage); 1053 free_buffer_page(bpage);
620 } 1054 }
1055
621 kfree(cpu_buffer); 1056 kfree(cpu_buffer);
622} 1057}
623 1058
@@ -760,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
760 atomic_inc(&cpu_buffer->record_disabled); 1195 atomic_inc(&cpu_buffer->record_disabled);
761 synchronize_sched(); 1196 synchronize_sched();
762 1197
1198 rb_head_page_deactivate(cpu_buffer);
1199
763 for (i = 0; i < nr_pages; i++) { 1200 for (i = 0; i < nr_pages; i++) {
764 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1201 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
765 return; 1202 return;
766 p = cpu_buffer->pages.next; 1203 p = cpu_buffer->pages->next;
767 bpage = list_entry(p, struct buffer_page, list); 1204 bpage = list_entry(p, struct buffer_page, list);
768 list_del_init(&bpage->list); 1205 list_del_init(&bpage->list);
769 free_buffer_page(bpage); 1206 free_buffer_page(bpage);
770 } 1207 }
771 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1208 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
772 return; 1209 return;
773 1210
774 rb_reset_cpu(cpu_buffer); 1211 rb_reset_cpu(cpu_buffer);
@@ -790,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
790 atomic_inc(&cpu_buffer->record_disabled); 1227 atomic_inc(&cpu_buffer->record_disabled);
791 synchronize_sched(); 1228 synchronize_sched();
792 1229
1230 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer);
1232
793 for (i = 0; i < nr_pages; i++) { 1233 for (i = 0; i < nr_pages; i++) {
794 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
795 return; 1235 return;
796 p = pages->next; 1236 p = pages->next;
797 bpage = list_entry(p, struct buffer_page, list); 1237 bpage = list_entry(p, struct buffer_page, list);
798 list_del_init(&bpage->list); 1238 list_del_init(&bpage->list);
799 list_add_tail(&bpage->list, &cpu_buffer->pages); 1239 list_add_tail(&bpage->list, cpu_buffer->pages);
800 } 1240 }
801 rb_reset_cpu(cpu_buffer); 1241 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
802 1243
803 rb_check_pages(cpu_buffer); 1244 rb_check_pages(cpu_buffer);
804 1245
@@ -949,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
949} 1390}
950 1391
951static inline struct ring_buffer_event * 1392static inline struct ring_buffer_event *
952rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
953{
954 return __rb_page_index(cpu_buffer->head_page,
955 cpu_buffer->head_page->read);
956}
957
958static inline struct ring_buffer_event *
959rb_iter_head_event(struct ring_buffer_iter *iter) 1393rb_iter_head_event(struct ring_buffer_iter *iter)
960{ 1394{
961 return __rb_page_index(iter->head_page, iter->head); 1395 return __rb_page_index(iter->head_page, iter->head);
962} 1396}
963 1397
964static inline unsigned rb_page_write(struct buffer_page *bpage) 1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
965{ 1399{
966 return local_read(&bpage->write); 1400 return local_read(&bpage->write) & RB_WRITE_MASK;
967} 1401}
968 1402
969static inline unsigned rb_page_commit(struct buffer_page *bpage) 1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -971,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
971 return local_read(&bpage->page->commit); 1405 return local_read(&bpage->page->commit);
972} 1406}
973 1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410 return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
974/* Size is determined by what has been commited */ 1413/* Size is determined by what has been commited */
975static inline unsigned rb_page_size(struct buffer_page *bpage) 1414static inline unsigned rb_page_size(struct buffer_page *bpage)
976{ 1415{
@@ -983,22 +1422,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
983 return rb_page_commit(cpu_buffer->commit_page); 1422 return rb_page_commit(cpu_buffer->commit_page);
984} 1423}
985 1424
986static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
987{
988 return rb_page_commit(cpu_buffer->head_page);
989}
990
991static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
992 struct buffer_page **bpage)
993{
994 struct list_head *p = (*bpage)->list.next;
995
996 if (p == &cpu_buffer->pages)
997 p = p->next;
998
999 *bpage = list_entry(p, struct buffer_page, list);
1000}
1001
1002static inline unsigned 1425static inline unsigned
1003rb_event_index(struct ring_buffer_event *event) 1426rb_event_index(struct ring_buffer_event *event)
1004{ 1427{
@@ -1024,6 +1447,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1024static void 1447static void
1025rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1026{ 1449{
1450 unsigned long max_count;
1451
1027 /* 1452 /*
1028 * We only race with interrupts and NMIs on this CPU. 1453 * We only race with interrupts and NMIs on this CPU.
1029 * If we own the commit event, then we can commit 1454 * If we own the commit event, then we can commit
@@ -1033,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1033 * assign the commit to the tail. 1458 * assign the commit to the tail.
1034 */ 1459 */
1035 again: 1460 again:
1461 max_count = cpu_buffer->buffer->pages * 100;
1462
1036 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1463 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1037 cpu_buffer->commit_page->page->commit = 1464 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1038 cpu_buffer->commit_page->write; 1465 return;
1466 if (RB_WARN_ON(cpu_buffer,
1467 rb_is_reader_page(cpu_buffer->tail_page)))
1468 return;
1469 local_set(&cpu_buffer->commit_page->page->commit,
1470 rb_page_write(cpu_buffer->commit_page));
1039 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1471 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1040 cpu_buffer->write_stamp = 1472 cpu_buffer->write_stamp =
1041 cpu_buffer->commit_page->page->time_stamp; 1473 cpu_buffer->commit_page->page->time_stamp;
@@ -1044,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1044 } 1476 }
1045 while (rb_commit_index(cpu_buffer) != 1477 while (rb_commit_index(cpu_buffer) !=
1046 rb_page_write(cpu_buffer->commit_page)) { 1478 rb_page_write(cpu_buffer->commit_page)) {
1047 cpu_buffer->commit_page->page->commit = 1479
1048 cpu_buffer->commit_page->write; 1480 local_set(&cpu_buffer->commit_page->page->commit,
1481 rb_page_write(cpu_buffer->commit_page));
1482 RB_WARN_ON(cpu_buffer,
1483 local_read(&cpu_buffer->commit_page->page->commit) &
1484 ~RB_WRITE_MASK);
1049 barrier(); 1485 barrier();
1050 } 1486 }
1051 1487
@@ -1078,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1078 * to the head page instead of next. 1514 * to the head page instead of next.
1079 */ 1515 */
1080 if (iter->head_page == cpu_buffer->reader_page) 1516 if (iter->head_page == cpu_buffer->reader_page)
1081 iter->head_page = cpu_buffer->head_page; 1517 iter->head_page = rb_set_head_page(cpu_buffer);
1082 else 1518 else
1083 rb_inc_page(cpu_buffer, &iter->head_page); 1519 rb_inc_page(cpu_buffer, &iter->head_page);
1084 1520
@@ -1122,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
1122 } 1558 }
1123} 1559}
1124 1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry page
1565 * 0 to continue
1566 * -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570 struct buffer_page *tail_page,
1571 struct buffer_page *next_page)
1572{
1573 struct buffer_page *new_head;
1574 int entries;
1575 int type;
1576 int ret;
1577
1578 entries = rb_page_entries(next_page);
1579
1580 /*
1581 * The hard part is here. We need to move the head
1582 * forward, and protect against both readers on
1583 * other CPUs and writers coming in via interrupts.
1584 */
1585 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586 RB_PAGE_HEAD);
1587
1588 /*
1589 * type can be one of four:
1590 * NORMAL - an interrupt already moved it for us
1591 * HEAD - we are the first to get here.
1592 * UPDATE - we are the interrupt interrupting
1593 * a current move.
1594 * MOVED - a reader on another CPU moved the next
1595 * pointer to its reader page. Give up
1596 * and try again.
1597 */
1598
1599 switch (type) {
1600 case RB_PAGE_HEAD:
1601 /*
1602 * We changed the head to UPDATE, thus
1603 * it is our responsibility to update
1604 * the counters.
1605 */
1606 local_add(entries, &cpu_buffer->overrun);
1607
1608 /*
1609 * The entries will be zeroed out when we move the
1610 * tail page.
1611 */
1612
1613 /* still more to do */
1614 break;
1615
1616 case RB_PAGE_UPDATE:
1617 /*
1618 * This is an interrupt that interrupted the
1619 * previous update. Still more to do.
1620 */
1621 break;
1622 case RB_PAGE_NORMAL:
1623 /*
1624 * An interrupt came in before the update
1625 * and processed this for us.
1626 * Nothing left to do.
1627 */
1628 return 1;
1629 case RB_PAGE_MOVED:
1630 /*
1631 * The reader is on another CPU and just did
1632 * a swap with our next_page.
1633 * Try again.
1634 */
1635 return 1;
1636 default:
1637 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638 return -1;
1639 }
1640
1641 /*
1642 * Now that we are here, the old head pointer is
1643 * set to UPDATE. This will keep the reader from
1644 * swapping the head page with the reader page.
1645 * The reader (on another CPU) will spin till
1646 * we are finished.
1647 *
1648 * We just need to protect against interrupts
1649 * doing the job. We will set the next pointer
1650 * to HEAD. After that, we set the old pointer
1651 * to NORMAL, but only if it was HEAD before.
1652 * otherwise we are an interrupt, and only
1653 * want the outer most commit to reset it.
1654 */
1655 new_head = next_page;
1656 rb_inc_page(cpu_buffer, &new_head);
1657
1658 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659 RB_PAGE_NORMAL);
1660
1661 /*
1662 * Valid returns are:
1663 * HEAD - an interrupt came in and already set it.
1664 * NORMAL - One of two things:
1665 * 1) We really set it.
1666 * 2) A bunch of interrupts came in and moved
1667 * the page forward again.
1668 */
1669 switch (ret) {
1670 case RB_PAGE_HEAD:
1671 case RB_PAGE_NORMAL:
1672 /* OK */
1673 break;
1674 default:
1675 RB_WARN_ON(cpu_buffer, 1);
1676 return -1;
1677 }
1678
1679 /*
1680 * It is possible that an interrupt came in,
1681 * set the head up, then more interrupts came in
1682 * and moved it again. When we get back here,
1683 * the page would have been set to NORMAL but we
1684 * just set it back to HEAD.
1685 *
1686 * How do you detect this? Well, if that happened
1687 * the tail page would have moved.
1688 */
1689 if (ret == RB_PAGE_NORMAL) {
1690 /*
 1691 * If the tail had moved past next, then we need
1692 * to reset the pointer.
1693 */
1694 if (cpu_buffer->tail_page != tail_page &&
1695 cpu_buffer->tail_page != next_page)
1696 rb_head_page_set_normal(cpu_buffer, new_head,
1697 next_page,
1698 RB_PAGE_HEAD);
1699 }
1700
1701 /*
1702 * If this was the outer most commit (the one that
1703 * changed the original pointer from HEAD to UPDATE),
1704 * then it is up to us to reset it to NORMAL.
1705 */
1706 if (type == RB_PAGE_HEAD) {
1707 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708 tail_page,
1709 RB_PAGE_UPDATE);
1710 if (RB_WARN_ON(cpu_buffer,
1711 ret != RB_PAGE_UPDATE))
1712 return -1;
1713 }
1714
1715 return 0;
1716}
1717
1125static unsigned rb_calculate_event_length(unsigned length) 1718static unsigned rb_calculate_event_length(unsigned length)
1126{ 1719{
1127 struct ring_buffer_event event; /* Used only for sizeof array */ 1720 struct ring_buffer_event event; /* Used only for sizeof array */
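
The rb_handle_head_page() state machine above hinges on flag bits carried in the low bits of the list pointer that designates the head page: the comments name the states HEAD, UPDATE, NORMAL and MOVED, and the transitions are done with compare-and-swap so readers and interrupting writers can race safely. Below is a minimal user-space sketch of that idea, using C11 atomics in place of the kernel's cmpxchg(); the type, field and constant names here are illustrative assumptions, not the kernel's definitions.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RB_PAGE_NORMAL 0UL   /* plain "next" pointer              */
#define RB_PAGE_HEAD   1UL   /* this link designates the head     */
#define RB_PAGE_UPDATE 2UL   /* a writer is busy moving the head  */

struct bpage {
        _Atomic(uintptr_t) next;   /* next page, low two bits = flag */
};

/* Flip the flag on the link that points at "head" from old to new.
 * Returns 1 on success, 0 if someone else changed it first.        */
static int set_flag(struct bpage *prev, struct bpage *head,
                    uintptr_t old_flag, uintptr_t new_flag)
{
        uintptr_t expect = (uintptr_t)head | old_flag;
        uintptr_t want   = (uintptr_t)head | new_flag;

        return atomic_compare_exchange_strong(&prev->next, &expect, want);
}

int main(void)
{
        struct bpage prev, head;

        atomic_init(&prev.next, (uintptr_t)&head | RB_PAGE_HEAD);
        atomic_init(&head.next, 0);

        /* Writer claims the head page: HEAD -> UPDATE.             */
        if (set_flag(&prev, &head, RB_PAGE_HEAD, RB_PAGE_UPDATE))
                printf("head claimed, readers now spin\n");

        /* A racing reader (or second writer) loses: flag is not HEAD. */
        if (!set_flag(&prev, &head, RB_PAGE_HEAD, RB_PAGE_NORMAL))
                printf("raced with the update, must retry\n");

        /* The outermost writer finishes: UPDATE -> NORMAL.          */
        set_flag(&prev, &head, RB_PAGE_UPDATE, RB_PAGE_NORMAL);
        return 0;
}
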
@@ -1185,9 +1778,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1185 event->type_len = RINGBUF_TYPE_PADDING; 1778 event->type_len = RINGBUF_TYPE_PADDING;
1186 /* time delta must be non zero */ 1779 /* time delta must be non zero */
1187 event->time_delta = 1; 1780 event->time_delta = 1;
1188 /* Account for this as an entry */
1189 local_inc(&tail_page->entries);
1190 local_inc(&cpu_buffer->entries);
1191 1781
1192 /* Set write to end of buffer */ 1782 /* Set write to end of buffer */
1193 length = (tail + length) - BUF_PAGE_SIZE; 1783 length = (tail + length) - BUF_PAGE_SIZE;
@@ -1200,96 +1790,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1200 struct buffer_page *commit_page, 1790 struct buffer_page *commit_page,
1201 struct buffer_page *tail_page, u64 *ts) 1791 struct buffer_page *tail_page, u64 *ts)
1202{ 1792{
1203 struct buffer_page *next_page, *head_page, *reader_page;
1204 struct ring_buffer *buffer = cpu_buffer->buffer; 1793 struct ring_buffer *buffer = cpu_buffer->buffer;
1205 bool lock_taken = false; 1794 struct buffer_page *next_page;
1206 unsigned long flags; 1795 int ret;
1207 1796
1208 next_page = tail_page; 1797 next_page = tail_page;
1209 1798
1210 local_irq_save(flags);
1211 /*
1212 * Since the write to the buffer is still not
1213 * fully lockless, we must be careful with NMIs.
1214 * The locks in the writers are taken when a write
1215 * crosses to a new page. The locks protect against
1216 * races with the readers (this will soon be fixed
1217 * with a lockless solution).
1218 *
1219 * Because we can not protect against NMIs, and we
1220 * want to keep traces reentrant, we need to manage
1221 * what happens when we are in an NMI.
1222 *
1223 * NMIs can happen after we take the lock.
1224 * If we are in an NMI, only take the lock
1225 * if it is not already taken. Otherwise
1226 * simply fail.
1227 */
1228 if (unlikely(in_nmi())) {
1229 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1230 cpu_buffer->nmi_dropped++;
1231 goto out_reset;
1232 }
1233 } else
1234 __raw_spin_lock(&cpu_buffer->lock);
1235
1236 lock_taken = true;
1237
1238 rb_inc_page(cpu_buffer, &next_page); 1799 rb_inc_page(cpu_buffer, &next_page);
1239 1800
1240 head_page = cpu_buffer->head_page;
1241 reader_page = cpu_buffer->reader_page;
1242
1243 /* we grabbed the lock before incrementing */
1244 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1245 goto out_reset;
1246
1247 /* 1801 /*
1248 * If for some reason, we had an interrupt storm that made 1802 * If for some reason, we had an interrupt storm that made
1249 * it all the way around the buffer, bail, and warn 1803 * it all the way around the buffer, bail, and warn
1250 * about it. 1804 * about it.
1251 */ 1805 */
1252 if (unlikely(next_page == commit_page)) { 1806 if (unlikely(next_page == commit_page)) {
1253 cpu_buffer->commit_overrun++; 1807 local_inc(&cpu_buffer->commit_overrun);
1254 goto out_reset; 1808 goto out_reset;
1255 } 1809 }
1256 1810
1257 if (next_page == head_page) { 1811 /*
1258 if (!(buffer->flags & RB_FL_OVERWRITE)) 1812 * This is where the fun begins!
1259 goto out_reset; 1813 *
1260 1814 * We are fighting against races between a reader that
1261 /* tail_page has not moved yet? */ 1815 * could be on another CPU trying to swap its reader
1262 if (tail_page == cpu_buffer->tail_page) { 1816 * page with the buffer head.
1263 /* count overflows */ 1817 *
1264 cpu_buffer->overrun += 1818 * We are also fighting against interrupts coming in and
1265 local_read(&head_page->entries); 1819 * moving the head or tail on us as well.
1820 *
1821 * If the next page is the head page then we have filled
1822 * the buffer, unless the commit page is still on the
1823 * reader page.
1824 */
1825 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1266 1826
1267 rb_inc_page(cpu_buffer, &head_page); 1827 /*
1268 cpu_buffer->head_page = head_page; 1828 * If the commit is not on the reader page, then
1269 cpu_buffer->head_page->read = 0; 1829 * move the header page.
1830 */
1831 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1832 /*
1833 * If we are not in overwrite mode,
1834 * this is easy, just stop here.
1835 */
1836 if (!(buffer->flags & RB_FL_OVERWRITE))
1837 goto out_reset;
1838
1839 ret = rb_handle_head_page(cpu_buffer,
1840 tail_page,
1841 next_page);
1842 if (ret < 0)
1843 goto out_reset;
1844 if (ret)
1845 goto out_again;
1846 } else {
1847 /*
1848 * We need to be careful here too. The
1849 * commit page could still be on the reader
1850 * page. We could have a small buffer, and
1851 * have filled up the buffer with events
1852 * from interrupts and such, and wrapped.
1853 *
 1854 * Note, if the tail page is also on the
1855 * reader_page, we let it move out.
1856 */
1857 if (unlikely((cpu_buffer->commit_page !=
1858 cpu_buffer->tail_page) &&
1859 (cpu_buffer->commit_page ==
1860 cpu_buffer->reader_page))) {
1861 local_inc(&cpu_buffer->commit_overrun);
1862 goto out_reset;
1863 }
1270 } 1864 }
1271 } 1865 }
1272 1866
1273 /* 1867 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1274 * If the tail page is still the same as what we think 1868 if (ret) {
1275 * it is, then it is up to us to update the tail 1869 /*
1276 * pointer. 1870 * Nested commits always have zero deltas, so
1277 */ 1871 * just reread the time stamp
1278 if (tail_page == cpu_buffer->tail_page) { 1872 */
1279 local_set(&next_page->write, 0);
1280 local_set(&next_page->entries, 0);
1281 local_set(&next_page->page->commit, 0);
1282 cpu_buffer->tail_page = next_page;
1283
1284 /* reread the time stamp */
1285 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1873 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1286 cpu_buffer->tail_page->page->time_stamp = *ts; 1874 next_page->page->time_stamp = *ts;
1287 } 1875 }
1288 1876
1289 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1877 out_again:
1290 1878
1291 __raw_spin_unlock(&cpu_buffer->lock); 1879 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1292 local_irq_restore(flags);
1293 1880
1294 /* fail and let the caller try again */ 1881 /* fail and let the caller try again */
1295 return ERR_PTR(-EAGAIN); 1882 return ERR_PTR(-EAGAIN);
@@ -1298,9 +1885,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1298 /* reset write */ 1885 /* reset write */
1299 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1886 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1300 1887
1301 if (likely(lock_taken))
1302 __raw_spin_unlock(&cpu_buffer->lock);
1303 local_irq_restore(flags);
1304 return NULL; 1888 return NULL;
1305} 1889}
1306 1890
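
The hunk above only shows the caller side of the new tail move: if rb_tail_page_update() reports that this writer performed the move, the fresh page gets a re-read time stamp, otherwise an interrupting writer already did the work and this commit keeps a zero delta; either way the function falls through to rb_reset_tail() and returns ERR_PTR(-EAGAIN) so the reserve path retries. A user-space sketch of that "only one writer moves the tail" pattern, assuming a compare-and-swap underneath and simplified structures rather than the kernel's:

#include <stdatomic.h>
#include <stdio.h>

struct page_model {
        unsigned long write;          /* bytes used on this page  */
        unsigned long long stamp;     /* page time stamp          */
};

struct cpu_buffer_model {
        _Atomic(struct page_model *) tail_page;
};

/* Returns 1 if we moved the tail (and must initialize the new page),
 * 0 if an interrupting writer beat us to it.                        */
static int tail_page_update(struct cpu_buffer_model *cb,
                            struct page_model *old_tail,
                            struct page_model *next)
{
        struct page_model *expect = old_tail;

        if (atomic_compare_exchange_strong(&cb->tail_page, &expect, next)) {
                next->write = 0;            /* fresh page starts empty  */
                next->stamp = 12345;        /* "reread the time stamp"  */
                return 1;
        }
        return 0;
}

int main(void)
{
        struct page_model p0 = { .write = 900 }, p1 = { .write = 777 };
        struct cpu_buffer_model cb;

        atomic_init(&cb.tail_page, &p0);

        if (tail_page_update(&cb, &p0, &p1))
                printf("we moved the tail and stamped the new page\n");

        /* A nested writer arriving with the stale tail just falls through. */
        if (!tail_page_update(&cb, &p0, &p1))
                printf("tail already moved, nested commit keeps delta 0\n");
        return 0;
}
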
@@ -1317,6 +1901,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1317 barrier(); 1901 barrier();
1318 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1319 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904
1905 /* set write to only the index of the write */
1906 write &= RB_WRITE_MASK;
1320 tail = write - length; 1907 tail = write - length;
1321 1908
 1322 /* See if we shot past the end of this buffer page */ 1909 /* See if we shot past the end of this buffer page */
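
The "write &= RB_WRITE_MASK" added above exists because the per-page write field no longer holds just a byte offset: the bits above the mask are reserved for a counter used to invalidate stale writers when a page is reused, so the raw value returned by local_add_return() has to be masked before it can be used as an index. A small sketch of the packing; the 20-bit split is an assumption for illustration, not necessarily the kernel's constant:

#include <stdio.h>

#define RB_WRITE_MASK   0xfffffUL              /* low 20 bits: byte index   */
#define RB_WRITE_INTCNT (RB_WRITE_MASK + 1)    /* upper bits: recycle count */

int main(void)
{
        unsigned long write = 0;

        write += 48;                       /* reserve 48 bytes on the page  */
        write += 32;                       /* and another event             */
        write += RB_WRITE_INTCNT;          /* page recycled: bump the count */
        write &= ~RB_WRITE_MASK;           /* ...and the index starts over  */
        write += 16;

        printf("index = %lu, recycle count = %lu\n",
               write & RB_WRITE_MASK, write >> 20);
        return 0;
}
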
@@ -1361,12 +1948,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1361 bpage = cpu_buffer->tail_page; 1948 bpage = cpu_buffer->tail_page;
1362 1949
1363 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1950 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1951 unsigned long write_mask =
1952 local_read(&bpage->write) & ~RB_WRITE_MASK;
1364 /* 1953 /*
1365 * This is on the tail page. It is possible that 1954 * This is on the tail page. It is possible that
1366 * a write could come in and move the tail page 1955 * a write could come in and move the tail page
1367 * and write to the next page. That is fine 1956 * and write to the next page. That is fine
1368 * because we just shorten what is on this page. 1957 * because we just shorten what is on this page.
1369 */ 1958 */
1959 old_index += write_mask;
1960 new_index += write_mask;
1370 index = local_cmpxchg(&bpage->write, old_index, new_index); 1961 index = local_cmpxchg(&bpage->write, old_index, new_index);
1371 if (index == old_index) 1962 if (index == old_index)
1372 return 1; 1963 return 1;
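
Folding the current counter bits back into old_index and new_index before the cmpxchg is what makes the discard safe without a lock: if the page is recycled between the check and the exchange, the counter in the stored word no longer matches, the compare fails, and the stale discard is abandoned even when the byte index happens to line up. A sketch of that failure case, reusing the assumed 20-bit split from the previous example:

#include <stdatomic.h>
#include <stdio.h>

#define RB_WRITE_MASK   0xfffffUL
#define RB_WRITE_INTCNT (RB_WRITE_MASK + 1)

int main(void)
{
        _Atomic unsigned long write;
        unsigned long old_index = 64;      /* end of the event we reserved   */
        unsigned long new_index = 32;      /* where write would roll back to */
        unsigned long mask, expect;

        atomic_init(&write, 64);

        /* The discard path samples the counter bits that were current...   */
        mask = atomic_load(&write) & ~RB_WRITE_MASK;
        old_index += mask;
        new_index += mask;

        /* ...but an interrupt recycles the page and refills it to the same
         * byte offset before our cmpxchg gets to run.                       */
        atomic_store(&write, RB_WRITE_INTCNT + 64);

        /* The index alone would match; the full-word compare does not.      */
        expect = old_index;
        if (!atomic_compare_exchange_strong(&write, &expect, new_index))
                printf("page was recycled, stale discard rejected\n");
        return 0;
}
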
@@ -1482,7 +2073,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1482} 2073}
1483 2074
1484static struct ring_buffer_event * 2075static struct ring_buffer_event *
1485rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2076rb_reserve_next_event(struct ring_buffer *buffer,
2077 struct ring_buffer_per_cpu *cpu_buffer,
1486 unsigned long length) 2078 unsigned long length)
1487{ 2079{
1488 struct ring_buffer_event *event; 2080 struct ring_buffer_event *event;
@@ -1492,6 +2084,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1492 2084
1493 rb_start_commit(cpu_buffer); 2085 rb_start_commit(cpu_buffer);
1494 2086
2087#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2088 /*
2089 * Due to the ability to swap a cpu buffer from a buffer
2090 * it is possible it was swapped before we committed.
2091 * (committing stops a swap). We check for it here and
2092 * if it happened, we have to fail the write.
2093 */
2094 barrier();
2095 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2096 local_dec(&cpu_buffer->committing);
2097 local_dec(&cpu_buffer->commits);
2098 return NULL;
2099 }
2100#endif
2101
1495 length = rb_calculate_event_length(length); 2102 length = rb_calculate_event_length(length);
1496 again: 2103 again:
1497 /* 2104 /*
@@ -1652,7 +2259,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1652 if (length > BUF_MAX_DATA_SIZE) 2259 if (length > BUF_MAX_DATA_SIZE)
1653 goto out; 2260 goto out;
1654 2261
1655 event = rb_reserve_next_event(cpu_buffer, length); 2262 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1656 if (!event) 2263 if (!event)
1657 goto out; 2264 goto out;
1658 2265
@@ -1675,18 +2282,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1675} 2282}
1676EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2283EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1677 2284
1678static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2285static void
2286rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1679 struct ring_buffer_event *event) 2287 struct ring_buffer_event *event)
1680{ 2288{
1681 local_inc(&cpu_buffer->entries);
1682
1683 /* 2289 /*
1684 * The event first in the commit queue updates the 2290 * The event first in the commit queue updates the
1685 * time stamp. 2291 * time stamp.
1686 */ 2292 */
1687 if (rb_event_is_commit(cpu_buffer, event)) 2293 if (rb_event_is_commit(cpu_buffer, event))
1688 cpu_buffer->write_stamp += event->time_delta; 2294 cpu_buffer->write_stamp += event->time_delta;
2295}
1689 2296
2297static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2298 struct ring_buffer_event *event)
2299{
2300 local_inc(&cpu_buffer->entries);
2301 rb_update_write_stamp(cpu_buffer, event);
1690 rb_end_commit(cpu_buffer); 2302 rb_end_commit(cpu_buffer);
1691} 2303}
1692 2304
@@ -1733,32 +2345,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1733 event->time_delta = 1; 2345 event->time_delta = 1;
1734} 2346}
1735 2347
1736/** 2348/*
1737 * ring_buffer_event_discard - discard any event in the ring buffer 2349 * Decrement the entries to the page that an event is on.
1738 * @event: the event to discard 2350 * The event does not even need to exist, only the pointer
1739 * 2351 * to the page it is on. This may only be called before the commit
1740 * Sometimes a event that is in the ring buffer needs to be ignored. 2352 * takes place.
1741 * This function lets the user discard an event in the ring buffer
1742 * and then that event will not be read later.
1743 *
1744 * Note, it is up to the user to be careful with this, and protect
1745 * against races. If the user discards an event that has been consumed
1746 * it is possible that it could corrupt the ring buffer.
1747 */ 2353 */
1748void ring_buffer_event_discard(struct ring_buffer_event *event) 2354static inline void
2355rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2356 struct ring_buffer_event *event)
1749{ 2357{
1750 rb_event_discard(event); 2358 unsigned long addr = (unsigned long)event;
2359 struct buffer_page *bpage = cpu_buffer->commit_page;
2360 struct buffer_page *start;
2361
2362 addr &= PAGE_MASK;
2363
2364 /* Do the likely case first */
2365 if (likely(bpage->page == (void *)addr)) {
2366 local_dec(&bpage->entries);
2367 return;
2368 }
2369
2370 /*
2371 * Because the commit page may be on the reader page we
2372 * start with the next page and check the end loop there.
2373 */
2374 rb_inc_page(cpu_buffer, &bpage);
2375 start = bpage;
2376 do {
2377 if (bpage->page == (void *)addr) {
2378 local_dec(&bpage->entries);
2379 return;
2380 }
2381 rb_inc_page(cpu_buffer, &bpage);
2382 } while (bpage != start);
2383
2384 /* commit not part of this buffer?? */
2385 RB_WARN_ON(cpu_buffer, 1);
1751} 2386}
1752EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1753 2387
1754/** 2388/**
1755 * ring_buffer_commit_discard - discard an event that has not been committed 2389 * ring_buffer_commit_discard - discard an event that has not been committed
1756 * @buffer: the ring buffer 2390 * @buffer: the ring buffer
1757 * @event: non committed event to discard 2391 * @event: non committed event to discard
1758 * 2392 *
1759 * This is similar to ring_buffer_event_discard but must only be 2393 * Sometimes an event that is in the ring buffer needs to be ignored.
1760 * performed on an event that has not been committed yet. The difference 2394 * This function lets the user discard an event in the ring buffer
1761 * is that this will also try to free the event from the ring buffer 2395 * and then that event will not be read later.
2396 *
 2397 * This function only works if it is called before the item has been
2398 * committed. It will try to free the event from the ring buffer
1762 * if another event has not been added behind it. 2399 * if another event has not been added behind it.
1763 * 2400 *
1764 * If another event has been added behind it, it will set the event 2401 * If another event has been added behind it, it will set the event
@@ -1786,14 +2423,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1786 */ 2423 */
1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 2424 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1788 2425
2426 rb_decrement_entry(cpu_buffer, event);
1789 if (rb_try_to_discard(cpu_buffer, event)) 2427 if (rb_try_to_discard(cpu_buffer, event))
1790 goto out; 2428 goto out;
1791 2429
1792 /* 2430 /*
1793 * The commit is still visible by the reader, so we 2431 * The commit is still visible by the reader, so we
1794 * must increment entries. 2432 * must still update the timestamp.
1795 */ 2433 */
1796 local_inc(&cpu_buffer->entries); 2434 rb_update_write_stamp(cpu_buffer, event);
1797 out: 2435 out:
1798 rb_end_commit(cpu_buffer); 2436 rb_end_commit(cpu_buffer);
1799 2437
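
Taken together with rb_decrement_entry(), the rule for ring_buffer_discard_commit() is: it may only run between the reserve and the commit, and in that window it either frees the slot or turns it into padding while keeping the entry count and write stamp consistent. A hedged usage sketch in kernel style, with an illustrative payload struct that is not a real trace entry:

#include <linux/ring_buffer.h>

struct sample_entry {            /* illustrative payload only              */
        int value;
};

static void write_or_drop(struct ring_buffer *buffer, int value, int keep)
{
        struct ring_buffer_event *event;
        struct sample_entry *entry;

        event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
        if (!event)
                return;                          /* buffer full or disabled */

        entry = ring_buffer_event_data(event);
        entry->value = value;

        if (keep)
                ring_buffer_unlock_commit(buffer, event);
        else
                /* Must happen before the commit: the slot becomes padding
                 * (or is reclaimed) and the bookkeeping above is undone.   */
                ring_buffer_discard_commit(buffer, event);
}
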
@@ -1854,7 +2492,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1854 if (length > BUF_MAX_DATA_SIZE) 2492 if (length > BUF_MAX_DATA_SIZE)
1855 goto out; 2493 goto out;
1856 2494
1857 event = rb_reserve_next_event(cpu_buffer, length); 2495 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1858 if (!event) 2496 if (!event)
1859 goto out; 2497 goto out;
1860 2498
@@ -1875,9 +2513,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1875static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2513static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1876{ 2514{
1877 struct buffer_page *reader = cpu_buffer->reader_page; 2515 struct buffer_page *reader = cpu_buffer->reader_page;
1878 struct buffer_page *head = cpu_buffer->head_page; 2516 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1879 struct buffer_page *commit = cpu_buffer->commit_page; 2517 struct buffer_page *commit = cpu_buffer->commit_page;
1880 2518
2519 /* In case of error, head will be NULL */
2520 if (unlikely(!head))
2521 return 1;
2522
1881 return reader->read == rb_page_commit(reader) && 2523 return reader->read == rb_page_commit(reader) &&
1882 (commit == reader || 2524 (commit == reader ||
1883 (commit == head && 2525 (commit == head &&
@@ -1968,7 +2610,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1968 return 0; 2610 return 0;
1969 2611
1970 cpu_buffer = buffer->buffers[cpu]; 2612 cpu_buffer = buffer->buffers[cpu];
1971 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2613 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1972 - cpu_buffer->read; 2614 - cpu_buffer->read;
1973 2615
1974 return ret; 2616 return ret;
@@ -1989,33 +2631,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1989 return 0; 2631 return 0;
1990 2632
1991 cpu_buffer = buffer->buffers[cpu]; 2633 cpu_buffer = buffer->buffers[cpu];
1992 ret = cpu_buffer->overrun; 2634 ret = local_read(&cpu_buffer->overrun);
1993 2635
1994 return ret; 2636 return ret;
1995} 2637}
1996EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1997 2639
1998/** 2640/**
1999 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2000 * @buffer: The ring buffer
2001 * @cpu: The per CPU buffer to get the number of overruns from
2002 */
2003unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2004{
2005 struct ring_buffer_per_cpu *cpu_buffer;
2006 unsigned long ret;
2007
2008 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2009 return 0;
2010
2011 cpu_buffer = buffer->buffers[cpu];
2012 ret = cpu_buffer->nmi_dropped;
2013
2014 return ret;
2015}
2016EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2017
2018/**
2019 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2641 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2020 * @buffer: The ring buffer 2642 * @buffer: The ring buffer
2021 * @cpu: The per CPU buffer to get the number of overruns from 2643 * @cpu: The per CPU buffer to get the number of overruns from
@@ -2030,7 +2652,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2030 return 0; 2652 return 0;
2031 2653
2032 cpu_buffer = buffer->buffers[cpu]; 2654 cpu_buffer = buffer->buffers[cpu];
2033 ret = cpu_buffer->commit_overrun; 2655 ret = local_read(&cpu_buffer->commit_overrun);
2034 2656
2035 return ret; 2657 return ret;
2036} 2658}
@@ -2053,7 +2675,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2053 for_each_buffer_cpu(buffer, cpu) { 2675 for_each_buffer_cpu(buffer, cpu) {
2054 cpu_buffer = buffer->buffers[cpu]; 2676 cpu_buffer = buffer->buffers[cpu];
2055 entries += (local_read(&cpu_buffer->entries) - 2677 entries += (local_read(&cpu_buffer->entries) -
2056 cpu_buffer->overrun) - cpu_buffer->read; 2678 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2057 } 2679 }
2058 2680
2059 return entries; 2681 return entries;
@@ -2076,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2076 /* if you care about this being correct, lock the buffer */ 2698 /* if you care about this being correct, lock the buffer */
2077 for_each_buffer_cpu(buffer, cpu) { 2699 for_each_buffer_cpu(buffer, cpu) {
2078 cpu_buffer = buffer->buffers[cpu]; 2700 cpu_buffer = buffer->buffers[cpu];
2079 overruns += cpu_buffer->overrun; 2701 overruns += local_read(&cpu_buffer->overrun);
2080 } 2702 }
2081 2703
2082 return overruns; 2704 return overruns;
@@ -2089,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2089 2711
2090 /* Iterator usage is expected to have record disabled */ 2712 /* Iterator usage is expected to have record disabled */
2091 if (list_empty(&cpu_buffer->reader_page->list)) { 2713 if (list_empty(&cpu_buffer->reader_page->list)) {
2092 iter->head_page = cpu_buffer->head_page; 2714 iter->head_page = rb_set_head_page(cpu_buffer);
2093 iter->head = cpu_buffer->head_page->read; 2715 if (unlikely(!iter->head_page))
2716 return;
2717 iter->head = iter->head_page->read;
2094 } else { 2718 } else {
2095 iter->head_page = cpu_buffer->reader_page; 2719 iter->head_page = cpu_buffer->reader_page;
2096 iter->head = cpu_buffer->reader_page->read; 2720 iter->head = cpu_buffer->reader_page->read;
@@ -2207,6 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2207 struct buffer_page *reader = NULL; 2831 struct buffer_page *reader = NULL;
2208 unsigned long flags; 2832 unsigned long flags;
2209 int nr_loops = 0; 2833 int nr_loops = 0;
2834 int ret;
2210 2835
2211 local_irq_save(flags); 2836 local_irq_save(flags);
2212 __raw_spin_lock(&cpu_buffer->lock); 2837 __raw_spin_lock(&cpu_buffer->lock);
@@ -2240,30 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2240 goto out; 2865 goto out;
2241 2866
2242 /* 2867 /*
2243 * Splice the empty reader page into the list around the head.
2244 * Reset the reader page to size zero. 2868 * Reset the reader page to size zero.
2245 */ 2869 */
2870 local_set(&cpu_buffer->reader_page->write, 0);
2871 local_set(&cpu_buffer->reader_page->entries, 0);
2872 local_set(&cpu_buffer->reader_page->page->commit, 0);
2246 2873
2247 reader = cpu_buffer->head_page; 2874 spin:
2875 /*
2876 * Splice the empty reader page into the list around the head.
2877 */
2878 reader = rb_set_head_page(cpu_buffer);
2248 cpu_buffer->reader_page->list.next = reader->list.next; 2879 cpu_buffer->reader_page->list.next = reader->list.next;
2249 cpu_buffer->reader_page->list.prev = reader->list.prev; 2880 cpu_buffer->reader_page->list.prev = reader->list.prev;
2250 2881
2251 local_set(&cpu_buffer->reader_page->write, 0); 2882 /*
2252 local_set(&cpu_buffer->reader_page->entries, 0); 2883 * cpu_buffer->pages just needs to point to the buffer, it
 2253 local_set(&cpu_buffer->reader_page->page->commit, 0); 2884 * has no specific buffer page to point to. Let's move it out
 2885 * of our way so we don't accidentally swap it.
2886 */
2887 cpu_buffer->pages = reader->list.prev;
2254 2888
2255 /* Make the reader page now replace the head */ 2889 /* The reader page will be pointing to the new head */
2256 reader->list.prev->next = &cpu_buffer->reader_page->list; 2890 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2257 reader->list.next->prev = &cpu_buffer->reader_page->list;
2258 2891
2259 /* 2892 /*
2260 * If the tail is on the reader, then we must set the head 2893 * Here's the tricky part.
2261 * to the inserted page, otherwise we set it one before. 2894 *
2895 * We need to move the pointer past the header page.
2896 * But we can only do that if a writer is not currently
2897 * moving it. The page before the header page has the
2898 * flag bit '1' set if it is pointing to the page we want.
 2899 * But if the writer is in the process of moving it
 2900 * then it will be '2' or already moved '0'.
2262 */ 2901 */
2263 cpu_buffer->head_page = cpu_buffer->reader_page;
2264 2902
2265 if (cpu_buffer->commit_page != reader) 2903 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2266 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2904
2905 /*
2906 * If we did not convert it, then we must try again.
2907 */
2908 if (!ret)
2909 goto spin;
2910
2911 /*
2912 * Yeah! We succeeded in replacing the page.
2913 *
2914 * Now make the new head point back to the reader page.
2915 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2267 2918
2268 /* Finally update the reader page to the new head */ 2919 /* Finally update the reader page to the new head */
2269 cpu_buffer->reader_page = reader; 2920 cpu_buffer->reader_page = reader;
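
The spin: label above retries the whole splice whenever rb_head_page_replace() fails, because that replace can only succeed while the link that designates the head still carries the plain HEAD flag; if a writer has it marked UPDATE, or has already moved it on, the reader must re-read the head page and try again. A compact user-space model of that retry, reusing the assumed flag encoding from the earlier sketch:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RB_PAGE_HEAD   1UL
#define RB_PAGE_UPDATE 2UL

struct bpage { int dummy; };

/* Succeeds only if "link" still points at old_head with the HEAD flag. */
static int head_page_replace(_Atomic(uintptr_t) *link,
                             struct bpage *old_head, struct bpage *new_head)
{
        uintptr_t expect = (uintptr_t)old_head | RB_PAGE_HEAD;
        uintptr_t want   = (uintptr_t)new_head | RB_PAGE_HEAD;

        return atomic_compare_exchange_strong(link, &expect, want);
}

int main(void)
{
        struct bpage head, reader;
        _Atomic(uintptr_t) link;

        /* A writer is mid-move: the flag reads UPDATE, so the reader spins. */
        atomic_init(&link, (uintptr_t)&head | RB_PAGE_UPDATE);

        while (!head_page_replace(&link, &head, &reader)) {
                /* In the kernel the writer eventually restores HEAD; the
                 * model just does it here so the loop terminates.           */
                atomic_store(&link, (uintptr_t)&head | RB_PAGE_HEAD);
        }
        printf("reader page spliced in as the new head\n");
        return 0;
}
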
@@ -2292,8 +2943,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2292 2943
2293 event = rb_reader_event(cpu_buffer); 2944 event = rb_reader_event(cpu_buffer);
2294 2945
2295 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2946 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2296 || rb_discarded_event(event))
2297 cpu_buffer->read++; 2947 cpu_buffer->read++;
2298 2948
2299 rb_update_read_stamp(cpu_buffer, event); 2949 rb_update_read_stamp(cpu_buffer, event);
@@ -2525,10 +3175,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2525 spin_unlock(&cpu_buffer->reader_lock); 3175 spin_unlock(&cpu_buffer->reader_lock);
2526 local_irq_restore(flags); 3176 local_irq_restore(flags);
2527 3177
2528 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3178 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2529 cpu_relax();
2530 goto again; 3179 goto again;
2531 }
2532 3180
2533 return event; 3181 return event;
2534} 3182}
@@ -2553,10 +3201,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2553 event = rb_iter_peek(iter, ts); 3201 event = rb_iter_peek(iter, ts);
2554 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3202 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2555 3203
2556 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3204 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2557 cpu_relax();
2558 goto again; 3205 goto again;
2559 }
2560 3206
2561 return event; 3207 return event;
2562} 3208}
@@ -2602,10 +3248,8 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2602 out: 3248 out:
2603 preempt_enable(); 3249 preempt_enable();
2604 3250
2605 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3251 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2606 cpu_relax();
2607 goto again; 3252 goto again;
2608 }
2609 3253
2610 return event; 3254 return event;
2611} 3255}
@@ -2685,21 +3329,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2685 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3329 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2686 unsigned long flags; 3330 unsigned long flags;
2687 3331
2688 again:
2689 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3332 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3333 again:
2690 event = rb_iter_peek(iter, ts); 3334 event = rb_iter_peek(iter, ts);
2691 if (!event) 3335 if (!event)
2692 goto out; 3336 goto out;
2693 3337
3338 if (event->type_len == RINGBUF_TYPE_PADDING)
3339 goto again;
3340
2694 rb_advance_iter(iter); 3341 rb_advance_iter(iter);
2695 out: 3342 out:
2696 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3343 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2697 3344
2698 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2699 cpu_relax();
2700 goto again;
2701 }
2702
2703 return event; 3345 return event;
2704} 3346}
2705EXPORT_SYMBOL_GPL(ring_buffer_read); 3347EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2717,8 +3359,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2717static void 3359static void
2718rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3360rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2719{ 3361{
3362 rb_head_page_deactivate(cpu_buffer);
3363
2720 cpu_buffer->head_page 3364 cpu_buffer->head_page
2721 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3365 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2722 local_set(&cpu_buffer->head_page->write, 0); 3366 local_set(&cpu_buffer->head_page->write, 0);
2723 local_set(&cpu_buffer->head_page->entries, 0); 3367 local_set(&cpu_buffer->head_page->entries, 0);
2724 local_set(&cpu_buffer->head_page->page->commit, 0); 3368 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2734,16 +3378,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2734 local_set(&cpu_buffer->reader_page->page->commit, 0); 3378 local_set(&cpu_buffer->reader_page->page->commit, 0);
2735 cpu_buffer->reader_page->read = 0; 3379 cpu_buffer->reader_page->read = 0;
2736 3380
2737 cpu_buffer->nmi_dropped = 0; 3381 local_set(&cpu_buffer->commit_overrun, 0);
2738 cpu_buffer->commit_overrun = 0; 3382 local_set(&cpu_buffer->overrun, 0);
2739 cpu_buffer->overrun = 0;
2740 cpu_buffer->read = 0;
2741 local_set(&cpu_buffer->entries, 0); 3383 local_set(&cpu_buffer->entries, 0);
2742 local_set(&cpu_buffer->committing, 0); 3384 local_set(&cpu_buffer->committing, 0);
2743 local_set(&cpu_buffer->commits, 0); 3385 local_set(&cpu_buffer->commits, 0);
3386 cpu_buffer->read = 0;
2744 3387
2745 cpu_buffer->write_stamp = 0; 3388 cpu_buffer->write_stamp = 0;
2746 cpu_buffer->read_stamp = 0; 3389 cpu_buffer->read_stamp = 0;
3390
3391 rb_head_page_activate(cpu_buffer);
2747} 3392}
2748 3393
2749/** 3394/**
@@ -2763,12 +3408,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2763 3408
2764 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2765 3410
3411 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3412 goto out;
3413
2766 __raw_spin_lock(&cpu_buffer->lock); 3414 __raw_spin_lock(&cpu_buffer->lock);
2767 3415
2768 rb_reset_cpu(cpu_buffer); 3416 rb_reset_cpu(cpu_buffer);
2769 3417
2770 __raw_spin_unlock(&cpu_buffer->lock); 3418 __raw_spin_unlock(&cpu_buffer->lock);
2771 3419
3420 out:
2772 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2773 3422
2774 atomic_dec(&cpu_buffer->record_disabled); 3423 atomic_dec(&cpu_buffer->record_disabled);
@@ -2851,6 +3500,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2851} 3500}
2852EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3501EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2853 3502
3503#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2854/** 3504/**
2855 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3505 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2856 * @buffer_a: One buffer to swap with 3506 * @buffer_a: One buffer to swap with
@@ -2905,20 +3555,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2905 atomic_inc(&cpu_buffer_a->record_disabled); 3555 atomic_inc(&cpu_buffer_a->record_disabled);
2906 atomic_inc(&cpu_buffer_b->record_disabled); 3556 atomic_inc(&cpu_buffer_b->record_disabled);
2907 3557
3558 ret = -EBUSY;
3559 if (local_read(&cpu_buffer_a->committing))
3560 goto out_dec;
3561 if (local_read(&cpu_buffer_b->committing))
3562 goto out_dec;
3563
2908 buffer_a->buffers[cpu] = cpu_buffer_b; 3564 buffer_a->buffers[cpu] = cpu_buffer_b;
2909 buffer_b->buffers[cpu] = cpu_buffer_a; 3565 buffer_b->buffers[cpu] = cpu_buffer_a;
2910 3566
2911 cpu_buffer_b->buffer = buffer_a; 3567 cpu_buffer_b->buffer = buffer_a;
2912 cpu_buffer_a->buffer = buffer_b; 3568 cpu_buffer_a->buffer = buffer_b;
2913 3569
3570 ret = 0;
3571
3572out_dec:
2914 atomic_dec(&cpu_buffer_a->record_disabled); 3573 atomic_dec(&cpu_buffer_a->record_disabled);
2915 atomic_dec(&cpu_buffer_b->record_disabled); 3574 atomic_dec(&cpu_buffer_b->record_disabled);
2916
2917 ret = 0;
2918out: 3575out:
2919 return ret; 3576 return ret;
2920} 3577}
2921EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3578EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3579#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
2922 3580
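
The -EBUSY path above is one half of a handshake with rb_reserve_next_event(): a swap refuses to run while any commit is in flight, and a writer that has announced a commit re-checks which ring buffer its per-CPU buffer belongs to and backs out if a swap won the race in between. A user-space model of the two sides, with assumed names and a plain counter standing in for the real committing/commits bookkeeping (the kernel closes the remaining races differently; this only illustrates the protocol):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

struct rb_model { int id; };

static _Atomic int committing;                  /* commits in flight           */
static _Atomic(struct rb_model *) owner;        /* buffer this CPU belongs to  */

static int swap_cpu(struct rb_model *to)
{
        if (atomic_load(&committing))
                return -EBUSY;                  /* the new failure mode        */
        atomic_store(&owner, to);
        return 0;
}

static int reserve(struct rb_model *expected)
{
        atomic_fetch_add(&committing, 1);
        /* Re-check after announcing the commit, as the #ifdef block in
         * rb_reserve_next_event() now does.                                   */
        if (atomic_load(&owner) != expected) {
                atomic_fetch_sub(&committing, 1);
                return -1;                      /* swapped away: drop the write */
        }
        /* ... write and commit the event ...                                  */
        atomic_fetch_sub(&committing, 1);
        return 0;
}

int main(void)
{
        struct rb_model a = { 1 }, b = { 2 };

        atomic_init(&owner, &a);
        printf("swap while idle     : %d\n", swap_cpu(&b));   /* 0  */
        printf("reserve on old owner: %d\n", reserve(&a));    /* -1 */
        printf("reserve on new owner: %d\n", reserve(&b));    /* 0  */
        return 0;
}
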
2923/** 3581/**
2924 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3582 * ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -3091,7 +3749,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3091 read = 0; 3749 read = 0;
3092 } else { 3750 } else {
3093 /* update the entry counter */ 3751 /* update the entry counter */
3094 cpu_buffer->read += local_read(&reader->entries); 3752 cpu_buffer->read += rb_page_entries(reader);
3095 3753
3096 /* swap the pages */ 3754 /* swap the pages */
3097 rb_init_page(bpage); 3755 rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8c358395d338..5c75deeefe30 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,14 +43,11 @@
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45 45
46unsigned long __read_mostly tracing_max_latency;
47unsigned long __read_mostly tracing_thresh;
48
49/* 46/*
50 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
51 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
52 */ 49 */
53static int ring_buffer_expanded; 50int ring_buffer_expanded;
54 51
55/* 52/*
56 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -64,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
64/* 61/*
65 * If a tracer is running, we do not want to run SELFTEST. 62 * If a tracer is running, we do not want to run SELFTEST.
66 */ 63 */
67static bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
68 65
69/* For tracers that don't implement custom flags */ 66/* For tracers that don't implement custom flags */
70static struct tracer_opt dummy_tracer_opt[] = { 67static struct tracer_opt dummy_tracer_opt[] = {
@@ -89,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
89 */ 86 */
90static int tracing_disabled = 1; 87static int tracing_disabled = 1;
91 88
92static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
93 90
94static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
95{ 92{
@@ -172,10 +169,11 @@ static struct trace_array global_trace;
172 169
173static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 170static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
174 171
175int filter_current_check_discard(struct ftrace_event_call *call, void *rec, 172int filter_current_check_discard(struct ring_buffer *buffer,
173 struct ftrace_event_call *call, void *rec,
176 struct ring_buffer_event *event) 174 struct ring_buffer_event *event)
177{ 175{
178 return filter_check_discard(call, rec, global_trace.buffer, event); 176 return filter_check_discard(call, rec, buffer, event);
179} 177}
180EXPORT_SYMBOL_GPL(filter_current_check_discard); 178EXPORT_SYMBOL_GPL(filter_current_check_discard);
181 179
@@ -266,6 +264,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
266 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 264 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
267 TRACE_ITER_GRAPH_TIME; 265 TRACE_ITER_GRAPH_TIME;
268 266
267static int trace_stop_count;
268static DEFINE_SPINLOCK(tracing_start_lock);
269
269/** 270/**
270 * trace_wake_up - wake up tasks waiting for trace input 271 * trace_wake_up - wake up tasks waiting for trace input
271 * 272 *
@@ -323,50 +324,20 @@ static const char *trace_options[] = {
323 "printk-msg-only", 324 "printk-msg-only",
324 "context-info", 325 "context-info",
325 "latency-format", 326 "latency-format",
326 "global-clock",
327 "sleep-time", 327 "sleep-time",
328 "graph-time", 328 "graph-time",
329 NULL 329 NULL
330}; 330};
331 331
332/* 332static struct {
333 * ftrace_max_lock is used to protect the swapping of buffers 333 u64 (*func)(void);
334 * when taking a max snapshot. The buffers themselves are 334 const char *name;
335 * protected by per_cpu spinlocks. But the action of the swap 335} trace_clocks[] = {
336 * needs its own lock. 336 { trace_clock_local, "local" },
337 * 337 { trace_clock_global, "global" },
338 * This is defined as a raw_spinlock_t in order to help 338};
339 * with performance when lockdep debugging is enabled.
340 */
341static raw_spinlock_t ftrace_max_lock =
342 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
343
344/*
345 * Copy the new maximum trace into the separate maximum-trace
346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */
349static void
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
351{
352 struct trace_array_cpu *data = tr->data[cpu];
353
354 max_tr.cpu = cpu;
355 max_tr.time_start = data->preempt_timestamp;
356 339
357 data = max_tr.data[cpu]; 340int trace_clock_id;
358 data->saved_latency = tracing_max_latency;
359
360 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
361 data->pid = tsk->pid;
362 data->uid = task_uid(tsk);
363 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
364 data->policy = tsk->policy;
365 data->rt_priority = tsk->rt_priority;
366
367 /* record this tasks comm */
368 tracing_record_cmdline(tsk);
369}
370 341
371ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
372{ 343{
@@ -411,6 +382,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
411 return cnt; 382 return cnt;
412} 383}
413 384
385/*
386 * ftrace_max_lock is used to protect the swapping of buffers
387 * when taking a max snapshot. The buffers themselves are
388 * protected by per_cpu spinlocks. But the action of the swap
389 * needs its own lock.
390 *
391 * This is defined as a raw_spinlock_t in order to help
392 * with performance when lockdep debugging is enabled.
393 *
394 * It is also used in other places outside the update_max_tr
395 * so it needs to be defined outside of the
396 * CONFIG_TRACER_MAX_TRACE.
397 */
398static raw_spinlock_t ftrace_max_lock =
399 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
400
401#ifdef CONFIG_TRACER_MAX_TRACE
402unsigned long __read_mostly tracing_max_latency;
403unsigned long __read_mostly tracing_thresh;
404
405/*
406 * Copy the new maximum trace into the separate maximum-trace
407 * structure. (this way the maximum trace is permanently saved,
408 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
409 */
410static void
411__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
412{
413 struct trace_array_cpu *data = tr->data[cpu];
414 struct trace_array_cpu *max_data = tr->data[cpu];
415
416 max_tr.cpu = cpu;
417 max_tr.time_start = data->preempt_timestamp;
418
419 max_data = max_tr.data[cpu];
420 max_data->saved_latency = tracing_max_latency;
421 max_data->critical_start = data->critical_start;
422 max_data->critical_end = data->critical_end;
423
424 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
425 max_data->pid = tsk->pid;
426 max_data->uid = task_uid(tsk);
427 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
428 max_data->policy = tsk->policy;
429 max_data->rt_priority = tsk->rt_priority;
430
431 /* record this tasks comm */
432 tracing_record_cmdline(tsk);
433}
434
414/** 435/**
415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 436 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
416 * @tr: tracer 437 * @tr: tracer
@@ -425,16 +446,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
425{ 446{
426 struct ring_buffer *buf = tr->buffer; 447 struct ring_buffer *buf = tr->buffer;
427 448
449 if (trace_stop_count)
450 return;
451
428 WARN_ON_ONCE(!irqs_disabled()); 452 WARN_ON_ONCE(!irqs_disabled());
429 __raw_spin_lock(&ftrace_max_lock); 453 __raw_spin_lock(&ftrace_max_lock);
430 454
431 tr->buffer = max_tr.buffer; 455 tr->buffer = max_tr.buffer;
432 max_tr.buffer = buf; 456 max_tr.buffer = buf;
433 457
434 ftrace_disable_cpu();
435 ring_buffer_reset(tr->buffer);
436 ftrace_enable_cpu();
437
438 __update_max_tr(tr, tsk, cpu); 458 __update_max_tr(tr, tsk, cpu);
439 __raw_spin_unlock(&ftrace_max_lock); 459 __raw_spin_unlock(&ftrace_max_lock);
440} 460}
@@ -452,21 +472,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
452{ 472{
453 int ret; 473 int ret;
454 474
475 if (trace_stop_count)
476 return;
477
455 WARN_ON_ONCE(!irqs_disabled()); 478 WARN_ON_ONCE(!irqs_disabled());
456 __raw_spin_lock(&ftrace_max_lock); 479 __raw_spin_lock(&ftrace_max_lock);
457 480
458 ftrace_disable_cpu(); 481 ftrace_disable_cpu();
459 482
460 ring_buffer_reset(max_tr.buffer);
461 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 483 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
462 484
485 if (ret == -EBUSY) {
486 /*
487 * We failed to swap the buffer due to a commit taking
488 * place on this CPU. We fail to record, but we reset
489 * the max trace buffer (no one writes directly to it)
490 * and flag that it failed.
491 */
492 trace_array_printk(&max_tr, _THIS_IP_,
493 "Failed to swap buffers due to commit in progress\n");
494 }
495
463 ftrace_enable_cpu(); 496 ftrace_enable_cpu();
464 497
465 WARN_ON_ONCE(ret && ret != -EAGAIN); 498 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
466 499
467 __update_max_tr(tr, tsk, cpu); 500 __update_max_tr(tr, tsk, cpu);
468 __raw_spin_unlock(&ftrace_max_lock); 501 __raw_spin_unlock(&ftrace_max_lock);
469} 502}
503#endif /* CONFIG_TRACER_MAX_TRACE */
470 504
471/** 505/**
472 * register_tracer - register a tracer with the ftrace system. 506 * register_tracer - register a tracer with the ftrace system.
@@ -523,7 +557,6 @@ __acquires(kernel_lock)
523 if (type->selftest && !tracing_selftest_disabled) { 557 if (type->selftest && !tracing_selftest_disabled) {
524 struct tracer *saved_tracer = current_trace; 558 struct tracer *saved_tracer = current_trace;
525 struct trace_array *tr = &global_trace; 559 struct trace_array *tr = &global_trace;
526 int i;
527 560
528 /* 561 /*
529 * Run a selftest on this tracer. 562 * Run a selftest on this tracer.
@@ -532,8 +565,7 @@ __acquires(kernel_lock)
532 * internal tracing to verify that everything is in order. 565 * internal tracing to verify that everything is in order.
533 * If we fail, we do not register this tracer. 566 * If we fail, we do not register this tracer.
534 */ 567 */
535 for_each_tracing_cpu(i) 568 tracing_reset_online_cpus(tr);
536 tracing_reset(tr, i);
537 569
538 current_trace = type; 570 current_trace = type;
539 /* the test is responsible for initializing and enabling */ 571 /* the test is responsible for initializing and enabling */
@@ -546,8 +578,7 @@ __acquires(kernel_lock)
546 goto out; 578 goto out;
547 } 579 }
548 /* Only reset on passing, to avoid touching corrupted buffers */ 580 /* Only reset on passing, to avoid touching corrupted buffers */
549 for_each_tracing_cpu(i) 581 tracing_reset_online_cpus(tr);
550 tracing_reset(tr, i);
551 582
552 printk(KERN_CONT "PASSED\n"); 583 printk(KERN_CONT "PASSED\n");
553 } 584 }
@@ -622,21 +653,42 @@ void unregister_tracer(struct tracer *type)
622 mutex_unlock(&trace_types_lock); 653 mutex_unlock(&trace_types_lock);
623} 654}
624 655
625void tracing_reset(struct trace_array *tr, int cpu) 656static void __tracing_reset(struct trace_array *tr, int cpu)
626{ 657{
627 ftrace_disable_cpu(); 658 ftrace_disable_cpu();
628 ring_buffer_reset_cpu(tr->buffer, cpu); 659 ring_buffer_reset_cpu(tr->buffer, cpu);
629 ftrace_enable_cpu(); 660 ftrace_enable_cpu();
630} 661}
631 662
663void tracing_reset(struct trace_array *tr, int cpu)
664{
665 struct ring_buffer *buffer = tr->buffer;
666
667 ring_buffer_record_disable(buffer);
668
669 /* Make sure all commits have finished */
670 synchronize_sched();
671 __tracing_reset(tr, cpu);
672
673 ring_buffer_record_enable(buffer);
674}
675
632void tracing_reset_online_cpus(struct trace_array *tr) 676void tracing_reset_online_cpus(struct trace_array *tr)
633{ 677{
678 struct ring_buffer *buffer = tr->buffer;
634 int cpu; 679 int cpu;
635 680
681 ring_buffer_record_disable(buffer);
682
683 /* Make sure all commits have finished */
684 synchronize_sched();
685
636 tr->time_start = ftrace_now(tr->cpu); 686 tr->time_start = ftrace_now(tr->cpu);
637 687
638 for_each_online_cpu(cpu) 688 for_each_online_cpu(cpu)
639 tracing_reset(tr, cpu); 689 __tracing_reset(tr, cpu);
690
691 ring_buffer_record_enable(buffer);
640} 692}
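
The new tracing_reset() and tracing_reset_online_cpus() follow the same discipline: stop new writes with ring_buffer_record_disable(), wait for every commit already in flight with synchronize_sched(), only then reset, and finally re-enable recording. A user-space model of that disable-drain-reset-enable ordering; the busy-wait stands in for synchronize_sched() and is purely illustrative:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int record_disabled;
static _Atomic int writers_in_flight;
static int buffer_len;                  /* stands in for the ring buffer    */

static void writer(int bytes)
{
        if (atomic_load(&record_disabled))
                return;                 /* new writes are refused           */
        atomic_fetch_add(&writers_in_flight, 1);
        buffer_len += bytes;            /* "commit" the event               */
        atomic_fetch_sub(&writers_in_flight, 1);
}

static void reset_buffer(void)
{
        atomic_store(&record_disabled, 1);       /* record_disable()        */
        while (atomic_load(&writers_in_flight))  /* ~ synchronize_sched()   */
                ;
        buffer_len = 0;                 /* safe: nobody is mid-commit now   */
        atomic_store(&record_disabled, 0);       /* record_enable()         */
}

int main(void)
{
        writer(5);
        writer(7);
        reset_buffer();
        printf("length after reset = %d\n", buffer_len);
        return 0;
}
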
641 693
642void tracing_reset_current(int cpu) 694void tracing_reset_current(int cpu)
@@ -667,9 +719,6 @@ static void trace_init_cmdlines(void)
667 cmdline_idx = 0; 719 cmdline_idx = 0;
668} 720}
669 721
670static int trace_stop_count;
671static DEFINE_SPINLOCK(tracing_start_lock);
672
673/** 722/**
674 * ftrace_off_permanent - disable all ftrace code permanently 723 * ftrace_off_permanent - disable all ftrace code permanently
675 * 724 *
@@ -850,14 +899,15 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
850} 899}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 900EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
852 901
853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 902struct ring_buffer_event *
854 int type, 903trace_buffer_lock_reserve(struct ring_buffer *buffer,
855 unsigned long len, 904 int type,
856 unsigned long flags, int pc) 905 unsigned long len,
906 unsigned long flags, int pc)
857{ 907{
858 struct ring_buffer_event *event; 908 struct ring_buffer_event *event;
859 909
860 event = ring_buffer_lock_reserve(tr->buffer, len); 910 event = ring_buffer_lock_reserve(buffer, len);
861 if (event != NULL) { 911 if (event != NULL) {
862 struct trace_entry *ent = ring_buffer_event_data(event); 912 struct trace_entry *ent = ring_buffer_event_data(event);
863 913
@@ -867,58 +917,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
867 917
868 return event; 918 return event;
869} 919}
870static void ftrace_trace_stack(struct trace_array *tr,
871 unsigned long flags, int skip, int pc);
872static void ftrace_trace_userstack(struct trace_array *tr,
873 unsigned long flags, int pc);
874 920
875static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 921static inline void
876 struct ring_buffer_event *event, 922__trace_buffer_unlock_commit(struct ring_buffer *buffer,
877 unsigned long flags, int pc, 923 struct ring_buffer_event *event,
878 int wake) 924 unsigned long flags, int pc,
925 int wake)
879{ 926{
880 ring_buffer_unlock_commit(tr->buffer, event); 927 ring_buffer_unlock_commit(buffer, event);
881 928
882 ftrace_trace_stack(tr, flags, 6, pc); 929 ftrace_trace_stack(buffer, flags, 6, pc);
883 ftrace_trace_userstack(tr, flags, pc); 930 ftrace_trace_userstack(buffer, flags, pc);
884 931
885 if (wake) 932 if (wake)
886 trace_wake_up(); 933 trace_wake_up();
887} 934}
888 935
889void trace_buffer_unlock_commit(struct trace_array *tr, 936void trace_buffer_unlock_commit(struct ring_buffer *buffer,
890 struct ring_buffer_event *event, 937 struct ring_buffer_event *event,
891 unsigned long flags, int pc) 938 unsigned long flags, int pc)
892{ 939{
893 __trace_buffer_unlock_commit(tr, event, flags, pc, 1); 940 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
894} 941}
895 942
896struct ring_buffer_event * 943struct ring_buffer_event *
897trace_current_buffer_lock_reserve(int type, unsigned long len, 944trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
945 int type, unsigned long len,
898 unsigned long flags, int pc) 946 unsigned long flags, int pc)
899{ 947{
900 return trace_buffer_lock_reserve(&global_trace, 948 *current_rb = global_trace.buffer;
949 return trace_buffer_lock_reserve(*current_rb,
901 type, len, flags, pc); 950 type, len, flags, pc);
902} 951}
903EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); 952EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
904 953
905void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 954void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
955 struct ring_buffer_event *event,
906 unsigned long flags, int pc) 956 unsigned long flags, int pc)
907{ 957{
908 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 958 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
909} 959}
910EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 960EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
911 961
912void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 962void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
913 unsigned long flags, int pc) 963 struct ring_buffer_event *event,
964 unsigned long flags, int pc)
914{ 965{
915 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 966 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
916} 967}
917EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 968EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
918 969
919void trace_current_buffer_discard_commit(struct ring_buffer_event *event) 970void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
971 struct ring_buffer_event *event)
920{ 972{
921 ring_buffer_discard_commit(global_trace.buffer, event); 973 ring_buffer_discard_commit(buffer, event);
922} 974}
923EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); 975EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
924 976
@@ -928,6 +980,7 @@ trace_function(struct trace_array *tr,
928 int pc) 980 int pc)
929{ 981{
930 struct ftrace_event_call *call = &event_function; 982 struct ftrace_event_call *call = &event_function;
983 struct ring_buffer *buffer = tr->buffer;
931 struct ring_buffer_event *event; 984 struct ring_buffer_event *event;
932 struct ftrace_entry *entry; 985 struct ftrace_entry *entry;
933 986
@@ -935,7 +988,7 @@ trace_function(struct trace_array *tr,
935 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 988 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
936 return; 989 return;
937 990
938 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), 991 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
939 flags, pc); 992 flags, pc);
940 if (!event) 993 if (!event)
941 return; 994 return;
@@ -943,58 +996,10 @@ trace_function(struct trace_array *tr,
943 entry->ip = ip; 996 entry->ip = ip;
944 entry->parent_ip = parent_ip; 997 entry->parent_ip = parent_ip;
945 998
946 if (!filter_check_discard(call, entry, tr->buffer, event)) 999 if (!filter_check_discard(call, entry, buffer, event))
947 ring_buffer_unlock_commit(tr->buffer, event); 1000 ring_buffer_unlock_commit(buffer, event);
948}
949
950#ifdef CONFIG_FUNCTION_GRAPH_TRACER
951static int __trace_graph_entry(struct trace_array *tr,
952 struct ftrace_graph_ent *trace,
953 unsigned long flags,
954 int pc)
955{
956 struct ftrace_event_call *call = &event_funcgraph_entry;
957 struct ring_buffer_event *event;
958 struct ftrace_graph_ent_entry *entry;
959
960 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
961 return 0;
962
963 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
964 sizeof(*entry), flags, pc);
965 if (!event)
966 return 0;
967 entry = ring_buffer_event_data(event);
968 entry->graph_ent = *trace;
969 if (!filter_current_check_discard(call, entry, event))
970 ring_buffer_unlock_commit(global_trace.buffer, event);
971
972 return 1;
973} 1001}
974 1002
975static void __trace_graph_return(struct trace_array *tr,
976 struct ftrace_graph_ret *trace,
977 unsigned long flags,
978 int pc)
979{
980 struct ftrace_event_call *call = &event_funcgraph_exit;
981 struct ring_buffer_event *event;
982 struct ftrace_graph_ret_entry *entry;
983
984 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
985 return;
986
987 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
988 sizeof(*entry), flags, pc);
989 if (!event)
990 return;
991 entry = ring_buffer_event_data(event);
992 entry->ret = *trace;
993 if (!filter_current_check_discard(call, entry, event))
994 ring_buffer_unlock_commit(global_trace.buffer, event);
995}
996#endif
997
998void 1003void
999ftrace(struct trace_array *tr, struct trace_array_cpu *data, 1004ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1000 unsigned long ip, unsigned long parent_ip, unsigned long flags, 1005 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1004,17 +1009,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1004 trace_function(tr, ip, parent_ip, flags, pc); 1009 trace_function(tr, ip, parent_ip, flags, pc);
1005} 1010}
1006 1011
1007static void __ftrace_trace_stack(struct trace_array *tr, 1012#ifdef CONFIG_STACKTRACE
1013static void __ftrace_trace_stack(struct ring_buffer *buffer,
1008 unsigned long flags, 1014 unsigned long flags,
1009 int skip, int pc) 1015 int skip, int pc)
1010{ 1016{
1011#ifdef CONFIG_STACKTRACE
1012 struct ftrace_event_call *call = &event_kernel_stack; 1017 struct ftrace_event_call *call = &event_kernel_stack;
1013 struct ring_buffer_event *event; 1018 struct ring_buffer_event *event;
1014 struct stack_entry *entry; 1019 struct stack_entry *entry;
1015 struct stack_trace trace; 1020 struct stack_trace trace;
1016 1021
1017 event = trace_buffer_lock_reserve(tr, TRACE_STACK, 1022 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1018 sizeof(*entry), flags, pc); 1023 sizeof(*entry), flags, pc);
1019 if (!event) 1024 if (!event)
1020 return; 1025 return;
@@ -1027,32 +1032,28 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1027 trace.entries = entry->caller; 1032 trace.entries = entry->caller;
1028 1033
1029 save_stack_trace(&trace); 1034 save_stack_trace(&trace);
1030 if (!filter_check_discard(call, entry, tr->buffer, event)) 1035 if (!filter_check_discard(call, entry, buffer, event))
1031 ring_buffer_unlock_commit(tr->buffer, event); 1036 ring_buffer_unlock_commit(buffer, event);
1032#endif
1033} 1037}
1034 1038
1035static void ftrace_trace_stack(struct trace_array *tr, 1039void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1036 unsigned long flags, 1040 int skip, int pc)
1037 int skip, int pc)
1038{ 1041{
1039 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1042 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1040 return; 1043 return;
1041 1044
1042 __ftrace_trace_stack(tr, flags, skip, pc); 1045 __ftrace_trace_stack(buffer, flags, skip, pc);
1043} 1046}
1044 1047
1045void __trace_stack(struct trace_array *tr, 1048void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1046 unsigned long flags, 1049 int pc)
1047 int skip, int pc)
1048{ 1050{
1049 __ftrace_trace_stack(tr, flags, skip, pc); 1051 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1050} 1052}
1051 1053
1052static void ftrace_trace_userstack(struct trace_array *tr, 1054void
1053 unsigned long flags, int pc) 1055ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1054{ 1056{
1055#ifdef CONFIG_STACKTRACE
1056 struct ftrace_event_call *call = &event_user_stack; 1057 struct ftrace_event_call *call = &event_user_stack;
1057 struct ring_buffer_event *event; 1058 struct ring_buffer_event *event;
1058 struct userstack_entry *entry; 1059 struct userstack_entry *entry;
@@ -1061,7 +1062,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1061 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1062 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1062 return; 1063 return;
1063 1064
1064 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, 1065 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1065 sizeof(*entry), flags, pc); 1066 sizeof(*entry), flags, pc);
1066 if (!event) 1067 if (!event)
1067 return; 1068 return;
@@ -1075,9 +1076,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1075 trace.entries = entry->caller; 1076 trace.entries = entry->caller;
1076 1077
1077 save_stack_trace_user(&trace); 1078 save_stack_trace_user(&trace);
1078 if (!filter_check_discard(call, entry, tr->buffer, event)) 1079 if (!filter_check_discard(call, entry, buffer, event))
1079 ring_buffer_unlock_commit(tr->buffer, event); 1080 ring_buffer_unlock_commit(buffer, event);
1080#endif
1081} 1081}
1082 1082
1083#ifdef UNUSED 1083#ifdef UNUSED
@@ -1087,6 +1087,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1087} 1087}
1088#endif /* UNUSED */ 1088#endif /* UNUSED */
1089 1089
1090#endif /* CONFIG_STACKTRACE */
1091
1090static void 1092static void
1091ftrace_trace_special(void *__tr, 1093ftrace_trace_special(void *__tr,
1092 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1094 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -1094,9 +1096,10 @@ ftrace_trace_special(void *__tr,
1094{ 1096{
1095 struct ring_buffer_event *event; 1097 struct ring_buffer_event *event;
1096 struct trace_array *tr = __tr; 1098 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer;
1097 struct special_entry *entry; 1100 struct special_entry *entry;
1098 1101
1099 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, 1102 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1100 sizeof(*entry), 0, pc); 1103 sizeof(*entry), 0, pc);
1101 if (!event) 1104 if (!event)
1102 return; 1105 return;
@@ -1104,7 +1107,7 @@ ftrace_trace_special(void *__tr,
1104 entry->arg1 = arg1; 1107 entry->arg1 = arg1;
1105 entry->arg2 = arg2; 1108 entry->arg2 = arg2;
1106 entry->arg3 = arg3; 1109 entry->arg3 = arg3;
1107 trace_buffer_unlock_commit(tr, event, 0, pc); 1110 trace_buffer_unlock_commit(buffer, event, 0, pc);
1108} 1111}
1109 1112
1110void 1113void
@@ -1115,62 +1118,6 @@ __trace_special(void *__tr, void *__data,
1115} 1118}
1116 1119
1117void 1120void
1118tracing_sched_switch_trace(struct trace_array *tr,
1119 struct task_struct *prev,
1120 struct task_struct *next,
1121 unsigned long flags, int pc)
1122{
1123 struct ftrace_event_call *call = &event_context_switch;
1124 struct ring_buffer_event *event;
1125 struct ctx_switch_entry *entry;
1126
1127 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1128 sizeof(*entry), flags, pc);
1129 if (!event)
1130 return;
1131 entry = ring_buffer_event_data(event);
1132 entry->prev_pid = prev->pid;
1133 entry->prev_prio = prev->prio;
1134 entry->prev_state = prev->state;
1135 entry->next_pid = next->pid;
1136 entry->next_prio = next->prio;
1137 entry->next_state = next->state;
1138 entry->next_cpu = task_cpu(next);
1139
1140 if (!filter_check_discard(call, entry, tr->buffer, event))
1141 trace_buffer_unlock_commit(tr, event, flags, pc);
1142}
1143
1144void
1145tracing_sched_wakeup_trace(struct trace_array *tr,
1146 struct task_struct *wakee,
1147 struct task_struct *curr,
1148 unsigned long flags, int pc)
1149{
1150 struct ftrace_event_call *call = &event_wakeup;
1151 struct ring_buffer_event *event;
1152 struct ctx_switch_entry *entry;
1153
1154 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1155 sizeof(*entry), flags, pc);
1156 if (!event)
1157 return;
1158 entry = ring_buffer_event_data(event);
1159 entry->prev_pid = curr->pid;
1160 entry->prev_prio = curr->prio;
1161 entry->prev_state = curr->state;
1162 entry->next_pid = wakee->pid;
1163 entry->next_prio = wakee->prio;
1164 entry->next_state = wakee->state;
1165 entry->next_cpu = task_cpu(wakee);
1166
1167 if (!filter_check_discard(call, entry, tr->buffer, event))
1168 ring_buffer_unlock_commit(tr->buffer, event);
1169 ftrace_trace_stack(tr, flags, 6, pc);
1170 ftrace_trace_userstack(tr, flags, pc);
1171}
1172
1173void
1174ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1121ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1175{ 1122{
1176 struct trace_array *tr = &global_trace; 1123 struct trace_array *tr = &global_trace;
@@ -1194,68 +1141,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1194 local_irq_restore(flags); 1141 local_irq_restore(flags);
1195} 1142}
1196 1143
1197#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1198int trace_graph_entry(struct ftrace_graph_ent *trace)
1199{
1200 struct trace_array *tr = &global_trace;
1201 struct trace_array_cpu *data;
1202 unsigned long flags;
1203 long disabled;
1204 int ret;
1205 int cpu;
1206 int pc;
1207
1208 if (!ftrace_trace_task(current))
1209 return 0;
1210
1211 if (!ftrace_graph_addr(trace->func))
1212 return 0;
1213
1214 local_irq_save(flags);
1215 cpu = raw_smp_processor_id();
1216 data = tr->data[cpu];
1217 disabled = atomic_inc_return(&data->disabled);
1218 if (likely(disabled == 1)) {
1219 pc = preempt_count();
1220 ret = __trace_graph_entry(tr, trace, flags, pc);
1221 } else {
1222 ret = 0;
1223 }
1224 /* Only do the atomic if it is not already set */
1225 if (!test_tsk_trace_graph(current))
1226 set_tsk_trace_graph(current);
1227
1228 atomic_dec(&data->disabled);
1229 local_irq_restore(flags);
1230
1231 return ret;
1232}
1233
1234void trace_graph_return(struct ftrace_graph_ret *trace)
1235{
1236 struct trace_array *tr = &global_trace;
1237 struct trace_array_cpu *data;
1238 unsigned long flags;
1239 long disabled;
1240 int cpu;
1241 int pc;
1242
1243 local_irq_save(flags);
1244 cpu = raw_smp_processor_id();
1245 data = tr->data[cpu];
1246 disabled = atomic_inc_return(&data->disabled);
1247 if (likely(disabled == 1)) {
1248 pc = preempt_count();
1249 __trace_graph_return(tr, trace, flags, pc);
1250 }
1251 if (!trace->depth)
1252 clear_tsk_trace_graph(current);
1253 atomic_dec(&data->disabled);
1254 local_irq_restore(flags);
1255}
1256#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1257
1258
1259/** 1144/**
1260 * trace_vbprintk - write binary msg to tracing buffer 1145 * trace_vbprintk - write binary msg to tracing buffer
1261 * 1146 *
@@ -1268,6 +1153,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1268 1153
1269 struct ftrace_event_call *call = &event_bprint; 1154 struct ftrace_event_call *call = &event_bprint;
1270 struct ring_buffer_event *event; 1155 struct ring_buffer_event *event;
1156 struct ring_buffer *buffer;
1271 struct trace_array *tr = &global_trace; 1157 struct trace_array *tr = &global_trace;
1272 struct trace_array_cpu *data; 1158 struct trace_array_cpu *data;
1273 struct bprint_entry *entry; 1159 struct bprint_entry *entry;
@@ -1300,7 +1186,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1300 goto out_unlock; 1186 goto out_unlock;
1301 1187
1302 size = sizeof(*entry) + sizeof(u32) * len; 1188 size = sizeof(*entry) + sizeof(u32) * len;
1303 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); 1189 buffer = tr->buffer;
1190 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1191 flags, pc);
1304 if (!event) 1192 if (!event)
1305 goto out_unlock; 1193 goto out_unlock;
1306 entry = ring_buffer_event_data(event); 1194 entry = ring_buffer_event_data(event);
@@ -1308,8 +1196,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1308 entry->fmt = fmt; 1196 entry->fmt = fmt;
1309 1197
1310 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1198 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1311 if (!filter_check_discard(call, entry, tr->buffer, event)) 1199 if (!filter_check_discard(call, entry, buffer, event))
1312 ring_buffer_unlock_commit(tr->buffer, event); 1200 ring_buffer_unlock_commit(buffer, event);
1313 1201
1314out_unlock: 1202out_unlock:
1315 __raw_spin_unlock(&trace_buf_lock); 1203 __raw_spin_unlock(&trace_buf_lock);
@@ -1324,14 +1212,30 @@ out:
1324} 1212}
1325EXPORT_SYMBOL_GPL(trace_vbprintk); 1213EXPORT_SYMBOL_GPL(trace_vbprintk);
1326 1214
1327int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1215int trace_array_printk(struct trace_array *tr,
1216 unsigned long ip, const char *fmt, ...)
1217{
1218 int ret;
1219 va_list ap;
1220
1221 if (!(trace_flags & TRACE_ITER_PRINTK))
1222 return 0;
1223
1224 va_start(ap, fmt);
1225 ret = trace_array_vprintk(tr, ip, fmt, ap);
1226 va_end(ap);
1227 return ret;
1228}
1229
1230int trace_array_vprintk(struct trace_array *tr,
1231 unsigned long ip, const char *fmt, va_list args)
1328{ 1232{
1329 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1233 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1330 static char trace_buf[TRACE_BUF_SIZE]; 1234 static char trace_buf[TRACE_BUF_SIZE];
1331 1235
1332 struct ftrace_event_call *call = &event_print; 1236 struct ftrace_event_call *call = &event_print;
1333 struct ring_buffer_event *event; 1237 struct ring_buffer_event *event;
1334 struct trace_array *tr = &global_trace; 1238 struct ring_buffer *buffer;
1335 struct trace_array_cpu *data; 1239 struct trace_array_cpu *data;
1336 int cpu, len = 0, size, pc; 1240 int cpu, len = 0, size, pc;
1337 struct print_entry *entry; 1241 struct print_entry *entry;
@@ -1359,7 +1263,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1359 trace_buf[len] = 0; 1263 trace_buf[len] = 0;
1360 1264
1361 size = sizeof(*entry) + len + 1; 1265 size = sizeof(*entry) + len + 1;
1362 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); 1266 buffer = tr->buffer;
1267 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1268 irq_flags, pc);
1363 if (!event) 1269 if (!event)
1364 goto out_unlock; 1270 goto out_unlock;
1365 entry = ring_buffer_event_data(event); 1271 entry = ring_buffer_event_data(event);
@@ -1367,8 +1273,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1367 1273
1368 memcpy(&entry->buf, trace_buf, len); 1274 memcpy(&entry->buf, trace_buf, len);
1369 entry->buf[len] = 0; 1275 entry->buf[len] = 0;
1370 if (!filter_check_discard(call, entry, tr->buffer, event)) 1276 if (!filter_check_discard(call, entry, buffer, event))
1371 ring_buffer_unlock_commit(tr->buffer, event); 1277 ring_buffer_unlock_commit(buffer, event);
1372 1278
1373 out_unlock: 1279 out_unlock:
1374 __raw_spin_unlock(&trace_buf_lock); 1280 __raw_spin_unlock(&trace_buf_lock);
@@ -1380,6 +1286,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1380 1286
1381 return len; 1287 return len;
1382} 1288}
1289
1290int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1291{
1292 return trace_array_printk(&global_trace, ip, fmt, args);
1293}
1383EXPORT_SYMBOL_GPL(trace_vprintk); 1294EXPORT_SYMBOL_GPL(trace_vprintk);
1384 1295
1385enum trace_file_type { 1296enum trace_file_type {
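trace_array_printk(), added in the hunk above, is the usual printf-style front end over a v-style worker: check the enable flag, va_start, delegate to the va_list variant, va_end. The same pairing in standalone form (plain stdio instead of the trace buffer):

    #include <stdarg.h>
    #include <stdio.h>

    /* v-style worker: does the real formatting from a va_list */
    static int log_vprintf(const char *tag, const char *fmt, va_list ap)
    {
        int n = printf("[%s] ", tag);

        return n + vprintf(fmt, ap);
    }

    /* printf-style wrapper: owns the va_list and forwards it */
    static int log_printf(const char *tag, const char *fmt, ...)
    {
        va_list ap;
        int n;

        va_start(ap, fmt);
        n = log_vprintf(tag, fmt, ap);
        va_end(ap);
        return n;
    }

    int main(void)
    {
        log_printf("trace", "%d events on cpu %d\n", 42, 1);
        return 0;
    }

A va_list has to end up in a v-style callee like this; forwarding one through a "..." parameter, as the rewritten trace_vprintk() above does, is not portable C and only works where the two calling conventions happen to coincide.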
@@ -1519,6 +1430,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1519 return ent; 1430 return ent;
1520} 1431}
1521 1432
1433static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1434{
1435 struct trace_array *tr = iter->tr;
1436 struct ring_buffer_event *event;
1437 struct ring_buffer_iter *buf_iter;
1438 unsigned long entries = 0;
1439 u64 ts;
1440
1441 tr->data[cpu]->skipped_entries = 0;
1442
1443 if (!iter->buffer_iter[cpu])
1444 return;
1445
1446 buf_iter = iter->buffer_iter[cpu];
1447 ring_buffer_iter_reset(buf_iter);
1448
1449 /*
1450 * We could have the case with the max latency tracers
1451 * that a reset never took place on a cpu. This is evident
1452 * by the timestamp being before the start of the buffer.
1453 */
1454 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1455 if (ts >= iter->tr->time_start)
1456 break;
1457 entries++;
1458 ring_buffer_read(buf_iter, NULL);
1459 }
1460
1461 tr->data[cpu]->skipped_entries = entries;
1462}
1463
1522/* 1464/*
1523 * No necessary locking here. The worst thing which can 1465 * No necessary locking here. The worst thing which can
1524 * happen is losing events consumed at the same time 1466 * happen is losing events consumed at the same time
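tracing_iter_reset(), added just above, peeks at each buffered event and discards the ones stamped before iter->tr->time_start, counting them in skipped_entries so the header statistics stay consistent. The same skip-and-count loop, reduced to a standalone array walk in place of the ring buffer iterator:

    #include <stdio.h>

    struct ent { unsigned long long ts; int payload; };

    /* Advance *pos past entries older than time_start; return how many were skipped. */
    static unsigned long skip_stale(const struct ent *buf, int len, int *pos,
                                    unsigned long long time_start)
    {
        unsigned long skipped = 0;

        while (*pos < len && buf[*pos].ts < time_start) {
            (*pos)++;
            skipped++;
        }
        return skipped;
    }

    int main(void)
    {
        struct ent buf[] = { {10, 1}, {20, 2}, {35, 3}, {50, 4} };
        int pos = 0;
        unsigned long skipped = skip_stale(buf, 4, &pos, 30);

        printf("skipped %lu, first kept ts=%llu\n", skipped, buf[pos].ts);
        return 0;
    }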
@@ -1557,10 +1499,9 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1557 1499
1558 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1500 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1559 for_each_tracing_cpu(cpu) 1501 for_each_tracing_cpu(cpu)
1560 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1502 tracing_iter_reset(iter, cpu);
1561 } else 1503 } else
1562 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); 1504 tracing_iter_reset(iter, cpu_file);
1563
1564 1505
1565 ftrace_enable_cpu(); 1506 ftrace_enable_cpu();
1566 1507
@@ -1609,16 +1550,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1609 struct trace_array *tr = iter->tr; 1550 struct trace_array *tr = iter->tr;
1610 struct trace_array_cpu *data = tr->data[tr->cpu]; 1551 struct trace_array_cpu *data = tr->data[tr->cpu];
1611 struct tracer *type = current_trace; 1552 struct tracer *type = current_trace;
1612 unsigned long total; 1553 unsigned long entries = 0;
1613 unsigned long entries; 1554 unsigned long total = 0;
1555 unsigned long count;
1614 const char *name = "preemption"; 1556 const char *name = "preemption";
1557 int cpu;
1615 1558
1616 if (type) 1559 if (type)
1617 name = type->name; 1560 name = type->name;
1618 1561
1619 entries = ring_buffer_entries(iter->tr->buffer); 1562
1620 total = entries + 1563 for_each_tracing_cpu(cpu) {
1621 ring_buffer_overruns(iter->tr->buffer); 1564 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1565 /*
1566 * If this buffer has skipped entries, then we hold all
1567 * entries for the trace and we need to ignore the
1568 * ones before the time stamp.
1569 */
1570 if (tr->data[cpu]->skipped_entries) {
1571 count -= tr->data[cpu]->skipped_entries;
1572 /* total is the same as the entries */
1573 total += count;
1574 } else
1575 total += count +
1576 ring_buffer_overrun_cpu(tr->buffer, cpu);
1577 entries += count;
1578 }
1622 1579
1623 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1580 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1624 name, UTS_RELEASE); 1581 name, UTS_RELEASE);
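The reworked latency header above counts entries per cpu and adds the per-cpu overrun to the total only when that cpu skipped nothing, since a cpu with skipped_entries still holds its whole trace and the skipped events merely predate the window. A small standalone version of that bookkeeping (fixed arrays stand in for the ring_buffer_entries_cpu()/ring_buffer_overrun_cpu() queries):

    #include <stdio.h>

    #define NR_CPUS 2

    int main(void)
    {
        unsigned long per_cpu_entries[NR_CPUS] = { 100, 80 };
        unsigned long per_cpu_overrun[NR_CPUS] = {  10,  0 };
        unsigned long per_cpu_skipped[NR_CPUS] = {   0, 30 };
        unsigned long entries = 0, total = 0;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
            unsigned long count = per_cpu_entries[cpu];

            if (per_cpu_skipped[cpu]) {
                /* skipped entries are still in the buffer: drop them from
                 * the count and treat the total as the same value */
                count -= per_cpu_skipped[cpu];
                total += count;
            } else {
                total += count + per_cpu_overrun[cpu];
            }
            entries += count;
        }
        printf("entries=%lu total=%lu\n", entries, total);  /* 150 / 160 */
        return 0;
    }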
@@ -1660,7 +1617,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1660 seq_puts(m, "\n# => ended at: "); 1617 seq_puts(m, "\n# => ended at: ");
1661 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1618 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1662 trace_print_seq(m, &iter->seq); 1619 trace_print_seq(m, &iter->seq);
1663 seq_puts(m, "#\n"); 1620 seq_puts(m, "\n#\n");
1664 } 1621 }
1665 1622
1666 seq_puts(m, "#\n"); 1623 seq_puts(m, "#\n");
@@ -1679,6 +1636,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1679 if (cpumask_test_cpu(iter->cpu, iter->started)) 1636 if (cpumask_test_cpu(iter->cpu, iter->started))
1680 return; 1637 return;
1681 1638
1639 if (iter->tr->data[iter->cpu]->skipped_entries)
1640 return;
1641
1682 cpumask_set_cpu(iter->cpu, iter->started); 1642 cpumask_set_cpu(iter->cpu, iter->started);
1683 1643
1684 /* Don't print started cpu buffer for the first entry of the trace */ 1644 /* Don't print started cpu buffer for the first entry of the trace */
@@ -1941,19 +1901,23 @@ __tracing_open(struct inode *inode, struct file *file)
1941 if (ring_buffer_overruns(iter->tr->buffer)) 1901 if (ring_buffer_overruns(iter->tr->buffer))
1942 iter->iter_flags |= TRACE_FILE_ANNOTATE; 1902 iter->iter_flags |= TRACE_FILE_ANNOTATE;
1943 1903
1904 /* stop the trace while dumping */
1905 tracing_stop();
1906
1944 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 1907 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1945 for_each_tracing_cpu(cpu) { 1908 for_each_tracing_cpu(cpu) {
1946 1909
1947 iter->buffer_iter[cpu] = 1910 iter->buffer_iter[cpu] =
1948 ring_buffer_read_start(iter->tr->buffer, cpu); 1911 ring_buffer_read_start(iter->tr->buffer, cpu);
1912 tracing_iter_reset(iter, cpu);
1949 } 1913 }
1950 } else { 1914 } else {
1951 cpu = iter->cpu_file; 1915 cpu = iter->cpu_file;
1952 iter->buffer_iter[cpu] = 1916 iter->buffer_iter[cpu] =
1953 ring_buffer_read_start(iter->tr->buffer, cpu); 1917 ring_buffer_read_start(iter->tr->buffer, cpu);
1918 tracing_iter_reset(iter, cpu);
1954 } 1919 }
1955 1920
1956 /* TODO stop tracer */
1957 ret = seq_open(file, &tracer_seq_ops); 1921 ret = seq_open(file, &tracer_seq_ops);
1958 if (ret < 0) { 1922 if (ret < 0) {
1959 fail_ret = ERR_PTR(ret); 1923 fail_ret = ERR_PTR(ret);
@@ -1963,9 +1927,6 @@ __tracing_open(struct inode *inode, struct file *file)
1963 m = file->private_data; 1927 m = file->private_data;
1964 m->private = iter; 1928 m->private = iter;
1965 1929
1966 /* stop the trace while dumping */
1967 tracing_stop();
1968
1969 mutex_unlock(&trace_types_lock); 1930 mutex_unlock(&trace_types_lock);
1970 1931
1971 return iter; 1932 return iter;
@@ -1976,6 +1937,7 @@ __tracing_open(struct inode *inode, struct file *file)
1976 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1937 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1977 } 1938 }
1978 free_cpumask_var(iter->started); 1939 free_cpumask_var(iter->started);
1940 tracing_start();
1979 fail: 1941 fail:
1980 mutex_unlock(&trace_types_lock); 1942 mutex_unlock(&trace_types_lock);
1981 kfree(iter->trace); 1943 kfree(iter->trace);
@@ -2257,8 +2219,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2257 len += 3; /* "no" and newline */ 2219 len += 3; /* "no" and newline */
2258 } 2220 }
2259 2221
2260 /* +2 for \n and \0 */ 2222 /* +1 for \0 */
2261 buf = kmalloc(len + 2, GFP_KERNEL); 2223 buf = kmalloc(len + 1, GFP_KERNEL);
2262 if (!buf) { 2224 if (!buf) {
2263 mutex_unlock(&trace_types_lock); 2225 mutex_unlock(&trace_types_lock);
2264 return -ENOMEM; 2226 return -ENOMEM;
@@ -2281,7 +2243,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2281 } 2243 }
2282 mutex_unlock(&trace_types_lock); 2244 mutex_unlock(&trace_types_lock);
2283 2245
2284 WARN_ON(r >= len + 2); 2246 WARN_ON(r >= len + 1);
2285 2247
2286 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2248 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2287 2249
@@ -2292,23 +2254,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2292/* Try to assign a tracer specific option */ 2254/* Try to assign a tracer specific option */
2293static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 2255static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2294{ 2256{
2295 struct tracer_flags *trace_flags = trace->flags; 2257 struct tracer_flags *tracer_flags = trace->flags;
2296 struct tracer_opt *opts = NULL; 2258 struct tracer_opt *opts = NULL;
2297 int ret = 0, i = 0; 2259 int ret = 0, i = 0;
2298 int len; 2260 int len;
2299 2261
2300 for (i = 0; trace_flags->opts[i].name; i++) { 2262 for (i = 0; tracer_flags->opts[i].name; i++) {
2301 opts = &trace_flags->opts[i]; 2263 opts = &tracer_flags->opts[i];
2302 len = strlen(opts->name); 2264 len = strlen(opts->name);
2303 2265
2304 if (strncmp(cmp, opts->name, len) == 0) { 2266 if (strncmp(cmp, opts->name, len) == 0) {
2305 ret = trace->set_flag(trace_flags->val, 2267 ret = trace->set_flag(tracer_flags->val,
2306 opts->bit, !neg); 2268 opts->bit, !neg);
2307 break; 2269 break;
2308 } 2270 }
2309 } 2271 }
2310 /* Not found */ 2272 /* Not found */
2311 if (!trace_flags->opts[i].name) 2273 if (!tracer_flags->opts[i].name)
2312 return -EINVAL; 2274 return -EINVAL;
2313 2275
2314 /* Refused to handle */ 2276 /* Refused to handle */
@@ -2316,9 +2278,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2316 return ret; 2278 return ret;
2317 2279
2318 if (neg) 2280 if (neg)
2319 trace_flags->val &= ~opts->bit; 2281 tracer_flags->val &= ~opts->bit;
2320 else 2282 else
2321 trace_flags->val |= opts->bit; 2283 tracer_flags->val |= opts->bit;
2322 2284
2323 return 0; 2285 return 0;
2324} 2286}
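The rename inside set_tracer_option() matters because this file also has a global named trace_flags (the option bitmask manipulated in set_tracer_flags() just below and exported through trace.h); a local pointer with the same name silently shadows it for the rest of the function. A small illustration of the shadowing the rename removes (the names here are invented for the example):

    #include <stdio.h>

    static unsigned long trace_flags = 0x1;         /* file-scope bitmask */

    struct opts { unsigned long val; };

    static void set_option(struct opts *o)
    {
        struct opts *trace_flags = o;               /* shadows the bitmask */

        /* from here on, "trace_flags" means the pointer, not the global */
        trace_flags->val |= 0x2;
    }

    int main(void)
    {
        struct opts o = { 0 };

        set_option(&o);
        printf("opts=%lx global=%lx\n", o.val, trace_flags);
        return 0;
    }

Only -Wshadow catches this class of slip, so giving the local a distinct name is the cheaper guard.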
@@ -2333,22 +2295,6 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2333 trace_flags |= mask; 2295 trace_flags |= mask;
2334 else 2296 else
2335 trace_flags &= ~mask; 2297 trace_flags &= ~mask;
2336
2337 if (mask == TRACE_ITER_GLOBAL_CLK) {
2338 u64 (*func)(void);
2339
2340 if (enabled)
2341 func = trace_clock_global;
2342 else
2343 func = trace_clock_local;
2344
2345 mutex_lock(&trace_types_lock);
2346 ring_buffer_set_clock(global_trace.buffer, func);
2347
2348 if (max_tr.buffer)
2349 ring_buffer_set_clock(max_tr.buffer, func);
2350 mutex_unlock(&trace_types_lock);
2351 }
2352} 2298}
2353 2299
2354static ssize_t 2300static ssize_t
@@ -3316,6 +3262,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3316 return cnt; 3262 return cnt;
3317} 3263}
3318 3264
3265static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
3266 size_t cnt, loff_t *ppos)
3267{
3268 char buf[64];
3269 int bufiter = 0;
3270 int i;
3271
3272 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3273 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
3274 "%s%s%s%s", i ? " " : "",
3275 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3276 i == trace_clock_id ? "]" : "");
3277 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
3278
3279 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
3280}
3281
3282static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3283 size_t cnt, loff_t *fpos)
3284{
3285 char buf[64];
3286 const char *clockstr;
3287 int i;
3288
3289 if (cnt >= sizeof(buf))
3290 return -EINVAL;
3291
3292 if (copy_from_user(&buf, ubuf, cnt))
3293 return -EFAULT;
3294
3295 buf[cnt] = 0;
3296
3297 clockstr = strstrip(buf);
3298
3299 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
3300 if (strcmp(trace_clocks[i].name, clockstr) == 0)
3301 break;
3302 }
3303 if (i == ARRAY_SIZE(trace_clocks))
3304 return -EINVAL;
3305
3306 trace_clock_id = i;
3307
3308 mutex_lock(&trace_types_lock);
3309
3310 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
3311 if (max_tr.buffer)
3312 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
3313
3314 mutex_unlock(&trace_types_lock);
3315
3316 *fpos += cnt;
3317
3318 return cnt;
3319}
3320
3319static const struct file_operations tracing_max_lat_fops = { 3321static const struct file_operations tracing_max_lat_fops = {
3320 .open = tracing_open_generic, 3322 .open = tracing_open_generic,
3321 .read = tracing_max_lat_read, 3323 .read = tracing_max_lat_read,
@@ -3353,6 +3355,12 @@ static const struct file_operations tracing_mark_fops = {
3353 .write = tracing_mark_write, 3355 .write = tracing_mark_write,
3354}; 3356};
3355 3357
3358static const struct file_operations trace_clock_fops = {
3359 .open = tracing_open_generic,
3360 .read = tracing_clock_read,
3361 .write = tracing_clock_write,
3362};
3363
3356struct ftrace_buffer_info { 3364struct ftrace_buffer_info {
3357 struct trace_array *tr; 3365 struct trace_array *tr;
3358 void *spare; 3366 void *spare;
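tracing_clock_write() above follows the standard shape for a small debugfs control file: bound the write size, copy the buffer in, NUL-terminate, strip whitespace, then match the token against a fixed table and fail with -EINVAL otherwise. The lookup step in standalone form (a plain string in place of the user buffer, manual trimming in place of strstrip()):

    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>

    static const char *const clock_names[] = { "local", "global" };

    static int clock_lookup(char *buf)
    {
        size_t i, len = strlen(buf);

        /* trim trailing whitespace, as strstrip() would */
        while (len && isspace((unsigned char)buf[len - 1]))
            buf[--len] = '\0';

        for (i = 0; i < sizeof(clock_names) / sizeof(clock_names[0]); i++)
            if (strcmp(clock_names[i], buf) == 0)
                return (int)i;
        return -1;      /* caller turns this into -EINVAL */
    }

    int main(void)
    {
        char req[64] = "global\n";
        int id = clock_lookup(req);

        printf("selected clock id: %d\n", id);      /* 1 */
        return 0;
    }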
@@ -3633,9 +3641,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3633 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3641 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3642 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3635 3643
3636 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3638
3639 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3644 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3640 3645
3641 kfree(s); 3646 kfree(s);
@@ -4066,11 +4071,13 @@ static __init int tracer_init_debugfs(void)
4066 trace_create_file("current_tracer", 0644, d_tracer, 4071 trace_create_file("current_tracer", 0644, d_tracer,
4067 &global_trace, &set_tracer_fops); 4072 &global_trace, &set_tracer_fops);
4068 4073
4074#ifdef CONFIG_TRACER_MAX_TRACE
4069 trace_create_file("tracing_max_latency", 0644, d_tracer, 4075 trace_create_file("tracing_max_latency", 0644, d_tracer,
4070 &tracing_max_latency, &tracing_max_lat_fops); 4076 &tracing_max_latency, &tracing_max_lat_fops);
4071 4077
4072 trace_create_file("tracing_thresh", 0644, d_tracer, 4078 trace_create_file("tracing_thresh", 0644, d_tracer,
4073 &tracing_thresh, &tracing_max_lat_fops); 4079 &tracing_thresh, &tracing_max_lat_fops);
4080#endif
4074 4081
4075 trace_create_file("README", 0444, d_tracer, 4082 trace_create_file("README", 0444, d_tracer,
4076 NULL, &tracing_readme_fops); 4083 NULL, &tracing_readme_fops);
@@ -4087,6 +4094,9 @@ static __init int tracer_init_debugfs(void)
4087 trace_create_file("saved_cmdlines", 0444, d_tracer, 4094 trace_create_file("saved_cmdlines", 0444, d_tracer,
4088 NULL, &tracing_saved_cmdlines_fops); 4095 NULL, &tracing_saved_cmdlines_fops);
4089 4096
4097 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4098 &trace_clock_fops);
4099
4090#ifdef CONFIG_DYNAMIC_FTRACE 4100#ifdef CONFIG_DYNAMIC_FTRACE
4091 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4101 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4092 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4102 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4265,7 +4275,6 @@ void ftrace_dump(void)
4265 4275
4266__init static int tracer_alloc_buffers(void) 4276__init static int tracer_alloc_buffers(void)
4267{ 4277{
4268 struct trace_array_cpu *data;
4269 int ring_buf_size; 4278 int ring_buf_size;
4270 int i; 4279 int i;
4271 int ret = -ENOMEM; 4280 int ret = -ENOMEM;
@@ -4315,7 +4324,7 @@ __init static int tracer_alloc_buffers(void)
4315 4324
4316 /* Allocate the first page for all buffers */ 4325 /* Allocate the first page for all buffers */
4317 for_each_tracing_cpu(i) { 4326 for_each_tracing_cpu(i) {
4318 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4327 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4319 max_tr.data[i] = &per_cpu(max_data, i); 4328 max_tr.data[i] = &per_cpu(max_data, i);
4320 } 4329 }
4321 4330
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b9f4f6e9559..fa1dccb579d5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,8 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
35 TRACE_USER_STACK, 35 TRACE_USER_STACK,
36 TRACE_HW_BRANCHES, 36 TRACE_HW_BRANCHES,
37 TRACE_SYSCALL_ENTER,
38 TRACE_SYSCALL_EXIT,
39 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
40 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
41 TRACE_POWER, 39 TRACE_POWER,
@@ -236,9 +234,6 @@ struct trace_array_cpu {
236 atomic_t disabled; 234 atomic_t disabled;
237 void *buffer_page; /* ring buffer spare */ 235 void *buffer_page; /* ring buffer spare */
238 236
239 /* these fields get copied into max-trace: */
240 unsigned long trace_idx;
241 unsigned long overrun;
242 unsigned long saved_latency; 237 unsigned long saved_latency;
243 unsigned long critical_start; 238 unsigned long critical_start;
244 unsigned long critical_end; 239 unsigned long critical_end;
@@ -246,6 +241,7 @@ struct trace_array_cpu {
246 unsigned long nice; 241 unsigned long nice;
247 unsigned long policy; 242 unsigned long policy;
248 unsigned long rt_priority; 243 unsigned long rt_priority;
244 unsigned long skipped_entries;
249 cycle_t preempt_timestamp; 245 cycle_t preempt_timestamp;
250 pid_t pid; 246 pid_t pid;
251 uid_t uid; 247 uid_t uid;
@@ -319,10 +315,6 @@ extern void __ftrace_bad_type(void);
319 TRACE_KMEM_ALLOC); \ 315 TRACE_KMEM_ALLOC); \
320 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 316 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
321 TRACE_KMEM_FREE); \ 317 TRACE_KMEM_FREE); \
322 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
323 TRACE_SYSCALL_ENTER); \
324 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
325 TRACE_SYSCALL_EXIT); \
326 __ftrace_bad_type(); \ 318 __ftrace_bad_type(); \
327 } while (0) 319 } while (0)
328 320
@@ -423,12 +415,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
423 415
424struct ring_buffer_event; 416struct ring_buffer_event;
425 417
426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 418struct ring_buffer_event *
427 int type, 419trace_buffer_lock_reserve(struct ring_buffer *buffer,
428 unsigned long len, 420 int type,
429 unsigned long flags, 421 unsigned long len,
430 int pc); 422 unsigned long flags,
431void trace_buffer_unlock_commit(struct trace_array *tr, 423 int pc);
424void trace_buffer_unlock_commit(struct ring_buffer *buffer,
432 struct ring_buffer_event *event, 425 struct ring_buffer_event *event,
433 unsigned long flags, int pc); 426 unsigned long flags, int pc);
434 427
@@ -467,6 +460,7 @@ void trace_function(struct trace_array *tr,
467 460
468void trace_graph_return(struct ftrace_graph_ret *trace); 461void trace_graph_return(struct ftrace_graph_ret *trace);
469int trace_graph_entry(struct ftrace_graph_ent *trace); 462int trace_graph_entry(struct ftrace_graph_ent *trace);
463void set_graph_array(struct trace_array *tr);
470 464
471void tracing_start_cmdline_record(void); 465void tracing_start_cmdline_record(void);
472void tracing_stop_cmdline_record(void); 466void tracing_stop_cmdline_record(void);
@@ -478,16 +472,40 @@ void unregister_tracer(struct tracer *type);
478 472
479extern unsigned long nsecs_to_usecs(unsigned long nsecs); 473extern unsigned long nsecs_to_usecs(unsigned long nsecs);
480 474
475#ifdef CONFIG_TRACER_MAX_TRACE
481extern unsigned long tracing_max_latency; 476extern unsigned long tracing_max_latency;
482extern unsigned long tracing_thresh; 477extern unsigned long tracing_thresh;
483 478
484void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 479void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
485void update_max_tr_single(struct trace_array *tr, 480void update_max_tr_single(struct trace_array *tr,
486 struct task_struct *tsk, int cpu); 481 struct task_struct *tsk, int cpu);
482#endif /* CONFIG_TRACER_MAX_TRACE */
487 483
488void __trace_stack(struct trace_array *tr, 484#ifdef CONFIG_STACKTRACE
489 unsigned long flags, 485void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
490 int skip, int pc); 486 int skip, int pc);
487
488void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
489 int pc);
490
491void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
492 int pc);
493#else
494static inline void ftrace_trace_stack(struct trace_array *tr,
495 unsigned long flags, int skip, int pc)
496{
497}
498
499static inline void ftrace_trace_userstack(struct trace_array *tr,
500 unsigned long flags, int pc)
501{
502}
503
504static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
505 int skip, int pc)
506{
507}
508#endif /* CONFIG_STACKTRACE */
491 509
492extern cycle_t ftrace_now(int cpu); 510extern cycle_t ftrace_now(int cpu);
493 511
@@ -513,6 +531,10 @@ extern unsigned long ftrace_update_tot_cnt;
513extern int DYN_FTRACE_TEST_NAME(void); 531extern int DYN_FTRACE_TEST_NAME(void);
514#endif 532#endif
515 533
534extern int ring_buffer_expanded;
535extern bool tracing_selftest_disabled;
536DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
537
516#ifdef CONFIG_FTRACE_STARTUP_TEST 538#ifdef CONFIG_FTRACE_STARTUP_TEST
517extern int trace_selftest_startup_function(struct tracer *trace, 539extern int trace_selftest_startup_function(struct tracer *trace,
518 struct trace_array *tr); 540 struct trace_array *tr);
@@ -544,9 +566,16 @@ extern int
544trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 566trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
545extern int 567extern int
546trace_vprintk(unsigned long ip, const char *fmt, va_list args); 568trace_vprintk(unsigned long ip, const char *fmt, va_list args);
569extern int
570trace_array_vprintk(struct trace_array *tr,
571 unsigned long ip, const char *fmt, va_list args);
572int trace_array_printk(struct trace_array *tr,
573 unsigned long ip, const char *fmt, ...);
547 574
548extern unsigned long trace_flags; 575extern unsigned long trace_flags;
549 576
577extern int trace_clock_id;
578
550/* Standard output formatting function used for function return traces */ 579/* Standard output formatting function used for function return traces */
551#ifdef CONFIG_FUNCTION_GRAPH_TRACER 580#ifdef CONFIG_FUNCTION_GRAPH_TRACER
552extern enum print_line_t print_graph_function(struct trace_iterator *iter); 581extern enum print_line_t print_graph_function(struct trace_iterator *iter);
@@ -635,9 +664,8 @@ enum trace_iterator_flags {
635 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 664 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
636 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 665 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
637 TRACE_ITER_LATENCY_FMT = 0x40000, 666 TRACE_ITER_LATENCY_FMT = 0x40000,
638 TRACE_ITER_GLOBAL_CLK = 0x80000, 667 TRACE_ITER_SLEEP_TIME = 0x80000,
639 TRACE_ITER_SLEEP_TIME = 0x100000, 668 TRACE_ITER_GRAPH_TIME = 0x100000,
640 TRACE_ITER_GRAPH_TIME = 0x200000,
641}; 669};
642 670
643/* 671/*
@@ -734,6 +762,7 @@ struct ftrace_event_field {
734 struct list_head link; 762 struct list_head link;
735 char *name; 763 char *name;
736 char *type; 764 char *type;
765 int filter_type;
737 int offset; 766 int offset;
738 int size; 767 int size;
739 int is_signed; 768 int is_signed;
@@ -743,13 +772,15 @@ struct event_filter {
743 int n_preds; 772 int n_preds;
744 struct filter_pred **preds; 773 struct filter_pred **preds;
745 char *filter_string; 774 char *filter_string;
775 bool no_reset;
746}; 776};
747 777
748struct event_subsystem { 778struct event_subsystem {
749 struct list_head list; 779 struct list_head list;
750 const char *name; 780 const char *name;
751 struct dentry *entry; 781 struct dentry *entry;
752 void *filter; 782 struct event_filter *filter;
783 int nr_events;
753}; 784};
754 785
755struct filter_pred; 786struct filter_pred;
@@ -777,6 +808,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
777 char *filter_string); 808 char *filter_string);
778extern void print_subsystem_event_filter(struct event_subsystem *system, 809extern void print_subsystem_event_filter(struct event_subsystem *system,
779 struct trace_seq *s); 810 struct trace_seq *s);
811extern int filter_assign_type(const char *type);
780 812
781static inline int 813static inline int
782filter_check_discard(struct ftrace_event_call *call, void *rec, 814filter_check_discard(struct ftrace_event_call *call, void *rec,
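The trace.h changes above lean on the usual header idiom for optional features: real prototypes when the config symbol is set, empty static inline stubs when it is not, so callers never need their own #ifdef CONFIG_STACKTRACE or #ifdef CONFIG_TRACER_MAX_TRACE. The idiom in isolation (FEATURE_STACKTRACE is an invented stand-in, not a Kconfig symbol):

    #include <stdio.h>

    /* #define FEATURE_STACKTRACE 1 */

    #ifdef FEATURE_STACKTRACE
    void record_stack(int skip);            /* real implementation elsewhere */
    #else
    static inline void record_stack(int skip)
    {
        (void)skip;                         /* compiled out: no code, no warning */
    }
    #endif

    int main(void)
    {
        record_stack(2);    /* always legal to call, whatever the config */
        printf("done\n");
        return 0;
    }

An empty static inline compiles away entirely, so the disabled configuration pays nothing at the call sites.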
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a29ef23ffb47..19bfc75d467e 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -41,14 +41,12 @@ void disable_boot_trace(void)
41 41
42static int boot_trace_init(struct trace_array *tr) 42static int boot_trace_init(struct trace_array *tr)
43{ 43{
44 int cpu;
45 boot_trace = tr; 44 boot_trace = tr;
46 45
47 if (!tr) 46 if (!tr)
48 return 0; 47 return 0;
49 48
50 for_each_cpu(cpu, cpu_possible_mask) 49 tracing_reset_online_cpus(tr);
51 tracing_reset(tr, cpu);
52 50
53 tracing_sched_switch_assign_trace(tr); 51 tracing_sched_switch_assign_trace(tr);
54 return 0; 52 return 0;
@@ -132,6 +130,7 @@ struct tracer boot_tracer __read_mostly =
132void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
133{ 131{
134 struct ring_buffer_event *event; 132 struct ring_buffer_event *event;
133 struct ring_buffer *buffer;
135 struct trace_boot_call *entry; 134 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace; 135 struct trace_array *tr = boot_trace;
137 136
@@ -144,13 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
144 sprint_symbol(bt->func, (unsigned long)fn); 143 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable(); 144 preempt_disable();
146 145
147 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, 146 buffer = tr->buffer;
147 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
148 sizeof(*entry), 0, 0); 148 sizeof(*entry), 0, 0);
149 if (!event) 149 if (!event)
150 goto out; 150 goto out;
151 entry = ring_buffer_event_data(event); 151 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 152 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(tr, event, 0, 0); 153 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 154 out:
155 preempt_enable(); 155 preempt_enable();
156} 156}
@@ -158,6 +158,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 159{
160 struct ring_buffer_event *event; 160 struct ring_buffer_event *event;
161 struct ring_buffer *buffer;
161 struct trace_boot_ret *entry; 162 struct trace_boot_ret *entry;
162 struct trace_array *tr = boot_trace; 163 struct trace_array *tr = boot_trace;
163 164
@@ -167,13 +168,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
167 sprint_symbol(bt->func, (unsigned long)fn); 168 sprint_symbol(bt->func, (unsigned long)fn);
168 preempt_disable(); 169 preempt_disable();
169 170
170 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, 171 buffer = tr->buffer;
172 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
171 sizeof(*entry), 0, 0); 173 sizeof(*entry), 0, 0);
172 if (!event) 174 if (!event)
173 goto out; 175 goto out;
174 entry = ring_buffer_event_data(event); 176 entry = ring_buffer_event_data(event);
175 entry->boot_ret = *bt; 177 entry->boot_ret = *bt;
176 trace_buffer_unlock_commit(tr, event, 0, 0); 178 trace_buffer_unlock_commit(buffer, event, 0, 0);
177 out: 179 out:
178 preempt_enable(); 180 preempt_enable();
179} 181}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..78b1ed230177 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,6 +17,8 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
22#define TRACE_SYSTEM "TRACE_SYSTEM" 24#define TRACE_SYSTEM "TRACE_SYSTEM"
@@ -25,8 +27,9 @@ DEFINE_MUTEX(event_mutex);
25 27
26LIST_HEAD(ftrace_events); 28LIST_HEAD(ftrace_events);
27 29
28int trace_define_field(struct ftrace_event_call *call, char *type, 30int trace_define_field(struct ftrace_event_call *call, const char *type,
29 char *name, int offset, int size, int is_signed) 31 const char *name, int offset, int size, int is_signed,
32 int filter_type)
30{ 33{
31 struct ftrace_event_field *field; 34 struct ftrace_event_field *field;
32 35
@@ -42,9 +45,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
42 if (!field->type) 45 if (!field->type)
43 goto err; 46 goto err;
44 47
48 if (filter_type == FILTER_OTHER)
49 field->filter_type = filter_assign_type(type);
50 else
51 field->filter_type = filter_type;
52
45 field->offset = offset; 53 field->offset = offset;
46 field->size = size; 54 field->size = size;
47 field->is_signed = is_signed; 55 field->is_signed = is_signed;
56
48 list_add(&field->link, &call->fields); 57 list_add(&field->link, &call->fields);
49 58
50 return 0; 59 return 0;
@@ -60,6 +69,29 @@ err:
60} 69}
61EXPORT_SYMBOL_GPL(trace_define_field); 70EXPORT_SYMBOL_GPL(trace_define_field);
62 71
72#define __common_field(type, item) \
73 ret = trace_define_field(call, #type, "common_" #item, \
74 offsetof(typeof(ent), item), \
75 sizeof(ent.item), \
76 is_signed_type(type), FILTER_OTHER); \
77 if (ret) \
78 return ret;
79
80int trace_define_common_fields(struct ftrace_event_call *call)
81{
82 int ret;
83 struct trace_entry ent;
84
85 __common_field(unsigned short, type);
86 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid);
89 __common_field(int, tgid);
90
91 return ret;
92}
93EXPORT_SYMBOL_GPL(trace_define_common_fields);
94
63#ifdef CONFIG_MODULES 95#ifdef CONFIG_MODULES
64 96
65static void trace_destroy_fields(struct ftrace_event_call *call) 97static void trace_destroy_fields(struct ftrace_event_call *call)
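trace_define_common_fields() above registers each member of struct trace_entry by name, offset, size and signedness, with the __common_field macro generating the offsetof()/sizeof()/is_signed_type() boilerplate per field. The same table-building trick in standalone C (the struct, the field set and the local is_signed_type() are illustrative, not the kernel definitions):

    #include <stdio.h>
    #include <stddef.h>

    struct entry {
        unsigned short  type;
        unsigned char   flags;
        int             pid;
    };

    #define is_signed_type(t)   (((t)-1) < (t)1)

    #define DEFINE_FIELD(st, t, item) \
        { #item, offsetof(st, item), sizeof(t), is_signed_type(t) }

    static const struct field {
        const char *name;
        size_t offset, size;
        int is_signed;
    } fields[] = {
        DEFINE_FIELD(struct entry, unsigned short, type),
        DEFINE_FIELD(struct entry, unsigned char, flags),
        DEFINE_FIELD(struct entry, int, pid),
    };

    int main(void)
    {
        size_t i;

        for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
            printf("%-6s off=%zu size=%zu signed=%d\n",
                   fields[i].name, fields[i].offset,
                   fields[i].size, fields[i].is_signed);
        return 0;
    }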
@@ -84,14 +116,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
84 if (call->enabled) { 116 if (call->enabled) {
85 call->enabled = 0; 117 call->enabled = 0;
86 tracing_stop_cmdline_record(); 118 tracing_stop_cmdline_record();
87 call->unregfunc(); 119 call->unregfunc(call->data);
88 } 120 }
89 break; 121 break;
90 case 1: 122 case 1:
91 if (!call->enabled) { 123 if (!call->enabled) {
92 call->enabled = 1; 124 call->enabled = 1;
93 tracing_start_cmdline_record(); 125 tracing_start_cmdline_record();
94 call->regfunc(); 126 call->regfunc(call->data);
95 } 127 }
96 break; 128 break;
97 } 129 }
@@ -574,7 +606,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
574 trace_seq_printf(s, "format:\n"); 606 trace_seq_printf(s, "format:\n");
575 trace_write_header(s); 607 trace_write_header(s);
576 608
577 r = call->show_format(s); 609 r = call->show_format(call, s);
578 if (!r) { 610 if (!r) {
579 /* 611 /*
580 * ug! The format output is bigger than a PAGE!! 612 * ug! The format output is bigger than a PAGE!!
@@ -849,8 +881,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
849 881
850 /* First see if we did not already create this dir */ 882 /* First see if we did not already create this dir */
851 list_for_each_entry(system, &event_subsystems, list) { 883 list_for_each_entry(system, &event_subsystems, list) {
852 if (strcmp(system->name, name) == 0) 884 if (strcmp(system->name, name) == 0) {
885 system->nr_events++;
853 return system->entry; 886 return system->entry;
887 }
854 } 888 }
855 889
856 /* need to create new entry */ 890 /* need to create new entry */
@@ -869,6 +903,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
869 return d_events; 903 return d_events;
870 } 904 }
871 905
906 system->nr_events = 1;
872 system->name = kstrdup(name, GFP_KERNEL); 907 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) { 908 if (!system->name) {
874 debugfs_remove(system->entry); 909 debugfs_remove(system->entry);
@@ -920,15 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
920 if (strcmp(call->system, TRACE_SYSTEM) != 0) 955 if (strcmp(call->system, TRACE_SYSTEM) != 0)
921 d_events = event_subsystem_dir(call->system, d_events); 956 d_events = event_subsystem_dir(call->system, d_events);
922 957
923 if (call->raw_init) {
924 ret = call->raw_init();
925 if (ret < 0) {
926 pr_warning("Could not initialize trace point"
927 " events/%s\n", call->name);
928 return ret;
929 }
930 }
931
932 call->dir = debugfs_create_dir(call->name, d_events); 958 call->dir = debugfs_create_dir(call->name, d_events);
933 if (!call->dir) { 959 if (!call->dir) {
934 pr_warning("Could not create debugfs " 960 pr_warning("Could not create debugfs "
@@ -945,7 +971,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
945 id); 971 id);
946 972
947 if (call->define_fields) { 973 if (call->define_fields) {
948 ret = call->define_fields(); 974 ret = call->define_fields(call);
949 if (ret < 0) { 975 if (ret < 0) {
950 pr_warning("Could not initialize trace point" 976 pr_warning("Could not initialize trace point"
951 " events/%s\n", call->name); 977 " events/%s\n", call->name);
@@ -987,6 +1013,32 @@ struct ftrace_module_file_ops {
987 struct file_operations filter; 1013 struct file_operations filter;
988}; 1014};
989 1015
1016static void remove_subsystem_dir(const char *name)
1017{
1018 struct event_subsystem *system;
1019
1020 if (strcmp(name, TRACE_SYSTEM) == 0)
1021 return;
1022
1023 list_for_each_entry(system, &event_subsystems, list) {
1024 if (strcmp(system->name, name) == 0) {
1025 if (!--system->nr_events) {
1026 struct event_filter *filter = system->filter;
1027
1028 debugfs_remove_recursive(system->entry);
1029 list_del(&system->list);
1030 if (filter) {
1031 kfree(filter->filter_string);
1032 kfree(filter);
1033 }
1034 kfree(system->name);
1035 kfree(system);
1036 }
1037 break;
1038 }
1039 }
1040}
1041
990static struct ftrace_module_file_ops * 1042static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 1043trace_create_file_ops(struct module *mod)
992{ 1044{
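Together, the nr_events bump in event_subsystem_dir() and the new remove_subsystem_dir() above give each subsystem directory a reference count: reuse increments it, and the last event to leave tears down the debugfs entry, the filter and the name. The same get/put pattern as a standalone registry (malloc/free standing in for the debugfs and filter cleanup):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct subsystem {
        char *name;
        int nr_events;
        struct subsystem *next;
    };

    static struct subsystem *subsystems;

    static struct subsystem *subsystem_get(const char *name)
    {
        struct subsystem *s;

        for (s = subsystems; s; s = s->next)
            if (strcmp(s->name, name) == 0) {
                s->nr_events++;             /* reuse the existing entry */
                return s;
            }

        s = calloc(1, sizeof(*s));
        if (!s)
            return NULL;
        s->name = strdup(name);
        if (!s->name) {
            free(s);
            return NULL;
        }
        s->nr_events = 1;
        s->next = subsystems;
        subsystems = s;
        return s;
    }

    static void subsystem_put(const char *name)
    {
        struct subsystem **p, *s;

        for (p = &subsystems; (s = *p); p = &s->next)
            if (strcmp(s->name, name) == 0) {
                if (!--s->nr_events) {      /* last user: tear it down */
                    *p = s->next;
                    free(s->name);
                    free(s);
                }
                return;
            }
    }

    int main(void)
    {
        subsystem_get("sched");
        subsystem_get("sched");
        subsystem_put("sched");
        subsystem_put("sched");
        printf("registry empty: %s\n", subsystems ? "no" : "yes");
        return 0;
    }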
@@ -1027,6 +1079,7 @@ static void trace_module_add_events(struct module *mod)
1027 struct ftrace_module_file_ops *file_ops = NULL; 1079 struct ftrace_module_file_ops *file_ops = NULL;
1028 struct ftrace_event_call *call, *start, *end; 1080 struct ftrace_event_call *call, *start, *end;
1029 struct dentry *d_events; 1081 struct dentry *d_events;
1082 int ret;
1030 1083
1031 start = mod->trace_events; 1084 start = mod->trace_events;
1032 end = mod->trace_events + mod->num_trace_events; 1085 end = mod->trace_events + mod->num_trace_events;
@@ -1042,7 +1095,15 @@ static void trace_module_add_events(struct module *mod)
1042 /* The linker may leave blanks */ 1095 /* The linker may leave blanks */
1043 if (!call->name) 1096 if (!call->name)
1044 continue; 1097 continue;
1045 1098 if (call->raw_init) {
1099 ret = call->raw_init();
1100 if (ret < 0) {
1101 if (ret != -ENOSYS)
1102 pr_warning("Could not initialize trace "
1103 "point events/%s\n", call->name);
1104 continue;
1105 }
1106 }
1046 /* 1107 /*
1047 * This module has events, create file ops for this module 1108 * This module has events, create file ops for this module
1048 * if not already done. 1109 * if not already done.
@@ -1077,6 +1138,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_del(&call->list); 1138 list_del(&call->list);
1078 trace_destroy_fields(call); 1139 trace_destroy_fields(call);
1079 destroy_preds(call); 1140 destroy_preds(call);
1141 remove_subsystem_dir(call->system);
1080 } 1142 }
1081 } 1143 }
1082 1144
@@ -1133,6 +1195,18 @@ struct notifier_block trace_module_nb = {
1133extern struct ftrace_event_call __start_ftrace_events[]; 1195extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[]; 1196extern struct ftrace_event_call __stop_ftrace_events[];
1135 1197
1198static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1199
1200static __init int setup_trace_event(char *str)
1201{
1202 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1203 ring_buffer_expanded = 1;
1204 tracing_selftest_disabled = 1;
1205
1206 return 1;
1207}
1208__setup("trace_event=", setup_trace_event);
1209
1136static __init int event_trace_init(void) 1210static __init int event_trace_init(void)
1137{ 1211{
1138 struct ftrace_event_call *call; 1212 struct ftrace_event_call *call;
@@ -1140,6 +1214,8 @@ static __init int event_trace_init(void)
1140 struct dentry *entry; 1214 struct dentry *entry;
1141 struct dentry *d_events; 1215 struct dentry *d_events;
1142 int ret; 1216 int ret;
1217 char *buf = bootup_event_buf;
1218 char *token;
1143 1219
1144 d_tracer = tracing_init_dentry(); 1220 d_tracer = tracing_init_dentry();
1145 if (!d_tracer) 1221 if (!d_tracer)
@@ -1179,12 +1255,34 @@ static __init int event_trace_init(void)
1179 /* The linker may leave blanks */ 1255 /* The linker may leave blanks */
1180 if (!call->name) 1256 if (!call->name)
1181 continue; 1257 continue;
1258 if (call->raw_init) {
1259 ret = call->raw_init();
1260 if (ret < 0) {
1261 if (ret != -ENOSYS)
1262 pr_warning("Could not initialize trace "
1263 "point events/%s\n", call->name);
1264 continue;
1265 }
1266 }
1182 list_add(&call->list, &ftrace_events); 1267 list_add(&call->list, &ftrace_events);
1183 event_create_dir(call, d_events, &ftrace_event_id_fops, 1268 event_create_dir(call, d_events, &ftrace_event_id_fops,
1184 &ftrace_enable_fops, &ftrace_event_filter_fops, 1269 &ftrace_enable_fops, &ftrace_event_filter_fops,
1185 &ftrace_event_format_fops); 1270 &ftrace_event_format_fops);
1186 } 1271 }
1187 1272
1273 while (true) {
1274 token = strsep(&buf, ",");
1275
1276 if (!token)
1277 break;
1278 if (!*token)
1279 continue;
1280
1281 ret = ftrace_set_clr_event(token, 1);
1282 if (ret)
1283 pr_warning("Failed to enable trace event: %s\n", token);
1284 }
1285
1188 ret = register_module_notifier(&trace_module_nb); 1286 ret = register_module_notifier(&trace_module_nb);
1189 if (ret) 1287 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n"); 1288 pr_warning("Failed to register trace events module notifier\n");
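The boot-time plumbing above is two small pieces: __setup("trace_event=", ...) stashes the raw option string into bootup_event_buf, and event_trace_init() later walks it with strsep(), skipping empty tokens and enabling each named event. The strsep() walk on its own (an ordinary string stands in for the saved command-line fragment, and the event names are only examples):

    #define _DEFAULT_SOURCE         /* for strsep() on glibc */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char bootup_event_buf[] = "sched:sched_switch,,irq:irq_handler_entry";
        char *buf = bootup_event_buf;
        char *token;

        while (1) {
            token = strsep(&buf, ",");      /* cuts at ',' and advances buf */
            if (!token)
                break;                      /* string exhausted */
            if (!*token)
                continue;                   /* tolerate "a,,b" and stray commas */

            /* the kernel passes token to ftrace_set_clr_event(token, 1) here */
            printf("enable event: %s\n", token);
        }
        return 0;
    }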
@@ -1340,6 +1438,7 @@ static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip) 1438function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341{ 1439{
1342 struct ring_buffer_event *event; 1440 struct ring_buffer_event *event;
1441 struct ring_buffer *buffer;
1343 struct ftrace_entry *entry; 1442 struct ftrace_entry *entry;
1344 unsigned long flags; 1443 unsigned long flags;
1345 long disabled; 1444 long disabled;
@@ -1357,7 +1456,8 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1357 1456
1358 local_save_flags(flags); 1457 local_save_flags(flags);
1359 1458
1360 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), 1459 event = trace_current_buffer_lock_reserve(&buffer,
1460 TRACE_FN, sizeof(*entry),
1361 flags, pc); 1461 flags, pc);
1362 if (!event) 1462 if (!event)
1363 goto out; 1463 goto out;
@@ -1365,7 +1465,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1365 entry->ip = ip; 1465 entry->ip = ip;
1366 entry->parent_ip = parent_ip; 1466 entry->parent_ip = parent_ip;
1367 1467
1368 trace_nowake_buffer_unlock_commit(event, flags, pc); 1468 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1369 1469
1370 out: 1470 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu)); 1471 atomic_dec(&per_cpu(test_event_disable, cpu));
@@ -1392,10 +1492,10 @@ static __init void event_trace_self_test_with_function(void)
1392 1492
1393static __init int event_trace_self_tests_init(void) 1493static __init int event_trace_self_tests_init(void)
1394{ 1494{
1395 1495 if (!tracing_selftest_disabled) {
1396 event_trace_self_tests(); 1496 event_trace_self_tests();
1397 1497 event_trace_self_test_with_function();
1398 event_trace_self_test_with_function(); 1498 }
1399 1499
1400 return 0; 1500 return 0;
1401} 1501}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f32dc9d1ea7b..93660fbbf629 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -163,6 +163,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
163 return match; 163 return match;
164} 164}
165 165
166/* Filter predicate for char * pointers */
167static int filter_pred_pchar(struct filter_pred *pred, void *event,
168 int val1, int val2)
169{
170 char **addr = (char **)(event + pred->offset);
171 int cmp, match;
172
173 cmp = strncmp(*addr, pred->str_val, pred->str_len);
174
175 match = (!cmp) ^ pred->not;
176
177 return match;
178}
179
166/* 180/*
167 * Filter predicate for dynamic sized arrays of characters. 181 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end 182 * These are implemented through a list of strings at the end
@@ -176,11 +190,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
176static int filter_pred_strloc(struct filter_pred *pred, void *event, 190static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2) 191 int val1, int val2)
178{ 192{
179 unsigned short str_loc = *(unsigned short *)(event + pred->offset); 193 u32 str_item = *(u32 *)(event + pred->offset);
194 int str_loc = str_item & 0xffff;
195 int str_len = str_item >> 16;
180 char *addr = (char *)(event + str_loc); 196 char *addr = (char *)(event + str_loc);
181 int cmp, match; 197 int cmp, match;
182 198
183 cmp = strncmp(addr, pred->str_val, pred->str_len); 199 cmp = strncmp(addr, pred->str_val, str_len);
184 200
185 match = (!cmp) ^ pred->not; 201 match = (!cmp) ^ pred->not;
186 202
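The reworked filter_pred_strloc() above decodes the new __data_loc layout: one u32 in the event record holds the string's offset in its low 16 bits and its length in the high 16 bits. A standalone encode/decode of that packing (uint32_t and a flat byte array in place of the real event record):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* pack: offset in the low 16 bits, length in the high 16 bits */
    static uint32_t data_loc_pack(uint16_t offset, uint16_t len)
    {
        return (uint32_t)offset | ((uint32_t)len << 16);
    }

    int main(void)
    {
        char event[64];
        const char *payload = "ext4";
        uint32_t item = data_loc_pack(8, (uint16_t)strlen(payload));

        memcpy(event, &item, sizeof(item));     /* the u32 field sits at offset 0 */
        memcpy(event + 8, payload, strlen(payload) + 1);

        /* decode exactly as the predicate does */
        uint32_t str_item;
        memcpy(&str_item, event, sizeof(str_item));
        int str_loc = str_item & 0xffff;
        int str_len = str_item >> 16;

        printf("offset=%d len=%d str=%.*s\n", str_loc, str_len,
               str_len, event + str_loc);
        return 0;
    }

Bounding the strncmp() by the recorded length, rather than by the predicate string's length, is what the hunk above switches to.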
@@ -293,7 +309,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
293 struct event_filter *filter = call->filter; 309 struct event_filter *filter = call->filter;
294 310
295 mutex_lock(&event_mutex); 311 mutex_lock(&event_mutex);
296 if (filter->filter_string) 312 if (filter && filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string); 313 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else 314 else
299 trace_seq_printf(s, "none\n"); 315 trace_seq_printf(s, "none\n");
@@ -306,7 +322,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
306 struct event_filter *filter = system->filter; 322 struct event_filter *filter = system->filter;
307 323
308 mutex_lock(&event_mutex); 324 mutex_lock(&event_mutex);
309 if (filter->filter_string) 325 if (filter && filter->filter_string)
310 trace_seq_printf(s, "%s\n", filter->filter_string); 326 trace_seq_printf(s, "%s\n", filter->filter_string);
311 else 327 else
312 trace_seq_printf(s, "none\n"); 328 trace_seq_printf(s, "none\n");
@@ -374,6 +390,9 @@ void destroy_preds(struct ftrace_event_call *call)
374 struct event_filter *filter = call->filter; 390 struct event_filter *filter = call->filter;
375 int i; 391 int i;
376 392
393 if (!filter)
394 return;
395
377 for (i = 0; i < MAX_FILTER_PRED; i++) { 396 for (i = 0; i < MAX_FILTER_PRED; i++) {
378 if (filter->preds[i]) 397 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]); 398 filter_free_pred(filter->preds[i]);
@@ -384,17 +403,19 @@ void destroy_preds(struct ftrace_event_call *call)
384 call->filter = NULL; 403 call->filter = NULL;
385} 404}
386 405
387int init_preds(struct ftrace_event_call *call) 406static int init_preds(struct ftrace_event_call *call)
388{ 407{
389 struct event_filter *filter; 408 struct event_filter *filter;
390 struct filter_pred *pred; 409 struct filter_pred *pred;
391 int i; 410 int i;
392 411
412 if (call->filter)
413 return 0;
414
393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); 415 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
394 if (!call->filter) 416 if (!call->filter)
395 return -ENOMEM; 417 return -ENOMEM;
396 418
397 call->filter_active = 0;
398 filter->n_preds = 0; 419 filter->n_preds = 0;
399 420
400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); 421 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -416,30 +437,55 @@ oom:
416 437
417 return -ENOMEM; 438 return -ENOMEM;
418} 439}
419EXPORT_SYMBOL_GPL(init_preds);
420 440
421static void filter_free_subsystem_preds(struct event_subsystem *system) 441static int init_subsystem_preds(struct event_subsystem *system)
422{ 442{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call; 443 struct ftrace_event_call *call;
425 int i; 444 int err;
426 445
427 if (filter->n_preds) { 446 list_for_each_entry(call, &ftrace_events, list) {
428 for (i = 0; i < filter->n_preds; i++) 447 if (!call->define_fields)
429 filter_free_pred(filter->preds[i]); 448 continue;
430 kfree(filter->preds); 449
431 filter->preds = NULL; 450 if (strcmp(call->system, system->name) != 0)
432 filter->n_preds = 0; 451 continue;
452
453 err = init_preds(call);
454 if (err)
455 return err;
433 } 456 }
434 457
458 return 0;
459}
460
461enum {
462 FILTER_DISABLE_ALL,
463 FILTER_INIT_NO_RESET,
464 FILTER_SKIP_NO_RESET,
465};
466
467static void filter_free_subsystem_preds(struct event_subsystem *system,
468 int flag)
469{
470 struct ftrace_event_call *call;
471
435 list_for_each_entry(call, &ftrace_events, list) { 472 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields) 473 if (!call->define_fields)
437 continue; 474 continue;
438 475
439 if (!strcmp(call->system, system->name)) { 476 if (strcmp(call->system, system->name) != 0)
440 filter_disable_preds(call); 477 continue;
441 remove_filter_string(call->filter); 478
479 if (flag == FILTER_INIT_NO_RESET) {
480 call->filter->no_reset = false;
481 continue;
442 } 482 }
483
484 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
485 continue;
486
487 filter_disable_preds(call);
488 remove_filter_string(call->filter);
443 } 489 }
444} 490}
445 491
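The three reset modes introduced above control how much of a subsystem's per-event filter state is torn down. A rough annotation of the enum, with semantics read off the loop in filter_free_subsystem_preds() above:

        /* sketch: what each flag means to filter_free_subsystem_preds() */
        enum {
                FILTER_DISABLE_ALL,     /* clear predicates and filter string on every event */
                FILTER_INIT_NO_RESET,   /* only clear each event's no_reset mark; keep filters */
                FILTER_SKIP_NO_RESET,   /* clear filters, but skip events marked no_reset */
        };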
@@ -468,12 +514,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
468 return 0; 514 return 0;
469} 515}
470 516
471enum { 517int filter_assign_type(const char *type)
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
476static int is_string_field(const char *type)
477{ 518{
478 if (strstr(type, "__data_loc") && strstr(type, "char")) 519 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING; 520 return FILTER_DYN_STRING;
@@ -481,12 +522,19 @@ static int is_string_field(const char *type)
481 if (strchr(type, '[') && strstr(type, "char")) 522 if (strchr(type, '[') && strstr(type, "char"))
482 return FILTER_STATIC_STRING; 523 return FILTER_STATIC_STRING;
483 524
484 return 0; 525 return FILTER_OTHER;
526}
527
528static bool is_string_field(struct ftrace_event_field *field)
529{
530 return field->filter_type == FILTER_DYN_STRING ||
531 field->filter_type == FILTER_STATIC_STRING ||
532 field->filter_type == FILTER_PTR_STRING;
485} 533}
486 534
487static int is_legal_op(struct ftrace_event_field *field, int op) 535static int is_legal_op(struct ftrace_event_field *field, int op)
488{ 536{
489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) 537 if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
490 return 0; 538 return 0;
491 539
492 return 1; 540 return 1;
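filter_assign_type() classifies a field's type string once, when the field is registered, and the result is cached in field->filter_type; is_string_field() then only compares that cached value (FILTER_PTR_STRING is assigned elsewhere, for plain "char *" fields). A few illustrative classifications, assuming the matching rules shown above:

        /* illustrative inputs only -- classification follows the strstr()/strchr() tests above */
        filter_assign_type("__data_loc char[]");  /* -> FILTER_DYN_STRING    */
        filter_assign_type("char[16]");           /* -> FILTER_STATIC_STRING */
        filter_assign_type("unsigned long");      /* -> FILTER_OTHER         */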
@@ -537,22 +585,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
537 585
538static int filter_add_pred(struct filter_parse_state *ps, 586static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call, 587 struct ftrace_event_call *call,
540 struct filter_pred *pred) 588 struct filter_pred *pred,
589 bool dry_run)
541{ 590{
542 struct ftrace_event_field *field; 591 struct ftrace_event_field *field;
543 filter_pred_fn_t fn; 592 filter_pred_fn_t fn;
544 unsigned long long val; 593 unsigned long long val;
545 int string_type;
546 int ret; 594 int ret;
547 595
548 pred->fn = filter_pred_none; 596 pred->fn = filter_pred_none;
549 597
550 if (pred->op == OP_AND) { 598 if (pred->op == OP_AND) {
551 pred->pop_n = 2; 599 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 600 fn = filter_pred_and;
601 goto add_pred_fn;
553 } else if (pred->op == OP_OR) { 602 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2; 603 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 604 fn = filter_pred_or;
605 goto add_pred_fn;
556 } 606 }
557 607
558 field = find_event_field(call, pred->field_name); 608 field = find_event_field(call, pred->field_name);
@@ -568,16 +618,17 @@ static int filter_add_pred(struct filter_parse_state *ps,
568 return -EINVAL; 618 return -EINVAL;
569 } 619 }
570 620
571 string_type = is_string_field(field->type); 621 if (is_string_field(field)) {
572 if (string_type) { 622 pred->str_len = field->size;
573 if (string_type == FILTER_STATIC_STRING) 623
624 if (field->filter_type == FILTER_STATIC_STRING)
574 fn = filter_pred_string; 625 fn = filter_pred_string;
575 else 626 else if (field->filter_type == FILTER_DYN_STRING)
576 fn = filter_pred_strloc; 627 fn = filter_pred_strloc;
577 pred->str_len = field->size; 628 else {
578 if (pred->op == OP_NE) 629 fn = filter_pred_pchar;
579 pred->not = 1; 630 pred->str_len = strlen(pred->str_val);
580 return filter_add_pred_fn(ps, call, pred, fn); 631 }
581 } else { 632 } else {
582 if (field->is_signed) 633 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val); 634 ret = strict_strtoll(pred->str_val, 0, &val);
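With the cached filter_type, the string branch of filter_add_pred() picks its comparison helper directly, and the new FILTER_PTR_STRING case ("char *" fields) compares against the length of the user-supplied string rather than the field size. Condensed, with comments added (the helpers are the ones named in the hunk):

        if (is_string_field(field)) {
                pred->str_len = field->size;

                if (field->filter_type == FILTER_STATIC_STRING)
                        fn = filter_pred_string;   /* fixed-size char array in the event */
                else if (field->filter_type == FILTER_DYN_STRING)
                        fn = filter_pred_strloc;   /* __data_loc dynamic string */
                else {
                        fn = filter_pred_pchar;    /* plain char * pointer */
                        pred->str_len = strlen(pred->str_val);
                }
        }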
@@ -588,41 +639,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
588 return -EINVAL; 639 return -EINVAL;
589 } 640 }
590 pred->val = val; 641 pred->val = val;
591 }
592 642
593 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 643 fn = select_comparison_fn(pred->op, field->size,
594 if (!fn) { 644 field->is_signed);
595 parse_error(ps, FILT_ERR_INVALID_OP, 0); 645 if (!fn) {
596 return -EINVAL; 646 parse_error(ps, FILT_ERR_INVALID_OP, 0);
647 return -EINVAL;
648 }
597 } 649 }
598 650
599 if (pred->op == OP_NE) 651 if (pred->op == OP_NE)
600 pred->not = 1; 652 pred->not = 1;
601 653
602 return filter_add_pred_fn(ps, call, pred, fn); 654add_pred_fn:
655 if (!dry_run)
656 return filter_add_pred_fn(ps, call, pred, fn);
657 return 0;
603} 658}
604 659
605static int filter_add_subsystem_pred(struct filter_parse_state *ps, 660static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system, 661 struct event_subsystem *system,
607 struct filter_pred *pred, 662 struct filter_pred *pred,
608 char *filter_string) 663 char *filter_string,
664 bool dry_run)
609{ 665{
610 struct event_filter *filter = system->filter;
611 struct ftrace_event_call *call; 666 struct ftrace_event_call *call;
612 int err = 0; 667 int err = 0;
613 668 bool fail = true;
614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
617
618 if (!filter->preds)
619 return -ENOMEM;
620 }
621
622 if (filter->n_preds == MAX_FILTER_PRED) {
623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
624 return -ENOSPC;
625 }
626 669
627 list_for_each_entry(call, &ftrace_events, list) { 670 list_for_each_entry(call, &ftrace_events, list) {
628 671
@@ -632,19 +675,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
632 if (strcmp(call->system, system->name)) 675 if (strcmp(call->system, system->name))
633 continue; 676 continue;
634 677
635 err = filter_add_pred(ps, call, pred); 678 if (call->filter->no_reset)
636 if (err) { 679 continue;
637 filter_free_subsystem_preds(system); 680
638 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 681 err = filter_add_pred(ps, call, pred, dry_run);
639 goto out; 682 if (err)
640 } 683 call->filter->no_reset = true;
641 replace_filter_string(call->filter, filter_string); 684 else
685 fail = false;
686
687 if (!dry_run)
688 replace_filter_string(call->filter, filter_string);
642 } 689 }
643 690
644 filter->preds[filter->n_preds] = pred; 691 if (fail) {
645 filter->n_preds++; 692 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
646out: 693 return err;
647 return err; 694 }
695 return 0;
648} 696}
649 697
650static void parse_init(struct filter_parse_state *ps, 698static void parse_init(struct filter_parse_state *ps,
@@ -1003,12 +1051,14 @@ static int check_preds(struct filter_parse_state *ps)
1003static int replace_preds(struct event_subsystem *system, 1051static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call, 1052 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps, 1053 struct filter_parse_state *ps,
1006 char *filter_string) 1054 char *filter_string,
1055 bool dry_run)
1007{ 1056{
1008 char *operand1 = NULL, *operand2 = NULL; 1057 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred; 1058 struct filter_pred *pred;
1010 struct postfix_elt *elt; 1059 struct postfix_elt *elt;
1011 int err; 1060 int err;
1061 int n_preds = 0;
1012 1062
1013 err = check_preds(ps); 1063 err = check_preds(ps);
1014 if (err) 1064 if (err)
@@ -1027,24 +1077,14 @@ static int replace_preds(struct event_subsystem *system,
1027 continue; 1077 continue;
1028 } 1078 }
1029 1079
1080 if (n_preds++ == MAX_FILTER_PRED) {
1081 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1082 return -ENOSPC;
1083 }
1084
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1085 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1086 pred = create_logical_pred(elt->op);
1032 if (!pred) 1087 goto add_pred;
1033 return -ENOMEM;
1034 if (call) {
1035 err = filter_add_pred(ps, call, pred);
1036 filter_free_pred(pred);
1037 } else {
1038 err = filter_add_subsystem_pred(ps, system,
1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1043 if (err)
1044 return err;
1045
1046 operand1 = operand2 = NULL;
1047 continue;
1048 } 1088 }
1049 1089
1050 if (!operand1 || !operand2) { 1090 if (!operand1 || !operand2) {
@@ -1053,17 +1093,15 @@ static int replace_preds(struct event_subsystem *system,
1053 } 1093 }
1054 1094
1055 pred = create_pred(elt->op, operand1, operand2); 1095 pred = create_pred(elt->op, operand1, operand2);
1096add_pred:
1056 if (!pred) 1097 if (!pred)
1057 return -ENOMEM; 1098 return -ENOMEM;
1058 if (call) { 1099 if (call)
1059 err = filter_add_pred(ps, call, pred); 1100 err = filter_add_pred(ps, call, pred, false);
1060 filter_free_pred(pred); 1101 else
1061 } else {
1062 err = filter_add_subsystem_pred(ps, system, pred, 1102 err = filter_add_subsystem_pred(ps, system, pred,
1063 filter_string); 1103 filter_string, dry_run);
1064 if (err) 1104 filter_free_pred(pred);
1065 filter_free_pred(pred);
1066 }
1067 if (err) 1105 if (err)
1068 return err; 1106 return err;
1069 1107
@@ -1081,6 +1119,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1081 1119
1082 mutex_lock(&event_mutex); 1120 mutex_lock(&event_mutex);
1083 1121
1122 err = init_preds(call);
1123 if (err)
1124 goto out_unlock;
1125
1084 if (!strcmp(strstrip(filter_string), "0")) { 1126 if (!strcmp(strstrip(filter_string), "0")) {
1085 filter_disable_preds(call); 1127 filter_disable_preds(call);
1086 remove_filter_string(call->filter); 1128 remove_filter_string(call->filter);
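init_preds() is no longer exported or called from the event-definition macros; it is invoked here instead, on the first write to an event's filter file, and returns 0 immediately if call->filter already exists. A minimal sketch of that lazy-allocation guard:

        static int init_preds(struct ftrace_event_call *call)
        {
                if (call->filter)       /* already allocated by an earlier write */
                        return 0;

                /* ... kzalloc() the filter and its MAX_FILTER_PRED empty preds ... */
                return 0;               /* or -ENOMEM on allocation failure */
        }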
@@ -1103,7 +1145,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1103 goto out; 1145 goto out;
1104 } 1146 }
1105 1147
1106 err = replace_preds(NULL, call, ps, filter_string); 1148 err = replace_preds(NULL, call, ps, filter_string, false);
1107 if (err) 1149 if (err)
1108 append_filter_err(ps, call->filter); 1150 append_filter_err(ps, call->filter);
1109 1151
@@ -1126,8 +1168,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1126 1168
1127 mutex_lock(&event_mutex); 1169 mutex_lock(&event_mutex);
1128 1170
1171 err = init_subsystem_preds(system);
1172 if (err)
1173 goto out_unlock;
1174
1129 if (!strcmp(strstrip(filter_string), "0")) { 1175 if (!strcmp(strstrip(filter_string), "0")) {
1130 filter_free_subsystem_preds(system); 1176 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1131 remove_filter_string(system->filter); 1177 remove_filter_string(system->filter);
1132 mutex_unlock(&event_mutex); 1178 mutex_unlock(&event_mutex);
1133 return 0; 1179 return 0;
@@ -1138,7 +1184,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1138 if (!ps) 1184 if (!ps)
1139 goto out_unlock; 1185 goto out_unlock;
1140 1186
1141 filter_free_subsystem_preds(system);
1142 replace_filter_string(system->filter, filter_string); 1187 replace_filter_string(system->filter, filter_string);
1143 1188
1144 parse_init(ps, filter_ops, filter_string); 1189 parse_init(ps, filter_ops, filter_string);
@@ -1148,9 +1193,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1148 goto out; 1193 goto out;
1149 } 1194 }
1150 1195
1151 err = replace_preds(system, NULL, ps, filter_string); 1196 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1152 if (err) 1197
1198 /* try to see which events the filter can be applied to */
1199 err = replace_preds(system, NULL, ps, filter_string, true);
1200 if (err) {
1153 append_filter_err(ps, system->filter); 1201 append_filter_err(ps, system->filter);
1202 goto out;
1203 }
1204
1205 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1206
1207 /* really apply the filter to the events */
1208 err = replace_preds(system, NULL, ps, filter_string, false);
1209 if (err) {
1210 append_filter_err(ps, system->filter);
1211 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1212 }
1154 1213
1155out: 1214out:
1156 filter_opstack_clear(ps); 1215 filter_opstack_clear(ps);
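Taken together, applying a subsystem filter is now a two-pass operation: a dry run first discovers which events the expression can be applied to (marking the others no_reset), and only then are the surviving events reset and the predicates installed for real. Compressed from the hunk above, without locking or error reporting:

        filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);

        /* pass 1: dry run -- find out which events accept this filter */
        err = replace_preds(system, NULL, ps, filter_string, true);
        if (err)
                goto out;              /* no event in the subsystem accepted it */

        /* drop old predicates only on the events that passed the dry run */
        filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);

        /* pass 2: really install the predicates on those events */
        err = replace_preds(system, NULL, ps, filter_string, false);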
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..df1bf6e48bb9 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -60,7 +60,8 @@ extern void __bad_type_size(void);
60#undef TRACE_EVENT_FORMAT 60#undef TRACE_EVENT_FORMAT
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
62static int \ 62static int \
63ftrace_format_##call(struct trace_seq *s) \ 63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
64{ \ 65{ \
65 struct args field; \ 66 struct args field; \
66 int ret; \ 67 int ret; \
@@ -76,7 +77,8 @@ ftrace_format_##call(struct trace_seq *s) \
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
77 tpfmt) \ 78 tpfmt) \
78static int \ 79static int \
79ftrace_format_##call(struct trace_seq *s) \ 80ftrace_format_##call(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \
80{ \ 82{ \
81 struct args field; \ 83 struct args field; \
82 int ret; \ 84 int ret; \
@@ -117,7 +119,7 @@ ftrace_format_##call(struct trace_seq *s) \
117 119
118#undef TRACE_EVENT_FORMAT 120#undef TRACE_EVENT_FORMAT
119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 121#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
120int ftrace_define_fields_##call(void); \ 122int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
121static int ftrace_raw_init_event_##call(void); \ 123static int ftrace_raw_init_event_##call(void); \
122 \ 124 \
123struct ftrace_event_call __used \ 125struct ftrace_event_call __used \
@@ -133,7 +135,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
133static int ftrace_raw_init_event_##call(void) \ 135static int ftrace_raw_init_event_##call(void) \
134{ \ 136{ \
135 INIT_LIST_HEAD(&event_##call.fields); \ 137 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \ 138 return 0; \
138} \ 139} \
139 140
@@ -156,7 +157,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
156#define TRACE_FIELD(type, item, assign) \ 157#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \ 158 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \ 159 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \ 160 sizeof(field.item), \
161 is_signed_type(type), FILTER_OTHER); \
160 if (ret) \ 162 if (ret) \
161 return ret; 163 return ret;
162 164
@@ -164,7 +166,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 166#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 167 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \ 168 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \ 169 sizeof(field.item), 0, FILTER_OTHER); \
168 if (ret) \ 170 if (ret) \
169 return ret; 171 return ret;
170 172
@@ -172,7 +174,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 174#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \ 175 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \ 176 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \ 177 sizeof(field.item), is_signed, \
178 FILTER_OTHER); \
176 if (ret) \ 179 if (ret) \
177 return ret; 180 return ret;
178 181
@@ -182,17 +185,14 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
182#undef TRACE_EVENT_FORMAT 185#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 186#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \ 187int \
185ftrace_define_fields_##call(void) \ 188ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
186{ \ 189{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \ 190 struct args field; \
189 int ret; \ 191 int ret; \
190 \ 192 \
191 __common_field(unsigned char, type, 0); \ 193 ret = trace_define_common_fields(event_call); \
192 __common_field(unsigned char, flags, 0); \ 194 if (ret) \
193 __common_field(unsigned char, preempt_count, 0); \ 195 return ret; \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \ 196 \
197 tstruct; \ 197 tstruct; \
198 \ 198 \
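Two mechanical changes run through trace_export.c: every trace_define_field() call gains a filter-type argument (FILTER_OTHER for ordinary numeric fields), and the hand-rolled __common_field list is replaced by a single trace_define_common_fields() call. A hedged sketch of what a generated define-fields function now reduces to (the event name and its "ip" field are made up for illustration):

        int ftrace_define_fields_example(struct ftrace_event_call *event_call)
        {
                struct example_entry field;   /* hypothetical event record layout */
                int ret;

                ret = trace_define_common_fields(event_call);  /* type, flags, pid, ... */
                if (ret)
                        return ret;

                return trace_define_field(event_call, "unsigned long", "ip",
                                          offsetof(typeof(field), ip),
                                          sizeof(field.ip), 0 /* unsigned */,
                                          FILTER_OTHER);
        }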
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 75ef000613c3..5b01b94518fc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -288,11 +288,9 @@ static int
288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
289 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
290{ 290{
291 char str[KSYM_SYMBOL_LEN];
292 long count = (long)data; 291 long count = (long)data;
293 292
294 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%pf:", (void *)ip);
295 seq_printf(m, "%s:", str);
296 294
297 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
298 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
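The manual kallsyms_lookup() into a KSYM_SYMBOL_LEN stack buffer is replaced by the kernel's %pf/%pF vsprintf extensions, which resolve a function address to its symbol name inside the format string itself; the same substitution appears again in trace_functions_graph.c and trace_stack.c below. Side by side:

        /* before: resolve the symbol by hand into a stack buffer */
        char str[KSYM_SYMBOL_LEN];
        kallsyms_lookup(ip, NULL, NULL, NULL, str);
        seq_printf(m, "%s:", str);

        /* after: let the printf core resolve it */
        seq_printf(m, "%pf:", (void *)ip);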
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 420ec3487579..b3749a2c3132 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
@@ -166,10 +166,123 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
166 return ret; 166 return ret;
167} 167}
168 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry;
178
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
180 return 0;
181
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
183 sizeof(*entry), flags, pc);
184 if (!event)
185 return 0;
186 entry = ring_buffer_event_data(event);
187 entry->graph_ent = *trace;
188 if (!filter_current_check_discard(buffer, call, entry, event))
189 ring_buffer_unlock_commit(buffer, event);
190
191 return 1;
192}
193
194int trace_graph_entry(struct ftrace_graph_ent *trace)
195{
196 struct trace_array *tr = graph_array;
197 struct trace_array_cpu *data;
198 unsigned long flags;
199 long disabled;
200 int ret;
201 int cpu;
202 int pc;
203
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current))
208 return 0;
209
210 if (!ftrace_graph_addr(trace->func))
211 return 0;
212
213 local_irq_save(flags);
214 cpu = raw_smp_processor_id();
215 data = tr->data[cpu];
216 disabled = atomic_inc_return(&data->disabled);
217 if (likely(disabled == 1)) {
218 pc = preempt_count();
219 ret = __trace_graph_entry(tr, trace, flags, pc);
220 } else {
221 ret = 0;
222 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226
227 atomic_dec(&data->disabled);
228 local_irq_restore(flags);
229
230 return ret;
231}
232
233static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace,
235 unsigned long flags,
236 int pc)
237{
238 struct ftrace_event_call *call = &event_funcgraph_exit;
239 struct ring_buffer_event *event;
240 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry;
242
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
244 return;
245
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
247 sizeof(*entry), flags, pc);
248 if (!event)
249 return;
250 entry = ring_buffer_event_data(event);
251 entry->ret = *trace;
252 if (!filter_current_check_discard(buffer, call, entry, event))
253 ring_buffer_unlock_commit(buffer, event);
254}
255
256void trace_graph_return(struct ftrace_graph_ret *trace)
257{
258 struct trace_array *tr = graph_array;
259 struct trace_array_cpu *data;
260 unsigned long flags;
261 long disabled;
262 int cpu;
263 int pc;
264
265 local_irq_save(flags);
266 cpu = raw_smp_processor_id();
267 data = tr->data[cpu];
268 disabled = atomic_inc_return(&data->disabled);
269 if (likely(disabled == 1)) {
270 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc);
272 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled);
276 local_irq_restore(flags);
277}
278
169static int graph_trace_init(struct trace_array *tr) 279static int graph_trace_init(struct trace_array *tr)
170{ 280{
171 int ret = register_ftrace_graph(&trace_graph_return, 281 int ret;
172 &trace_graph_entry); 282
283 graph_array = tr;
284 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry);
173 if (ret) 286 if (ret)
174 return ret; 287 return ret;
175 tracing_start_cmdline_record(); 288 tracing_start_cmdline_record();
@@ -177,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
177 return 0; 290 return 0;
178} 291}
179 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
180static void graph_trace_reset(struct trace_array *tr) 298static void graph_trace_reset(struct trace_array *tr)
181{ 299{
182 tracing_stop_cmdline_record(); 300 tracing_stop_cmdline_record();
183 unregister_ftrace_graph(); 301 unregister_ftrace_graph();
184} 302}
185 303
186static inline int log10_cpu(int nb) 304static int max_bytes_for_cpu;
187{
188 if (nb / 100)
189 return 3;
190 if (nb / 10)
191 return 2;
192 return 1;
193}
194 305
195static enum print_line_t 306static enum print_line_t
196print_graph_cpu(struct trace_seq *s, int cpu) 307print_graph_cpu(struct trace_seq *s, int cpu)
197{ 308{
198 int i;
199 int ret; 309 int ret;
200 int log10_this = log10_cpu(cpu);
201 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
202
203 310
204 /* 311 /*
205 * Start with a space character - to make it stand out 312 * Start with a space character - to make it stand out
206 * to the right a bit when trace output is pasted into 313 * to the right a bit when trace output is pasted into
207 * email: 314 * email:
208 */ 315 */
209 ret = trace_seq_printf(s, " "); 316 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
210
211 /*
212 * Tricky - we space the CPU field according to the max
213 * number of online CPUs. On a 2-cpu system it would take
214 * a maximum of 1 digit - on a 128 cpu system it would
215 * take up to 3 digits:
216 */
217 for (i = 0; i < log10_all - log10_this; i++) {
218 ret = trace_seq_printf(s, " ");
219 if (!ret)
220 return TRACE_TYPE_PARTIAL_LINE;
221 }
222 ret = trace_seq_printf(s, "%d) ", cpu);
223 if (!ret) 317 if (!ret)
224 return TRACE_TYPE_PARTIAL_LINE; 318 return TRACE_TYPE_PARTIAL_LINE;
225 319
@@ -565,11 +659,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
565 return TRACE_TYPE_PARTIAL_LINE; 659 return TRACE_TYPE_PARTIAL_LINE;
566 } 660 }
567 661
568 ret = seq_print_ip_sym(s, call->func, 0); 662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
569 if (!ret)
570 return TRACE_TYPE_PARTIAL_LINE;
571
572 ret = trace_seq_printf(s, "();\n");
573 if (!ret) 663 if (!ret)
574 return TRACE_TYPE_PARTIAL_LINE; 664 return TRACE_TYPE_PARTIAL_LINE;
575 665
@@ -612,11 +702,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
612 return TRACE_TYPE_PARTIAL_LINE; 702 return TRACE_TYPE_PARTIAL_LINE;
613 } 703 }
614 704
615 ret = seq_print_ip_sym(s, call->func, 0); 705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
616 if (!ret)
617 return TRACE_TYPE_PARTIAL_LINE;
618
619 ret = trace_seq_printf(s, "() {\n");
620 if (!ret) 706 if (!ret)
621 return TRACE_TYPE_PARTIAL_LINE; 707 return TRACE_TYPE_PARTIAL_LINE;
622 708
@@ -934,6 +1020,8 @@ static struct tracer graph_trace __read_mostly = {
934 1020
935static __init int init_graph_trace(void) 1021static __init int init_graph_trace(void)
936{ 1022{
1023 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1024
937 return register_tracer(&graph_trace); 1025 return register_tracer(&graph_trace);
938} 1026}
939 1027
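Besides routing entries through the file-local graph_array, print_graph_cpu() stops recomputing a log10-style column width on every line; the width is measured once in init_graph_trace() with snprintf(NULL, 0, ...), which returns the number of characters the format would produce without writing anything, and is then fed to a "%*d" field width. A small stand-alone illustration of the trick (nr_cpu_ids replaced by a plain constant):

        #include <stdio.h>

        int main(void)
        {
                int nr_cpu_ids = 128;   /* stand-in for the kernel's CPU count */
                /* snprintf(NULL, 0, ...) just measures: "127" -> 3 characters */
                int max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);

                for (int cpu = 0; cpu < nr_cpu_ids; cpu += 63)
                        printf(" %*d) ...\n", max_bytes_for_cpu, cpu);
                return 0;
        }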
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fad..5555b75a0d12 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -178,7 +178,6 @@ out_unlock:
178out: 178out:
179 data->critical_sequence = max_sequence; 179 data->critical_sequence = max_sequence;
180 data->preempt_timestamp = ftrace_now(cpu); 180 data->preempt_timestamp = ftrace_now(cpu);
181 tracing_reset(tr, cpu);
182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 181 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
183} 182}
184 183
@@ -208,7 +207,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
208 data->critical_sequence = max_sequence; 207 data->critical_sequence = max_sequence;
209 data->preempt_timestamp = ftrace_now(cpu); 208 data->preempt_timestamp = ftrace_now(cpu);
210 data->critical_start = parent_ip ? : ip; 209 data->critical_start = parent_ip ? : ip;
211 tracing_reset(tr, cpu);
212 210
213 local_save_flags(flags); 211 local_save_flags(flags);
214 212
@@ -379,6 +377,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
379 irqsoff_trace = tr; 377 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */ 378 /* make sure that the tracer is visible */
381 smp_wmb(); 379 smp_wmb();
380 tracing_reset_online_cpus(tr);
382 start_irqsoff_tracer(tr); 381 start_irqsoff_tracer(tr);
383} 382}
384 383
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed0806..c4c9bbda53d3 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,11 +307,12 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ring_buffer *buffer = tr->buffer;
310 struct ring_buffer_event *event; 311 struct ring_buffer_event *event;
311 struct trace_mmiotrace_rw *entry; 312 struct trace_mmiotrace_rw *entry;
312 int pc = preempt_count(); 313 int pc = preempt_count();
313 314
314 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, 315 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
315 sizeof(*entry), 0, pc); 316 sizeof(*entry), 0, pc);
316 if (!event) { 317 if (!event) {
317 atomic_inc(&dropped_count); 318 atomic_inc(&dropped_count);
@@ -319,7 +320,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
319 } 320 }
320 entry = ring_buffer_event_data(event); 321 entry = ring_buffer_event_data(event);
321 entry->rw = *rw; 322 entry->rw = *rw;
322 trace_buffer_unlock_commit(tr, event, 0, pc); 323 trace_buffer_unlock_commit(buffer, event, 0, pc);
323} 324}
324 325
325void mmio_trace_rw(struct mmiotrace_rw *rw) 326void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +334,12 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
333 struct trace_array_cpu *data, 334 struct trace_array_cpu *data,
334 struct mmiotrace_map *map) 335 struct mmiotrace_map *map)
335{ 336{
337 struct ring_buffer *buffer = tr->buffer;
336 struct ring_buffer_event *event; 338 struct ring_buffer_event *event;
337 struct trace_mmiotrace_map *entry; 339 struct trace_mmiotrace_map *entry;
338 int pc = preempt_count(); 340 int pc = preempt_count();
339 341
340 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, 342 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
341 sizeof(*entry), 0, pc); 343 sizeof(*entry), 0, pc);
342 if (!event) { 344 if (!event) {
343 atomic_inc(&dropped_count); 345 atomic_inc(&dropped_count);
@@ -345,7 +347,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
345 } 347 }
346 entry = ring_buffer_event_data(event); 348 entry = ring_buffer_event_data(event);
347 entry->map = *map; 349 entry->map = *map;
348 trace_buffer_unlock_commit(tr, event, 0, pc); 350 trace_buffer_unlock_commit(buffer, event, 0, pc);
349} 351}
350 352
351void mmio_trace_mapping(struct mmiotrace_map *map) 353void mmio_trace_mapping(struct mmiotrace_map *map)
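As with the tracing-core changes earlier in this merge, the per-tracer probes now hand an explicit struct ring_buffer * to trace_buffer_lock_reserve()/trace_buffer_unlock_commit() instead of the owning trace_array; mmiotrace, the power tracer, and the sched_switch/wakeup code below all follow the same shape:

        struct ring_buffer *buffer = tr->buffer;
        struct ring_buffer_event *event;
        struct trace_mmiotrace_rw *entry;

        event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
                                          sizeof(*entry), 0, pc);
        if (!event) {
                atomic_inc(&dropped_count);    /* buffer full or tracing disabled */
                return;
        }
        entry = ring_buffer_event_data(event);
        entry->rw = *rw;
        trace_buffer_unlock_commit(buffer, event, 0, pc);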
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8a30d9874cd4..fe1a00f1445a 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -38,6 +38,7 @@ static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power; 39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
41 struct trace_power *entry; 42 struct trace_power *entry;
42 struct trace_array_cpu *data; 43 struct trace_array_cpu *data;
43 struct trace_array *tr = power_trace; 44 struct trace_array *tr = power_trace;
@@ -45,18 +46,20 @@ static void probe_power_end(struct power_trace *it)
45 if (!trace_power_enabled) 46 if (!trace_power_enabled)
46 return; 47 return;
47 48
49 buffer = tr->buffer;
50
48 preempt_disable(); 51 preempt_disable();
49 it->end = ktime_get(); 52 it->end = ktime_get();
50 data = tr->data[smp_processor_id()]; 53 data = tr->data[smp_processor_id()];
51 54
52 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
53 sizeof(*entry), 0, 0); 56 sizeof(*entry), 0, 0);
54 if (!event) 57 if (!event)
55 goto out; 58 goto out;
56 entry = ring_buffer_event_data(event); 59 entry = ring_buffer_event_data(event);
57 entry->state_data = *it; 60 entry->state_data = *it;
58 if (!filter_check_discard(call, entry, tr->buffer, event)) 61 if (!filter_check_discard(call, entry, buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0); 62 trace_buffer_unlock_commit(buffer, event, 0, 0);
60 out: 63 out:
61 preempt_enable(); 64 preempt_enable();
62} 65}
@@ -66,6 +69,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
66{ 69{
67 struct ftrace_event_call *call = &event_power; 70 struct ftrace_event_call *call = &event_power;
68 struct ring_buffer_event *event; 71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
69 struct trace_power *entry; 73 struct trace_power *entry;
70 struct trace_array_cpu *data; 74 struct trace_array_cpu *data;
71 struct trace_array *tr = power_trace; 75 struct trace_array *tr = power_trace;
@@ -73,6 +77,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
73 if (!trace_power_enabled) 77 if (!trace_power_enabled)
74 return; 78 return;
75 79
80 buffer = tr->buffer;
81
76 memset(it, 0, sizeof(struct power_trace)); 82 memset(it, 0, sizeof(struct power_trace));
77 it->state = level; 83 it->state = level;
78 it->type = type; 84 it->type = type;
@@ -81,14 +87,14 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
81 it->end = it->stamp; 87 it->end = it->stamp;
82 data = tr->data[smp_processor_id()]; 88 data = tr->data[smp_processor_id()];
83 89
84 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
85 sizeof(*entry), 0, 0); 91 sizeof(*entry), 0, 0);
86 if (!event) 92 if (!event)
87 goto out; 93 goto out;
88 entry = ring_buffer_event_data(event); 94 entry = ring_buffer_event_data(event);
89 entry->state_data = *it; 95 entry->state_data = *it;
90 if (!filter_check_discard(call, entry, tr->buffer, event)) 96 if (!filter_check_discard(call, entry, buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0); 97 trace_buffer_unlock_commit(buffer, event, 0, 0);
92 out: 98 out:
93 preempt_enable(); 99 preempt_enable();
94} 100}
@@ -144,14 +150,12 @@ static void power_trace_reset(struct trace_array *tr)
144 150
145static int power_trace_init(struct trace_array *tr) 151static int power_trace_init(struct trace_array *tr)
146{ 152{
147 int cpu;
148 power_trace = tr; 153 power_trace = tr;
149 154
150 trace_power_enabled = 1; 155 trace_power_enabled = 1;
151 tracing_power_register(); 156 tracing_power_register();
152 157
153 for_each_cpu(cpu, cpu_possible_mask) 158 tracing_reset_online_cpus(tr);
154 tracing_reset(tr, cpu);
155 return 0; 159 return 0;
156} 160}
157 161
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..5fca0f51fde4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,35 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51
23static void 52static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 54 struct task_struct *next)
@@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 78 local_irq_restore(flags);
50} 79}
51 80
81void
82tracing_sched_wakeup_trace(struct trace_array *tr,
83 struct task_struct *wakee,
84 struct task_struct *curr,
85 unsigned long flags, int pc)
86{
87 struct ftrace_event_call *call = &event_wakeup;
88 struct ring_buffer_event *event;
89 struct ctx_switch_entry *entry;
90 struct ring_buffer *buffer = tr->buffer;
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
93 sizeof(*entry), flags, pc);
94 if (!event)
95 return;
96 entry = ring_buffer_event_data(event);
97 entry->prev_pid = curr->pid;
98 entry->prev_prio = curr->prio;
99 entry->prev_state = curr->state;
100 entry->next_pid = wakee->pid;
101 entry->next_prio = wakee->prio;
102 entry->next_state = wakee->state;
103 entry->next_cpu = task_cpu(wakee);
104
105 if (!filter_check_discard(call, entry, buffer, event))
106 ring_buffer_unlock_commit(buffer, event);
107 ftrace_trace_stack(tr->buffer, flags, 6, pc);
108 ftrace_trace_userstack(tr->buffer, flags, pc);
109}
110
52static void 111static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 113{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb27225173..ad69f105a7c6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -186,11 +186,6 @@ out:
186 186
187static void __wakeup_reset(struct trace_array *tr) 187static void __wakeup_reset(struct trace_array *tr)
188{ 188{
189 int cpu;
190
191 for_each_possible_cpu(cpu)
192 tracing_reset(tr, cpu);
193
194 wakeup_cpu = -1; 189 wakeup_cpu = -1;
195 wakeup_prio = -1; 190 wakeup_prio = -1;
196 191
@@ -204,6 +199,8 @@ static void wakeup_reset(struct trace_array *tr)
204{ 199{
205 unsigned long flags; 200 unsigned long flags;
206 201
202 tracing_reset_online_cpus(tr);
203
207 local_irq_save(flags); 204 local_irq_save(flags);
208 __raw_spin_lock(&wakeup_lock); 205 __raw_spin_lock(&wakeup_lock);
209 __wakeup_reset(tr); 206 __wakeup_reset(tr);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
288 * to detect and recover from possible hangs 288 * to detect and recover from possible hangs
289 */ 289 */
290 tracing_reset_online_cpus(tr); 290 tracing_reset_online_cpus(tr);
291 set_graph_array(tr);
291 ret = register_ftrace_graph(&trace_graph_return, 292 ret = register_ftrace_graph(&trace_graph_return,
292 &trace_graph_entry_watchdog); 293 &trace_graph_entry_watchdog);
293 if (ret) { 294 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 6a2a9d484cd6..0f6facb050a1 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
186}; 186};
187 187
188static void * 188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos) 189__next(struct seq_file *m, loff_t *pos)
190{ 190{
191 long i; 191 long n = *pos - 1;
192 192
193 (*pos)++; 193 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201
202 if (i >= max_stack_trace.nr_entries ||
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL; 194 return NULL;
205 195
206 m->private = (void *)i; 196 m->private = (void *)n;
207
208 return &m->private; 197 return &m->private;
209} 198}
210 199
211static void *t_start(struct seq_file *m, loff_t *pos) 200static void *
201t_next(struct seq_file *m, void *v, loff_t *pos)
212{ 202{
213 void *t = SEQ_START_TOKEN; 203 (*pos)++;
214 loff_t l = 0; 204 return __next(m, pos);
205}
215 206
207static void *t_start(struct seq_file *m, loff_t *pos)
208{
216 local_irq_disable(); 209 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock); 210 __raw_spin_lock(&max_stack_lock);
218 211
219 if (*pos == 0) 212 if (*pos == 0)
220 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
221 214
222 for (; t && l < *pos; t = t_next(m, t, &l)) 215 return __next(m, pos);
223 ;
224
225 return t;
226} 216}
227 217
228static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 224static int trace_lookup_stack(struct seq_file *m, long i)
235{ 225{
236 unsigned long addr = stack_dump_trace[i]; 226 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239
240 sprint_symbol(str, addr);
241 227
242 return seq_printf(m, "%s\n", str); 228 return seq_printf(m, "%pF\n", (void *)addr);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 229}
247 230
248static void print_disabled(struct seq_file *m) 231static void print_disabled(struct seq_file *m)
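The stack tracer's seq_file iterator is also simplified: instead of t_start() replaying t_next() from zero up to *pos, both callbacks funnel through one __next() helper that treats *pos - 1 as an index into stack_dump_trace[], with *pos == 0 reserved for the SEQ_START_TOKEN header line. Annotated:

        /* *pos == 0          -> SEQ_START_TOKEN (header)
         * *pos == n, n >= 1  -> stack_dump_trace[n - 1], or NULL past the end
         */
        static void *__next(struct seq_file *m, loff_t *pos)
        {
                long n = *pos - 1;

                if (n >= max_stack_trace.nr_entries ||
                    stack_dump_trace[n] == ULONG_MAX)
                        return NULL;

                m->private = (void *)n;     /* stash the index for the show callback */
                return &m->private;
        }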
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index aea321c82fa0..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
@@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session)
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
@@ -200,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
200{ 203{
201 struct stat_session *session = s->private; 204 struct stat_session *session = s->private;
202 struct rb_node *node; 205 struct rb_node *node;
206 int n = *pos;
203 int i; 207 int i;
204 208
205 /* Prevent from tracer switch or rbtree modification */ 209 /* Prevent from tracer switch or rbtree modification */
206 mutex_lock(&session->stat_mutex); 210 mutex_lock(&session->stat_mutex);
207 211
208 /* If we are in the beginning of the file, print the headers */ 212 /* If we are in the beginning of the file, print the headers */
209 if (!*pos && session->ts->stat_headers) 213 if (session->ts->stat_headers) {
210 return SEQ_START_TOKEN; 214 if (n == 0)
215 return SEQ_START_TOKEN;
216 n--;
217 }
211 218
212 node = rb_first(&session->stat_root); 219 node = rb_first(&session->stat_root);
213 for (i = 0; node && i < *pos; i++) 220 for (i = 0; node && i < n; i++)
214 node = rb_next(node); 221 node = rb_next(node);
215 222
216 return node; 223 return node;
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
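The new stat_release() hook lets a tracer free whatever it attached to each stat entry when the rbtree is torn down; release_next() now receives the tracer_stat and calls it just before kfree()ing the node. A tracer that allocates its entries would wire it up roughly like this (my_stat and my_stat_release are hypothetical; the other callbacks are as declared in trace_stat.h):

        struct my_stat {                 /* hypothetical per-entry data */
                unsigned long hits;
        };

        static void my_stat_release(void *stat)
        {
                kfree(stat);             /* entry was allocated by the tracer */
        }

        static struct tracer_stat my_tracer_stat = {
                .name          = "my_stats",
                .stat_release  = my_stat_release,
                /* .stat_start, .stat_next, .stat_show, .stat_headers as usual */
        };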
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..8712ce3c6a0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,18 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h>
5#include <linux/perf_counter.h>
3#include <asm/syscall.h> 6#include <asm/syscall.h>
4 7
5#include "trace_output.h" 8#include "trace_output.h"
6#include "trace.h" 9#include "trace.h"
7 10
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock); 11static DEFINE_MUTEX(syscall_trace_lock);
13 12static int sys_refcount_enter;
14/* Option to display the parameters types */ 13static int sys_refcount_exit;
15enum { 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16 TRACE_SYSCALLS_OPT_TYPES = 0x1, 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28 16
29enum print_line_t 17enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags) 18print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
35 struct syscall_metadata *entry; 23 struct syscall_metadata *entry;
36 int i, ret, syscall; 24 int i, ret, syscall;
37 25
38 trace_assign_type(trace, ent); 26 trace = (typeof(trace))ent;
39
40 syscall = trace->nr; 27 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall); 28 entry = syscall_nr_to_meta(syscall);
29
43 if (!entry) 30 if (!entry)
44 goto end; 31 goto end;
45 32
33 if (entry->enter_id != ent->type) {
34 WARN_ON_ONCE(1);
35 goto end;
36 }
37
46 ret = trace_seq_printf(s, "%s(", entry->name); 38 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret) 39 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE; 40 return TRACE_TYPE_PARTIAL_LINE;
49 41
50 for (i = 0; i < entry->nb_args; i++) { 42 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */ 43 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { 44 if (trace_flags & TRACE_ITER_VERBOSE) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]); 45 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret) 46 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE; 47 return TRACE_TYPE_PARTIAL_LINE;
56 } 48 }
57 /* parameter values */ 49 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], 50 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
59 trace->args[i], 51 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ","); 52 i == entry->nb_args - 1 ? "" : ", ");
61 if (!ret) 53 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE; 54 return TRACE_TYPE_PARTIAL_LINE;
63 } 55 }
64 56
57 ret = trace_seq_putc(s, ')');
58 if (!ret)
59 return TRACE_TYPE_PARTIAL_LINE;
60
65end: 61end:
66 trace_seq_printf(s, "\n"); 62 ret = trace_seq_putc(s, '\n');
63 if (!ret)
64 return TRACE_TYPE_PARTIAL_LINE;
65
67 return TRACE_TYPE_HANDLED; 66 return TRACE_TYPE_HANDLED;
68} 67}
69 68
@@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
77 struct syscall_metadata *entry; 76 struct syscall_metadata *entry;
78 int ret; 77 int ret;
79 78
80 trace_assign_type(trace, ent); 79 trace = (typeof(trace))ent;
81
82 syscall = trace->nr; 80 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall); 81 entry = syscall_nr_to_meta(syscall);
82
85 if (!entry) { 83 if (!entry) {
86 trace_seq_printf(s, "\n"); 84 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED; 85 return TRACE_TYPE_HANDLED;
88 } 86 }
89 87
88 if (entry->exit_id != ent->type) {
89 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED;
91 }
92
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 93 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret); 94 trace->ret);
92 if (!ret) 95 if (!ret)
@@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
95 return TRACE_TYPE_HANDLED; 98 return TRACE_TYPE_HANDLED;
96} 99}
97 100
98void start_ftrace_syscalls(void) 101extern char *__bad_type_size(void);
102
103#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
107
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
99{ 109{
100 unsigned long flags; 110 int i;
101 struct task_struct *g, *t; 111 int nr;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
102 116
103 mutex_lock(&syscall_trace_lock); 117 nr = syscall_name_to_nr(call->data);
118 entry = syscall_nr_to_meta(nr);
104 119
105 /* Don't enable the flag on the tasks twice */ 120 if (!entry)
106 if (++refcount != 1) 121 return 0;
107 goto unlock;
108 122
109 arch_init_ftrace_syscalls(); 123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
110 read_lock_irqsave(&tasklist_lock, flags); 124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
111 127
112 do_each_thread(g, t) { 128 for (i = 0; i < entry->nb_args; i++) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
114 } while_each_thread(g, t); 130 entry->args[i]);
131 if (!ret)
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 }
115 139
116 read_unlock_irqrestore(&tasklist_lock, flags); 140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
143 sizeof(unsigned long),
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 }
148 trace_seq_putc(s, '"');
117 149
118unlock: 150 for (i = 0; i < entry->nb_args; i++) {
119 mutex_unlock(&syscall_trace_lock); 151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156
157 return trace_seq_putc(s, '\n');
120} 158}
121 159
122void stop_ftrace_syscalls(void) 160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
123{ 161{
124 unsigned long flags; 162 int ret;
125 struct task_struct *g, *t; 163 struct syscall_trace_exit trace;
126 164
127 mutex_lock(&syscall_trace_lock); 165 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret));
170 if (!ret)
171 return 0;
128 172
129 /* There are perhaps still some users */ 173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
130 if (--refcount) 174}
131 goto unlock;
132 175
133 read_lock_irqsave(&tasklist_lock, flags); 176int syscall_enter_define_fields(struct ftrace_event_call *call)
177{
178 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta;
180 int ret;
181 int nr;
182 int i;
183 int offset = offsetof(typeof(trace), args);
184
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret)
193 return ret;
194
195 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset,
198 sizeof(unsigned long), 0,
199 FILTER_OTHER);
200 offset += sizeof(unsigned long);
201 }
134 202
135 do_each_thread(g, t) { 203 return ret;
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 204}
137 } while_each_thread(g, t);
138 205
139 read_unlock_irqrestore(&tasklist_lock, flags); 206int syscall_exit_define_fields(struct ftrace_event_call *call)
207{
208 struct syscall_trace_exit trace;
209 int ret;
140 210
141unlock: 211 ret = trace_define_common_fields(call);
142 mutex_unlock(&syscall_trace_lock); 212 if (ret)
213 return ret;
214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
216 FILTER_OTHER);
217
218 return ret;
143} 219}
144 220
145void ftrace_syscall_enter(struct pt_regs *regs) 221void ftrace_syscall_enter(struct pt_regs *regs, long id)
146{ 222{
147 struct syscall_trace_enter *entry; 223 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data; 224 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event; 225 struct ring_buffer_event *event;
226 struct ring_buffer *buffer;
150 int size; 227 int size;
151 int syscall_nr; 228 int syscall_nr;
152 229
153 syscall_nr = syscall_get_nr(current, regs); 230 syscall_nr = syscall_get_nr(current, regs);
231 if (syscall_nr < 0)
232 return;
233 if (!test_bit(syscall_nr, enabled_enter_syscalls))
234 return;
154 235
155 sys_data = syscall_nr_to_meta(syscall_nr); 236 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data) 237 if (!sys_data)
@@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
158 239
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160 241
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, 242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
162 0, 0); 243 size, 0, 0);
163 if (!event) 244 if (!event)
164 return; 245 return;
165 246
@@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs)
167 entry->nr = syscall_nr; 248 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 249 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169 250
170 trace_current_buffer_unlock_commit(event, 0, 0); 251 if (!filter_current_check_discard(buffer, sys_data->enter_event,
171 trace_wake_up(); 252 entry, event))
253 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
172} 254}
173 255
174void ftrace_syscall_exit(struct pt_regs *regs) 256void ftrace_syscall_exit(struct pt_regs *regs, long ret)
175{ 257{
176 struct syscall_trace_exit *entry; 258 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data; 259 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event; 260 struct ring_buffer_event *event;
261 struct ring_buffer *buffer;
179 int syscall_nr; 262 int syscall_nr;
180 263
181 syscall_nr = syscall_get_nr(current, regs); 264 syscall_nr = syscall_get_nr(current, regs);
265 if (syscall_nr < 0)
266 return;
267 if (!test_bit(syscall_nr, enabled_exit_syscalls))
268 return;
182 269
183 sys_data = syscall_nr_to_meta(syscall_nr); 270 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data) 271 if (!sys_data)
185 return; 272 return;
186 273
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, 274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
188 sizeof(*entry), 0, 0); 275 sizeof(*entry), 0, 0);
189 if (!event) 276 if (!event)
190 return; 277 return;
@@ -193,58 +280,244 @@ void ftrace_syscall_exit(struct pt_regs *regs)
193 entry->nr = syscall_nr; 280 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs); 281 entry->ret = syscall_get_return_value(current, regs);
195 282
196 trace_current_buffer_unlock_commit(event, 0, 0); 283 if (!filter_current_check_discard(buffer, sys_data->exit_event,
197 trace_wake_up(); 284 entry, event))
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
198} 286}
199 287
200static int init_syscall_tracer(struct trace_array *tr) 288int reg_event_syscall_enter(void *ptr)
201{ 289{
202 start_ftrace_syscalls(); 290 int ret = 0;
291 int num;
292 char *name;
293
294 name = (char *)ptr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) {
302 pr_info("event trace: Could not activate "
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++;
307 }
308 mutex_unlock(&syscall_trace_lock);
309 return ret;
310}
311
312void unreg_event_syscall_enter(void *ptr)
313{
314 int num;
315 char *name;
203 316
204 return 0; 317 name = (char *)ptr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls)
320 return;
321 mutex_lock(&syscall_trace_lock);
322 sys_refcount_enter--;
323 clear_bit(num, enabled_enter_syscalls);
324 if (!sys_refcount_enter)
325 unregister_trace_sys_enter(ftrace_syscall_enter);
326 mutex_unlock(&syscall_trace_lock);
205} 327}
206 328
207static void reset_syscall_tracer(struct trace_array *tr) 329int reg_event_syscall_exit(void *ptr)
208{ 330{
209 stop_ftrace_syscalls(); 331 int ret = 0;
210 tracing_reset_online_cpus(tr); 332 int num;
333 char *name;
334
335 name = (char *)ptr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) {
343 pr_info("event trace: Could not activate "
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++;
348 }
349 mutex_unlock(&syscall_trace_lock);
350 return ret;
211} 351}
212 352
213static struct trace_event syscall_enter_event = { 353void unreg_event_syscall_exit(void *ptr)
214 .type = TRACE_SYSCALL_ENTER, 354{
215 .trace = print_syscall_enter, 355 int num;
216}; 356 char *name;
357
358 name = (char *)ptr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls)
361 return;
362 mutex_lock(&syscall_trace_lock);
363 sys_refcount_exit--;
364 clear_bit(num, enabled_exit_syscalls);
365 if (!sys_refcount_exit)
366 unregister_trace_sys_exit(ftrace_syscall_exit);
367 mutex_unlock(&syscall_trace_lock);
368}
217 369
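The syscall tracer no longer toggles TIF_SYSCALL_FTRACE on every task; it hooks the generic sys_enter/sys_exit tracepoints once (refcounted by sys_refcount_enter/exit) and tracks per-syscall enablement in bitmaps that the probes consult with test_bit(). The enable path for a single syscall event boils down to:

        num = syscall_name_to_nr(name);        /* map the event's name to a syscall nr */
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;

        mutex_lock(&syscall_trace_lock);
        if (!sys_refcount_enter)               /* first enabled event hooks the tracepoint */
                ret = register_trace_sys_enter(ftrace_syscall_enter);
        if (!ret) {
                set_bit(num, enabled_enter_syscalls);
                sys_refcount_enter++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;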
218static struct trace_event syscall_exit_event = { 370struct trace_event event_syscall_enter = {
219 .type = TRACE_SYSCALL_EXIT, 371 .trace = print_syscall_enter,
220 .trace = print_syscall_exit,
221}; 372};
222 373
223static struct tracer syscall_tracer __read_mostly = { 374struct trace_event event_syscall_exit = {
224 .name = "syscall", 375 .trace = print_syscall_exit,
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228}; 376};
229 377
230__init int register_ftrace_syscalls(void) 378#ifdef CONFIG_EVENT_PROFILE
379
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
382static int sys_prof_refcount_enter;
383static int sys_prof_refcount_exit;
384
385static void prof_syscall_enter(struct pt_regs *regs, long id)
231{ 386{
232 int ret; 387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data;
389 int syscall_nr;
390 int size;
233 391
234 ret = register_ftrace_event(&syscall_enter_event); 392 syscall_nr = syscall_get_nr(current, regs);
235 if (!ret) { 393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
236 printk(KERN_WARNING "event %d failed to register\n", 394 return;
237 syscall_enter_event.type); 395
238 WARN_ON_ONCE(1); 396 sys_data = syscall_nr_to_meta(syscall_nr);
397 if (!sys_data)
398 return;
399
400 /* get the size after alignment with the u32 buffer size field */
401 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
402 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32);
404
405 do {
406 char raw_data[size];
407
408 /* zero the dead bytes from align to not leak stack to user */
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
410
411 rec = (struct syscall_trace_enter *) raw_data;
412 tracing_generic_entry_update(&rec->ent, 0, 0);
413 rec->ent.type = sys_data->enter_id;
414 rec->nr = syscall_nr;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
416 (unsigned long *)&rec->args);
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
418 } while(0);
419}
420
421int reg_prof_syscall_enter(char *name)
422{
423 int ret = 0;
424 int num;
425
426 num = syscall_name_to_nr(name);
427 if (num < 0 || num >= NR_syscalls)
428 return -ENOSYS;
429
430 mutex_lock(&syscall_trace_lock);
431 if (!sys_prof_refcount_enter)
432 ret = register_trace_sys_enter(prof_syscall_enter);
433 if (ret) {
434 pr_info("event trace: Could not activate"
435 "syscall entry trace point");
436 } else {
437 set_bit(num, enabled_prof_enter_syscalls);
438 sys_prof_refcount_enter++;
239 } 439 }
440 mutex_unlock(&syscall_trace_lock);
441 return ret;
442}
240 443
241 ret = register_ftrace_event(&syscall_exit_event); 444void unreg_prof_syscall_enter(char *name)
242 if (!ret) { 445{
243 printk(KERN_WARNING "event %d failed to register\n", 446 int num;
244 syscall_exit_event.type); 447
245 WARN_ON_ONCE(1); 448 num = syscall_name_to_nr(name);
449 if (num < 0 || num >= NR_syscalls)
450 return;
451
452 mutex_lock(&syscall_trace_lock);
453 sys_prof_refcount_enter--;
454 clear_bit(num, enabled_prof_enter_syscalls);
455 if (!sys_prof_refcount_enter)
456 unregister_trace_sys_enter(prof_syscall_enter);
457 mutex_unlock(&syscall_trace_lock);
458}
459
460static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{
462 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec;
464 int syscall_nr;
465
466 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
468 return;
469
470 sys_data = syscall_nr_to_meta(syscall_nr);
471 if (!sys_data)
472 return;
473
474 tracing_generic_entry_update(&rec.ent, 0, 0);
475 rec.ent.type = sys_data->exit_id;
476 rec.nr = syscall_nr;
477 rec.ret = syscall_get_return_value(current, regs);
478
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
480}
481
482int reg_prof_syscall_exit(char *name)
483{
484 int ret = 0;
485 int num;
486
487 num = syscall_name_to_nr(name);
488 if (num < 0 || num >= NR_syscalls)
489 return -ENOSYS;
490
491 mutex_lock(&syscall_trace_lock);
492 if (!sys_prof_refcount_exit)
493 ret = register_trace_sys_exit(prof_syscall_exit);
494 if (ret) {
495 pr_info("event trace: Could not activate"
496 "syscall entry trace point");
497 } else {
498 set_bit(num, enabled_prof_exit_syscalls);
499 sys_prof_refcount_exit++;
246 } 500 }
501 mutex_unlock(&syscall_trace_lock);
502 return ret;
503}
247 504
248 return register_tracer(&syscall_tracer); 505void unreg_prof_syscall_exit(char *name)
506{
507 int num;
508
509 num = syscall_name_to_nr(name);
510 if (num < 0 || num >= NR_syscalls)
511 return;
512
513 mutex_lock(&syscall_trace_lock);
514 sys_prof_refcount_exit--;
515 clear_bit(num, enabled_prof_exit_syscalls);
516 if (!sys_prof_refcount_exit)
517 unregister_trace_sys_exit(prof_syscall_exit);
518 mutex_unlock(&syscall_trace_lock);
249} 519}
250device_initcall(register_ftrace_syscalls); 520
521#endif
522
523
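[Note] The trace_syscalls.c hunk above drops the old standalone "syscall" tracer and registers each syscall event individually: every reg/unreg helper takes syscall_trace_lock, flips a bit in an enabled_* bitmap, and only attaches (or detaches) the sys_enter/sys_exit tracepoint probe when the refcount crosses zero. The following user-space C sketch is not kernel code and uses hypothetical names; it only illustrates that refcount-plus-bitmap pattern in isolation.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_EVENTS 64

static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
static bool enabled[NR_EVENTS];  /* analogous to enabled_enter_syscalls */
static int refcount;             /* analogous to sys_refcount_enter */

/* stand-ins for register_trace_sys_enter()/unregister_trace_sys_enter() */
static void attach_probe(void) { puts("probe attached"); }
static void detach_probe(void) { puts("probe detached"); }

int reg_event(int num)
{
	if (num < 0 || num >= NR_EVENTS)
		return -1;
	pthread_mutex_lock(&trace_lock);
	if (!refcount)
		attach_probe();      /* only the first user attaches the probe */
	enabled[num] = true;
	refcount++;
	pthread_mutex_unlock(&trace_lock);
	return 0;
}

void unreg_event(int num)
{
	if (num < 0 || num >= NR_EVENTS)
		return;
	pthread_mutex_lock(&trace_lock);
	refcount--;
	enabled[num] = false;
	if (!refcount)
		detach_probe();      /* the last user tears the probe down */
	pthread_mutex_unlock(&trace_lock);
}

int main(void)
{
	reg_event(3);
	reg_event(7);   /* probe stays attached; only the bitmap changes */
	unreg_event(3);
	unreg_event(7); /* refcount hits zero, probe is detached */
	return 0;
}

(The kernel version additionally leaves the bit clear when registration fails, which the sketch omits.)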
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
 #include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
+#include <linux/kref.h>
 #include "trace_stat.h"
 #include "trace.h"
 
@@ -16,6 +17,7 @@
 /* A cpu workqueue thread */
 struct cpu_workqueue_stats {
 	struct list_head list;
+	struct kref kref;
 	int cpu;
 	pid_t pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
 static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
 #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
 
+static void cpu_workqueue_stat_free(struct kref *kref)
+{
+	kfree(container_of(kref, struct cpu_workqueue_stats, kref));
+}
+
 /* Insertion of a work */
 static void
 probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 		return;
 	}
 	INIT_LIST_HEAD(&cws->list);
+	kref_init(&cws->kref);
 	cws->cpu = cpu;
-
 	cws->pid = wq_thread->pid;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
 			 list) {
 		if (node->pid == wq_thread->pid) {
 			list_del(&node->list);
-			kfree(node);
+			kref_put(&node->kref, cpu_workqueue_stat_free);
 			goto found;
 		}
 	}
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 
-	if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+	if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
 		ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
 				 struct cpu_workqueue_stats, list);
+		kref_get(&ret->kref);
+	}
 
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
 static void *workqueue_stat_next(void *prev, int idx)
 {
 	struct cpu_workqueue_stats *prev_cws = prev;
+	struct cpu_workqueue_stats *ret;
 	int cpu = prev_cws->cpu;
 	unsigned long flags;
-	void *ret = NULL;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
 			return NULL;
 		} while (!(ret = workqueue_stat_start_cpu(cpu)));
 		return ret;
+	} else {
+		ret = list_entry(prev_cws->list.next,
+				 struct cpu_workqueue_stats, list);
+		kref_get(&ret->kref);
 	}
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
-	return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
-			  list);
+	return ret;
 }
 
 static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	return 0;
 }
 
+static void workqueue_stat_release(void *stat)
+{
+	struct cpu_workqueue_stats *node = stat;
+
+	kref_put(&node->kref, cpu_workqueue_stat_free);
+}
+
 static int workqueue_stat_headers(struct seq_file *s)
 {
 	seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
 	.stat_start = workqueue_stat_start,
 	.stat_next = workqueue_stat_next,
 	.stat_show = workqueue_stat_show,
+	.stat_release = workqueue_stat_release,
 	.stat_headers = workqueue_stat_headers
 };
 
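[Note] The trace_workqueue.c hunks above close a lifetime race: stat iteration now takes a reference on each cpu_workqueue_stats entry with kref_get(), and the destruction probe drops its reference with kref_put() instead of calling kfree() directly, so whichever side holds the last reference performs the free via cpu_workqueue_stat_free(). A minimal user-space sketch of the same get/put-with-release pattern, using a hypothetical node type and C11 atomics rather than the kernel's struct kref:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	atomic_int refs;  /* plays the role of struct kref */
	int cpu;
};

static struct node *node_alloc(int cpu)
{
	struct node *n = malloc(sizeof(*n));
	if (!n)
		abort();
	atomic_init(&n->refs, 1);  /* like kref_init(): count starts at one */
	n->cpu = cpu;
	return n;
}

static void node_get(struct node *n)
{
	atomic_fetch_add(&n->refs, 1);            /* like kref_get() */
}

static void node_put(struct node *n)
{
	if (atomic_fetch_sub(&n->refs, 1) == 1) { /* like kref_put() */
		printf("freeing node for cpu %d\n", n->cpu);
		free(n);                          /* the release callback */
	}
}

int main(void)
{
	struct node *n = node_alloc(0);
	node_get(n);   /* a reader (stat_start/stat_next) takes a reference */
	node_put(n);   /* destruction probe drops its reference ...        */
	node_put(n);   /* ... but the node survives until the reader is done */
	return 0;
}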
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1ef5d3a601c7..9489a0a9b1be 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
 #include <linux/tracepoint.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 extern struct tracepoint __start___tracepoints[];
 extern struct tracepoint __stop___tracepoints[];
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 {
 	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
 
+	if (elem->regfunc && !elem->state && active)
+		elem->regfunc();
+	else if (elem->unregfunc && elem->state && !active)
+		elem->unregfunc();
+
 	/*
 	 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
 	 * probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
  */
 static void disable_tracepoint(struct tracepoint *elem)
 {
+	if (elem->unregfunc && elem->state)
+		elem->unregfunc();
+
 	elem->state = 0;
 	rcu_assign_pointer(elem->funcs, NULL);
 }
@@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
 
 	switch (val) {
 	case MODULE_STATE_COMING:
-		tracepoint_update_probe_range(mod->tracepoints,
-			mod->tracepoints + mod->num_tracepoints);
-		break;
 	case MODULE_STATE_GOING:
 		tracepoint_update_probe_range(mod->tracepoints,
 			mod->tracepoints + mod->num_tracepoints);
@@ -577,3 +583,41 @@ static int init_tracepoints(void)
 __initcall(init_tracepoints);
 
 #endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+static int sys_tracepoint_refcount;
+
+void syscall_regfunc(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	if (!sys_tracepoint_refcount) {
+		read_lock_irqsave(&tasklist_lock, flags);
+		do_each_thread(g, t) {
+			/* Skip kernel threads. */
+			if (t->mm)
+				set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+		} while_each_thread(g, t);
+		read_unlock_irqrestore(&tasklist_lock, flags);
+	}
+	sys_tracepoint_refcount++;
+}
+
+void syscall_unregfunc(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	sys_tracepoint_refcount--;
+	if (!sys_tracepoint_refcount) {
+		read_lock_irqsave(&tasklist_lock, flags);
+		do_each_thread(g, t) {
+			clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+		} while_each_thread(g, t);
+		read_unlock_irqrestore(&tasklist_lock, flags);
+	}
+}
+#endif
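[Note] The tracepoint.c changes give a tracepoint optional regfunc/unregfunc hooks (invoked on the first enable and last disable in set_tracepoint()/disable_tracepoint()) and, under CONFIG_HAVE_SYSCALL_TRACEPOINTS, use them so that syscall_regfunc() walks every user task and sets TIF_SYSCALL_TRACEPOINT on the first listener, while the last listener clears the flag again. A user-space sketch of that first-on/last-off flagging scheme over a toy task list (all names hypothetical, not kernel API):

#include <stdbool.h>
#include <stdio.h>

struct task { const char *name; bool kernel_thread; bool trace_flag; };

static struct task tasks[] = {
	{ "init",     false, false },
	{ "kthreadd", true,  false },
	{ "bash",     false, false },
};
#define NTASKS (sizeof(tasks) / sizeof(tasks[0]))

static int refcount;  /* mirrors sys_tracepoint_refcount */

void regfunc(void)
{
	if (!refcount) {
		for (unsigned i = 0; i < NTASKS; i++)
			if (!tasks[i].kernel_thread)  /* skip kernel threads */
				tasks[i].trace_flag = true;
	}
	refcount++;
}

void unregfunc(void)
{
	refcount--;
	if (!refcount)
		for (unsigned i = 0; i < NTASKS; i++)
			tasks[i].trace_flag = false;
}

int main(void)
{
	regfunc();
	regfunc();    /* second listener: flags already set, just count it */
	unregfunc();
	unregfunc();  /* last listener gone: every flag is cleared again */
	for (unsigned i = 0; i < NTASKS; i++)
		printf("%-10s flag=%d\n", tasks[i].name, tasks[i].trace_flag);
	return 0;
}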
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 911ba7ffab84..090d300d7394 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -57,7 +57,6 @@
 # call mcount (offset: 0x5)
 # [...]
 # ret
-# .globl my_func
 # other_func:
 # [...]
 # call mcount (offset: 0x1b)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 52219d5a507b..a587d41ae3c9 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -653,7 +653,7 @@ static void print_tracepoint_events(void)
 	for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
 		snprintf(evt_path, MAXPATHLEN, "%s:%s",
 			 sys_dirent.d_name, evt_dirent.d_name);
-		fprintf(stderr, " %-40s [%s]\n", evt_path,
+		fprintf(stderr, " %-42s [%s]\n", evt_path,
 			event_type_descriptors[PERF_TYPE_TRACEPOINT+1]);
 	}
 	closedir(evt_dir);
@@ -687,7 +687,7 @@ void print_events(void)
 			sprintf(name, "%s OR %s", syms->symbol, syms->alias);
 		else
 			strcpy(name, syms->symbol);
-		fprintf(stderr, " %-40s [%s]\n", name,
+		fprintf(stderr, " %-42s [%s]\n", name,
 			event_type_descriptors[type]);
 
 		prev_type = type;
@@ -701,7 +701,7 @@ void print_events(void)
 				continue;
 
 			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
-				fprintf(stderr, " %-40s [%s]\n",
+				fprintf(stderr, " %-42s [%s]\n",
 					event_cache_name(type, op, i),
 					event_type_descriptors[4]);
 			}
@@ -709,7 +709,7 @@ void print_events(void)
 	}
 
 	fprintf(stderr, "\n");
-	fprintf(stderr, " %-40s [raw hardware event descriptor]\n",
+	fprintf(stderr, " %-42s [raw hardware event descriptor]\n",
 		"rNNN");
 	fprintf(stderr, "\n");
 
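[Note] The parse-events.c hunks only widen the event-name column in the `perf list` output from %-40s to %-42s, presumably so that the longer tracepoint names added by this merge still leave the bracketed descriptions aligned. A two-line C illustration of what the left-justified width specifier does (sample strings are hypothetical):

#include <stdio.h>

int main(void)
{
	/* each name is padded to 42 columns, so the bracketed
	 * descriptions start in the same column on every line */
	printf(" %-42s [%s]\n", "syscalls:sys_enter_open", "Tracepoint event");
	printf(" %-42s [%s]\n", "cpu-cycles OR cycles", "Hardware event");
	return 0;
}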