author     Changbin Du <changbin.du@intel.com>       2018-02-17 00:39:37 -0500
committer  Jonathan Corbet <corbet@lwn.net>          2018-03-07 12:23:06 -0500
commit     8fa4e720e8d919271cdf0da3c0856333246398a4 (patch)
tree       171aeabff773c8398647f0f133ee510ce9fec920 /Documentation/trace/tracepoint-analysis.txt
parent     b3fdd1f92c1a12b2feda08fcad8ef29a40759bd2 (diff)
trace doc: convert trace/tracepoint-analysis.txt to rst format
This converts the plain-text documentation to reStructuredText format and
adds it to the Sphinx TOC tree. No essential content change.
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Changbin Du <changbin.du@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Diffstat (limited to 'Documentation/trace/tracepoint-analysis.txt')
-rw-r--r--  Documentation/trace/tracepoint-analysis.txt  327
1 file changed, 0 insertions, 327 deletions
diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt
deleted file mode 100644
index 058cc6c9dc56..000000000000
--- a/Documentation/trace/tracepoint-analysis.txt
+++ /dev/null
@@ -1,327 +0,0 @@
Notes on Analysing Behaviour Using Events and Tracepoints

Documentation written by Mel Gorman
PCL information heavily based on email from Ingo Molnar

1. Introduction
===============

Tracepoints (see Documentation/trace/tracepoints.txt) can be used without
creating custom kernel modules to register probe functions using the event
tracing infrastructure.

Simplistically, tracepoints represent important events that can be
examined in conjunction with other tracepoints to build a "Big Picture" of
what is going on within the system. There are a large number of methods for
gathering and interpreting these events. Lacking any current Best Practices,
this document describes some of the methods that can be used.

This document assumes that debugfs is mounted on /sys/kernel/debug and that
the appropriate tracing options have been configured into the kernel. It is
assumed that the PCL tool tools/perf has been installed and is in your path.

2. Listing Available Events
===========================

2.1 Standard Utilities
----------------------

All possible events are visible from /sys/kernel/debug/tracing/events. Simply
calling

  $ find /sys/kernel/debug/tracing/events -type d

will give a fair indication of the number of events available.

2.2 PCL (Performance Counters for Linux)
----------------------------------------

Discovery and enumeration of all counters and events, including tracepoints,
are available with the perf tool. Getting a list of available events is a
simple case of:

  $ perf list 2>&1 | grep Tracepoint
  ext4:ext4_free_inode                     [Tracepoint event]
  ext4:ext4_request_inode                  [Tracepoint event]
  ext4:ext4_allocate_inode                 [Tracepoint event]
  ext4:ext4_write_begin                    [Tracepoint event]
  ext4:ext4_ordered_write_end              [Tracepoint event]
  [ .... remaining output snipped .... ]


3. Enabling Events
==================

3.1 System-Wide Event Enabling
------------------------------

See Documentation/trace/events.txt for a proper description of how events
can be enabled system-wide. A short example of enabling all events related
to page allocation would look something like:

  $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done

3.2 System-Wide Event Enabling with SystemTap
---------------------------------------------

In SystemTap, tracepoints are accessible using the kernel.trace() function
call. The following is an example that reports every 5 seconds which
processes were allocating pages.

  global page_allocs

  probe kernel.trace("mm_page_alloc") {
          page_allocs[execname()]++
  }

  function print_count() {
          printf ("%-25s %-s\n", "#Pages Allocated", "Process Name")
          foreach (proc in page_allocs-)
                  printf("%-25d %s\n", page_allocs[proc], proc)
          printf ("\n")
          delete page_allocs
  }

  probe timer.s(5) {
          print_count()
  }
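
The script above can be saved to a file and run with the stap front-end,
which compiles and loads it, printing the report every 5 seconds until
interrupted. A minimal usage sketch (the file name page_allocs.stp is
arbitrary):

  $ stap page_allocs.stp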

3.3 System-Wide Event Enabling with PCL
---------------------------------------

By specifying the -a switch and profiling sleep, system-wide events
can be examined for a fixed duration.

  $ perf stat -a \
        -e kmem:mm_page_alloc -e kmem:mm_page_free \
        -e kmem:mm_page_free_batched \
        sleep 10

  Performance counter stats for 'sleep 10':

            9630  kmem:mm_page_alloc
            2143  kmem:mm_page_free
            7424  kmem:mm_page_free_batched

    10.002577764  seconds time elapsed

Similarly, one could execute a shell and exit it as desired to get a report
at that point.
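
As a sketch of that approach, the following counts page allocations
system-wide until the spawned shell exits, at which point perf prints its
report (any command can stand in for the shell):

  $ perf stat -a -e kmem:mm_page_alloc /bin/sh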

3.4 Local Event Enabling
------------------------

Documentation/trace/ftrace.txt describes how to enable events on a per-thread
basis using set_ftrace_pid.
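
As a minimal sketch of that interface, writing a PID to set_ftrace_pid
restricts tracing to that thread (the PID 12345 below is hypothetical; see
ftrace.txt for the authoritative details of which tracers honour this file):

  $ echo 12345 > /sys/kernel/debug/tracing/set_ftrace_pid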

3.5 Local Event Enablement with PCL
-----------------------------------

Events can be activated and tracked for the duration of a process on a local
basis using PCL as follows.

  $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \
        -e kmem:mm_page_free_batched ./hackbench 10
  Time: 0.909

  Performance counter stats for './hackbench 10':

           17803  kmem:mm_page_alloc
           12398  kmem:mm_page_free
            4827  kmem:mm_page_free_batched

     0.973913387  seconds time elapsed

4. Event Filtering
==================

Documentation/trace/ftrace.txt covers in depth how to filter events in
ftrace. Using grep and awk on trace_pipe is an option as well, as is any
script that reads trace_pipe.
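
As a sketch of both approaches: each event directory exposes a 'filter'
file that accepts expressions over the event's fields (the 'order' field of
kmem:mm_page_alloc is assumed here), and trace_pipe can simply be piped
through grep:

  $ echo 'order > 0' > /sys/kernel/debug/tracing/events/kmem/mm_page_alloc/filter
  $ cat /sys/kernel/debug/tracing/trace_pipe | grep mm_page_alloc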

5. Analysing Event Variances with PCL
=====================================

Any workload can exhibit variances between runs and it can be important
to know what the standard deviation is. By and large, this is left to the
performance analyst to do by hand. In the event that the discrete event
occurrences are useful to the performance analyst, then perf can be used.

  $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free \
        -e kmem:mm_page_free_batched ./hackbench 10
  Time: 0.890
  Time: 0.895
  Time: 0.915
  Time: 1.001
  Time: 0.899

  Performance counter stats for './hackbench 10' (5 runs):

           16630  kmem:mm_page_alloc         ( +-   3.542% )
           11486  kmem:mm_page_free          ( +-   4.771% )
            4730  kmem:mm_page_free_batched  ( +-   2.325% )

     0.982653002  seconds time elapsed   ( +-   1.448% )

In the event that some higher-level event is required that depends on some
aggregation of discrete events, then a script would need to be developed.

Using --repeat, it is also possible to view how events fluctuate over
time on a system-wide basis using -a and sleep.

  $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \
        -e kmem:mm_page_free_batched \
        -a --repeat 10 \
        sleep 1

  Performance counter stats for 'sleep 1' (10 runs):

            1066  kmem:mm_page_alloc         ( +-  26.148% )
             182  kmem:mm_page_free          ( +-   5.464% )
             890  kmem:mm_page_free_batched  ( +-  30.079% )

     1.002251757  seconds time elapsed   ( +-   0.005% )

6. Higher-Level Analysis with Helper Scripts
============================================

When events are enabled, the events that are triggering can be read from
/sys/kernel/debug/tracing/trace_pipe in human-readable format, although binary
options exist as well. By post-processing the output, further information can
be gathered on-line as appropriate. Examples of post-processing might include

  o Reading information from /proc for the PID that triggered the event
  o Deriving a higher-level event from a series of lower-level events
  o Calculating latencies between two events
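
As a minimal sketch of the counting approach, a snapshot from the 'trace'
file can be tallied per task with awk (the first column of each record is
assumed to be the comm-pid pair; header lines start with '#'):

  $ cat /sys/kernel/debug/tracing/trace > /tmp/trace.snapshot
  $ awk '/^#/ { next } { count[$1]++ }
         END { for (p in count) print count[p], p }' /tmp/trace.snapshot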

Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example
script that can read trace_pipe from STDIN or a copy of a trace. When used
on-line, it can be interrupted once to generate a report without exiting
and twice to exit.

Simplistically, the script just reads STDIN and counts up events, but it
can also do more, such as

  o Derive high-level events from many low-level events. If a number of pages
    are freed to the main allocator from the per-CPU lists, it recognises
    that as one per-CPU drain even though there is no specific tracepoint
    for that event
  o Aggregate based on PID or individual process number
  o In the event memory is getting externally fragmented, report
    on whether the fragmentation event was severe or moderate
  o When receiving an event about a PID, record who the parent was so
    that if large numbers of events are coming from very short-lived
    processes, the parent process responsible for creating all the helpers
    can be identified

7. Lower-Level Analysis with PCL
================================

There may also be a requirement to identify what functions within a program
were generating events within the kernel. To begin this sort of analysis, the
data must be recorded. At the time of writing, this required root:

  $ perf record -c 1 \
        -e kmem:mm_page_alloc -e kmem:mm_page_free \
        -e kmem:mm_page_free_batched \
        ./hackbench 10
  Time: 0.894
  [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ]

Note the use of '-c 1' to set the event period to sample. The default sample
period is quite high to minimise overhead, but the information collected can
be very coarse as a result.

This record produced a file called perf.data which can be analysed using
perf report.

  $ perf report
  # Samples: 30922
  #
  # Overhead    Command                     Shared Object
  # ........  .........  ................................
  #
      87.27%  hackbench  [vdso]
       6.85%  hackbench  /lib/i686/cmov/libc-2.9.so
       2.62%  hackbench  /lib/ld-2.9.so
       1.52%       perf  [vdso]
       1.22%  hackbench  ./hackbench
       0.48%  hackbench  [kernel]
       0.02%       perf  /lib/i686/cmov/libc-2.9.so
       0.01%       perf  /usr/bin/perf
       0.01%       perf  /lib/ld-2.9.so
       0.00%  hackbench  /lib/i686/cmov/libpthread-2.9.so
  #
  # (For more details, try: perf report --sort comm,dso,symbol)
  #

According to this, the vast majority of events triggered on events
within the VDSO. With simple binaries, this will often be the case, so let's
take a slightly different example. In the course of writing this, it was
noticed that X was generating an insane number of page allocations, so let's
look at it:

  $ perf record -c 1 -f \
        -e kmem:mm_page_alloc -e kmem:mm_page_free \
        -e kmem:mm_page_free_batched \
        -p `pidof X`

This was interrupted after a few seconds and

  $ perf report
  # Samples: 27666
  #
  # Overhead  Command                            Shared Object
  # ........  .......  .......................................
  #
      51.95%     Xorg  [vdso]
      47.95%     Xorg  /opt/gfx-test/lib/libpixman-1.so.0.13.1
       0.09%     Xorg  /lib/i686/cmov/libc-2.9.so
       0.01%     Xorg  [kernel]
  #
  # (For more details, try: perf report --sort comm,dso,symbol)
  #

So, almost half of the events are occurring in a library. To get an idea of
which symbol:

  $ perf report --sort comm,dso,symbol
  # Samples: 27666
  #
  # Overhead  Command                            Shared Object  Symbol
  # ........  .......  .......................................  ......
  #
      51.95%     Xorg  [vdso]                                   [.] 0x000000ffffe424
      47.93%     Xorg  /opt/gfx-test/lib/libpixman-1.so.0.13.1  [.] pixmanFillsse2
       0.09%     Xorg  /lib/i686/cmov/libc-2.9.so               [.] _int_malloc
       0.01%     Xorg  /opt/gfx-test/lib/libpixman-1.so.0.13.1  [.] pixman_region32_copy_f
       0.01%     Xorg  [kernel]                                 [k] read_hpet
       0.01%     Xorg  /opt/gfx-test/lib/libpixman-1.so.0.13.1  [.] get_fast_path
       0.00%     Xorg  [kernel]                                 [k] ftrace_trace_userstack

To see where within the function pixmanFillsse2 things are going wrong:

  $ perf annotate pixmanFillsse2
  [ ... ]
    0.00 :  34eeb: 0f 18 08                prefetcht0 (%eax)
         :      }
         :
         :      extern __inline void __attribute__((__gnu_inline__, __always_inline__, _
         :      _mm_store_si128 (__m128i *__P, __m128i __B)
         :      {
         :        *__P = __B;
   12.40 :  34eee: 66 0f 7f 80 40 ff ff    movdqa %xmm0,-0xc0(%eax)
    0.00 :  34ef5: ff
   12.40 :  34ef6: 66 0f 7f 80 50 ff ff    movdqa %xmm0,-0xb0(%eax)
    0.00 :  34efd: ff
   12.39 :  34efe: 66 0f 7f 80 60 ff ff    movdqa %xmm0,-0xa0(%eax)
    0.00 :  34f05: ff
   12.67 :  34f06: 66 0f 7f 80 70 ff ff    movdqa %xmm0,-0x90(%eax)
    0.00 :  34f0d: ff
   12.58 :  34f0e: 66 0f 7f 40 80          movdqa %xmm0,-0x80(%eax)
   12.31 :  34f13: 66 0f 7f 40 90          movdqa %xmm0,-0x70(%eax)
   12.40 :  34f18: 66 0f 7f 40 a0          movdqa %xmm0,-0x60(%eax)
   12.31 :  34f1d: 66 0f 7f 40 b0          movdqa %xmm0,-0x50(%eax)

At a glance, it looks like the time is being spent copying pixmaps to
the card. Further investigation would be needed to determine why pixmaps
are being copied around so much, but a starting point would be to take the
ancient build of libpixman out of the library path, where it had sat
totally forgotten for months!