author		Steven Rostedt (Red Hat) <rostedt@goodmis.org>	2016-06-23 12:45:36 -0400
committer	Steven Rostedt <rostedt@goodmis.org>	2016-09-02 12:47:51 -0400
commit		e7c15cd8a113335cf7154f027c9c8da1a92238ee
tree		b6f7c3ac217af4203489e874c359c409d63f7c27
parent		8861dd303cba879bae9a9dcee74042fb642bf03b
tracing: Added hardware latency tracer
The hardware latency tracer has been in the PREEMPT_RT patch for some time.
It is used to detect possible SMIs or any other hardware interruptions that
the kernel is unaware of. Note, NMIs may also be detected, and those are
worth noting as well.

The logic is pretty simple. It creates a thread that spins on a single CPU
for a specified amount of time (width) within a periodic window (window).
These numbers may be adjusted by their corresponding names in

  /sys/kernel/tracing/hwlat_detector/

The defaults are:

  window = 1000000 us (1 second)
  width  =  500000 us (1/2 second)

The loop consists of:

  t1 = trace_clock_local();
  t2 = trace_clock_local();

where trace_clock_local() is a variant of sched_clock(). The difference
t2 - t1 is recorded as the "inner" timestamp, and t1 - prev_t2 is recorded
as the "outer" timestamp. If either of these differences is greater than the
time denoted in /sys/kernel/tracing/tracing_thresh, the event is recorded.

When this tracer is started and tracing_thresh is zero, it changes to the
default threshold of 10 us.

The hwlat tracer in the PREEMPT_RT patch was originally written by
Jon Masters. I have modified it quite a bit and turned it into a tracer.

Based-on-code-by: Jon Masters <jcm@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
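To make the inner/outer measurement concrete, here is a minimal userspace
sketch of the same sampling idea (not part of this patch). It substitutes
clock_gettime(CLOCK_MONOTONIC) for trace_clock_local(), and the variable
names and printed summary are illustrative; unlike the kernel thread, a
userspace loop cannot disable interrupts, so it will also see scheduler
noise:

  /*
   * Userspace sketch of the hwlat inner/outer loop. Illustrative only;
   * the in-kernel version runs with interrupts disabled.
   */
  #include <stdio.h>
  #include <stdint.h>
  #include <time.h>

  static uint64_t now_ns(void)
  {
          struct timespec ts;

          clock_gettime(CLOCK_MONOTONIC, &ts);
          return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
  }

  int main(void)
  {
          uint64_t start, t1, t2, last_t2 = 0;
          uint64_t inner_max = 0, outer_max = 0, diff;
          const uint64_t thresh_us = 10;     /* default threshold */
          const uint64_t width_us = 500000;  /* default sample width */

          start = now_ns();
          do {
                  t1 = now_ns();  /* look for a discontinuity */
                  t2 = now_ns();

                  if (last_t2) {
                          /* "outer": end of last pass to start of this one */
                          diff = (t1 - last_t2) / 1000;
                          if (diff > outer_max)
                                  outer_max = diff;
                  }
                  last_t2 = t2;

                  /* "inner": between the two back-to-back reads */
                  diff = (t2 - t1) / 1000;
                  if (diff > inner_max)
                          inner_max = diff;
          } while ((t2 - start) / 1000 <= width_us);

          if (inner_max > thresh_us || outer_max > thresh_us)
                  printf("latency: inner %llu us, outer %llu us\n",
                         (unsigned long long)inner_max,
                         (unsigned long long)outer_max);
          return 0;
  }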
 kernel/trace/Kconfig         |  35 +
 kernel/trace/Makefile        |   1 +
 kernel/trace/trace.c         |   2 +-
 kernel/trace/trace.h         |   3 +
 kernel/trace/trace_entries.h |  23 +
 kernel/trace/trace_hwlat.c   | 527 ++++++
 kernel/trace/trace_output.c  |  52 +
 7 files changed, 642 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f4b86e8ca1e7..72c07c2ffd79 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -221,6 +221,41 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
+config HWLAT_TRACER
+	bool "Tracer to detect hardware latencies (like SMIs)"
+	select GENERIC_TRACER
+	help
+	 This tracer, when enabled, will create one or more kernel threads,
+	 depending on what the cpumask file is set to, with each thread
+	 spinning in a loop looking for interruptions caused by
+	 something other than the kernel. For example, if a
+	 System Management Interrupt (SMI) takes a noticeable amount of
+	 time, this tracer will detect it. This is useful for testing
+	 whether a system is reliable for Real Time tasks.
+
+	 Some files are created in the tracing directory when this
+	 is enabled:
+
+	   hwlat_detector/width  - time in usecs for how long to spin for
+	   hwlat_detector/window - time in usecs between the start of each
+				    iteration
+
+	 A kernel thread is created that will spin with interrupts disabled
+	 for "width" microseconds in every "window" cycle. It will not spin
+	 for the remaining "window - width" microseconds, during which the
+	 system can continue to operate.
+
+	 The output will appear in the trace and trace_pipe files.
+
+	 When the tracer is not running, it has no effect on the system,
+	 but when it is running, it can cause the system to be
+	 periodically non-responsive. Do not run this tracer on a
+	 production system.
+
+	 To enable this tracer, echo "hwlat" into the current_tracer
+	 file. Every time a latency is greater than tracing_thresh, it will
+	 be recorded into the ring buffer.
+
 config ENABLE_DEFAULT_TRACERS
 	bool "Trace process context switches and events"
 	depends on !GENERIC_TRACER
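As a usage sketch to go with the help text above (assumptions: tracefs
mounted at /sys/kernel/tracing, root privileges, and the write_str()
helper name; a shell "echo hwlat > current_tracer" does the same job):

  /* Illustrative only: drive the hwlat tracer files from C. */
  #include <stdio.h>
  #include <stdlib.h>

  static void write_str(const char *path, const char *val)
  {
          FILE *f = fopen(path, "w");

          if (!f) {
                  perror(path);
                  exit(EXIT_FAILURE);
          }
          fputs(val, f);
          fclose(f);
  }

  int main(void)
  {
          /* spin 500000us out of every 1000000us window (the defaults) */
          write_str("/sys/kernel/tracing/hwlat_detector/width", "500000");
          write_str("/sys/kernel/tracing/hwlat_detector/window", "1000000");
          /* start the tracer; latencies above tracing_thresh are recorded */
          write_str("/sys/kernel/tracing/current_tracer", "hwlat");
          return 0;
  }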
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d0a1617b52b4..992ab9d99f35 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dade4c9559cc..474cc814e16d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void)
  *
  * Shows real state of the ring buffer if it is enabled or not.
  */
-static int tracer_tracing_is_on(struct trace_array *tr)
+int tracer_tracing_is_on(struct trace_array *tr)
 {
 	if (tr->trace_buffer.buffer)
 		return ring_buffer_record_is_on(tr->trace_buffer.buffer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f783df416726..1d866b0c1567 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,6 +38,7 @@ enum trace_type {
 	TRACE_USER_STACK,
 	TRACE_BLK,
 	TRACE_BPUTS,
+	TRACE_HWLAT,
 
 	__TRACE_LAST_TYPE,
 };
@@ -326,6 +327,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
 		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\
+		IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT);	\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
@@ -571,6 +573,7 @@ void tracing_reset_current(int cpu);
 void tracing_reset_all_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
+int tracer_tracing_is_on(struct trace_array *tr);
 struct dentry *trace_create_file(const char *name,
 				 umode_t mode,
 				 struct dentry *parent,
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 5c30efcda5e6..70d47dd99359 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -322,3 +322,26 @@ FTRACE_ENTRY(branch, trace_branch,
 	FILTER_OTHER
 );
 
+
+FTRACE_ENTRY(hwlat, hwlat_entry,
+
+	TRACE_HWLAT,
+
+	F_STRUCT(
+		__field(	u64,			duration	)
+		__field(	u64,			outer_duration	)
+		__field_struct(	struct timespec,	timestamp	)
+		__field_desc(	long,	timestamp,	tv_sec		)
+		__field_desc(	long,	timestamp,	tv_nsec		)
+		__field(	unsigned int,		seqnum		)
+	),
+
+	F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\n",
+		 __entry->seqnum,
+		 __entry->tv_sec,
+		 __entry->tv_nsec,
+		 __entry->duration,
+		 __entry->outer_duration),
+
+	FILTER_OTHER
+);
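For readers unfamiliar with FTRACE_ENTRY(), the invocation above describes
a record whose payload is laid out roughly as below. This is a hand-written
approximation in userspace types, not code generated from this patch, and
the trace_entry_hdr stand-in only mimics the common trace_entry header:

  #include <stdint.h>
  #include <time.h>

  struct trace_entry_hdr {                /* stand-in for struct trace_entry */
          uint16_t        type;           /* TRACE_HWLAT for these records */
          uint8_t         flags;
          uint8_t         preempt_count;
          int32_t         pid;
  };

  struct hwlat_entry_layout {
          struct trace_entry_hdr  ent;            /* common header */
          uint64_t                duration;       /* inner-loop max delta, usecs */
          uint64_t                outer_duration; /* outer-loop max delta, usecs */
          struct timespec         timestamp;      /* wall time of the sample */
          unsigned int            seqnum;         /* sample sequence number */
  };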
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
new file mode 100644
index 000000000000..08dfabe4e862
--- /dev/null
+++ b/kernel/trace/trace_hwlat.c
@@ -0,0 +1,527 @@
+/*
+ * trace_hwlat.c - A simple Hardware Latency detector.
+ *
+ * Use this tracer to detect large system latencies induced by the behavior of
+ * certain underlying system hardware or firmware, independent of Linux itself.
+ * The code was developed originally to detect the presence of SMIs on Intel
+ * and AMD systems, although there is no dependency upon x86 herein.
+ *
+ * The classical example usage of this tracer is in detecting the presence of
+ * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
+ * somewhat special form of hardware interrupt spawned from earlier CPU debug
+ * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
+ * LPC (or other device) to generate a special interrupt under certain
+ * circumstances, for example, upon expiration of a special SMI timer device,
+ * due to certain external thermal readings, on certain I/O address accesses,
+ * and other situations. An SMI hits a special CPU pin, triggers a special
+ * SMI mode (complete with special memory map), and the OS is unaware.
+ *
+ * Although certain hardware-induced latencies are necessary (for example,
+ * a modern system often requires an SMI handler for correct thermal control
+ * and remote management) they can wreak havoc upon any OS-level performance
+ * guarantees toward low-latency, especially when the OS is not even made
+ * aware of the presence of these interrupts. For this reason, we need a
+ * somewhat brute force mechanism to detect these interrupts. In this case,
+ * we do it by hogging all of the CPU(s) for configurable timer intervals,
+ * sampling the built-in CPU timer, looking for discontiguous readings.
+ *
+ * WARNING: This implementation necessarily introduces latencies. Therefore,
+ *          you should NEVER use this tracer while running in a production
+ *          environment requiring any kind of low-latency performance
+ *          guarantee(s).
+ *
+ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ *
+ * Includes useful feedback from Clark Williams <clark@redhat.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include "trace.h"
+
+static struct trace_array	*hwlat_trace;
+
+#define U64STR_SIZE		22			/* 20 digits max */
+
+#define BANNER			"hwlat_detector: "
+#define DEFAULT_SAMPLE_WINDOW	1000000			/* 1s */
+#define DEFAULT_SAMPLE_WIDTH	500000			/* 0.5s */
+#define DEFAULT_LAT_THRESHOLD	10			/* 10us */
+
+/* sampling thread */
+static struct task_struct	*hwlat_kthread;
+
+static struct dentry		*hwlat_sample_width;	/* sample width us */
+static struct dentry		*hwlat_sample_window;	/* sample window us */
+
+/* Save the previous tracing_thresh value */
+static unsigned long		save_tracing_thresh;
+
+/* If the user changed threshold, remember it */
+static u64			last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
+
+/* Individual latency samples are stored here when detected. */
+struct hwlat_sample {
+	u64		seqnum;		/* unique sequence */
+	u64		duration;	/* delta */
+	u64		outer_duration;	/* delta (outer loop) */
+	struct timespec	timestamp;	/* wall time */
+};
+
+/* keep the global state somewhere. */
+static struct hwlat_data {
+
+	struct mutex lock;		/* protect changes */
+
+	u64	count;			/* total since reset */
+
+	u64	sample_window;		/* total sampling window (on+off) */
+	u64	sample_width;		/* active sampling portion of window */
+
+} hwlat_data = {
+	.sample_window		= DEFAULT_SAMPLE_WINDOW,
+	.sample_width		= DEFAULT_SAMPLE_WIDTH,
+};
+
+static void trace_hwlat_sample(struct hwlat_sample *sample)
+{
+	struct trace_array *tr = hwlat_trace;
+	struct trace_event_call *call = &event_hwlat;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
+	struct ring_buffer_event *event;
+	struct hwlat_entry *entry;
+	unsigned long flags;
+	int pc;
+
+	pc = preempt_count();
+	local_save_flags(flags);
+
+	event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
+					  flags, pc);
+	if (!event)
+		return;
+	entry = ring_buffer_event_data(event);
+	entry->seqnum			= sample->seqnum;
+	entry->duration			= sample->duration;
+	entry->outer_duration		= sample->outer_duration;
+	entry->timestamp		= sample->timestamp;
+
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		__buffer_unlock_commit(buffer, event);
+}
+
+/* Macros to encapsulate the time capturing infrastructure */
+#define time_type	u64
+#define time_get()	trace_clock_local()
+#define time_to_us(x)	div_u64(x, 1000)
+#define time_sub(a, b)	((a) - (b))
+#define init_time(a, b)	(a = b)
+#define time_u64(a)	a
+
+/**
+ * get_sample - sample the CPU TSC and look for likely hardware latencies
+ *
+ * Used to repeatedly capture the CPU TSC (or similar), looking for potential
+ * hardware-induced latency. Called with interrupts disabled and with
+ * hwlat_data.lock held.
+ */
+static int get_sample(void)
+{
+	struct trace_array *tr = hwlat_trace;
+	time_type start, t1, t2, last_t2;
+	s64 diff, total, last_total = 0;
+	u64 sample = 0;
+	u64 thresh = tracing_thresh;
+	u64 outer_sample = 0;
+	int ret = -1;
+
+	do_div(thresh, NSEC_PER_USEC);	/* modifies thresh in place */
+
+	init_time(last_t2, 0);
+	start = time_get(); /* start timestamp */
+
+	do {
+
+		t1 = time_get();	/* we'll look for a discontinuity */
+		t2 = time_get();
+
+		if (time_u64(last_t2)) {
+			/* Check the delta from outer loop (t2 to next t1) */
+			diff = time_to_us(time_sub(t1, last_t2));
+			/* This shouldn't happen */
+			if (diff < 0) {
+				pr_err(BANNER "time running backwards\n");
+				goto out;
+			}
+			if (diff > outer_sample)
+				outer_sample = diff;
+		}
+		last_t2 = t2;
+
+		total = time_to_us(time_sub(t2, start)); /* sample width */
+
+		/* Check for possible overflows */
+		if (total < last_total) {
+			pr_err("Time total overflowed\n");
+			break;
+		}
+		last_total = total;
+
+		/* This checks the inner loop (t1 to t2) */
+		diff = time_to_us(time_sub(t2, t1)); /* current diff */
+
+		/* This shouldn't happen */
+		if (diff < 0) {
+			pr_err(BANNER "time running backwards\n");
+			goto out;
+		}
+
+		if (diff > sample)
+			sample = diff; /* only want highest value */
+
+	} while (total <= hwlat_data.sample_width);
+
+	ret = 0;
+
+	/* If we exceed the threshold value, we have found a hardware latency */
+	if (sample > thresh || outer_sample > thresh) {
+		struct hwlat_sample s;
+
+		ret = 1;
+
+		hwlat_data.count++;
+		s.seqnum = hwlat_data.count;
+		s.duration = sample;
+		s.outer_duration = outer_sample;
+		s.timestamp = CURRENT_TIME;
+		trace_hwlat_sample(&s);
+
+		/* Keep a running maximum ever recorded hardware latency */
+		if (sample > tr->max_latency)
+			tr->max_latency = sample;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
+ *
+ * Used to periodically sample the CPU TSC via a call to get_sample. We
+ * disable interrupts, which does (intentionally) introduce latency since we
+ * need to ensure nothing else might be running (and thus preempting).
+ * Obviously this should never be used in production environments.
+ *
+ * Currently this runs on whichever CPU it was scheduled on, but most
+ * real-world hardware latency situations occur across several CPUs.
+ * We might later generalize this if we find there are any actual
+ * systems with alternate SMI delivery or other hardware latencies.
+ */
+static int kthread_fn(void *data)
+{
+	u64 interval;
+
+	while (!kthread_should_stop()) {
+
+		local_irq_disable();
+		get_sample();
+		local_irq_enable();
+
+		mutex_lock(&hwlat_data.lock);
+		interval = hwlat_data.sample_window - hwlat_data.sample_width;
+		mutex_unlock(&hwlat_data.lock);
+
+		do_div(interval, USEC_PER_MSEC); /* modifies interval value */
+
+		/* Always sleep for at least 1ms */
+		if (interval < 1)
+			interval = 1;
+
+		if (msleep_interruptible(interval))
+			break;
+	}
+
+	return 0;
+}
+
+/**
+ * start_kthread - Kick off the hardware latency sampling/detector kthread
+ *
+ * This starts the kernel thread that will sit and sample the CPU timestamp
+ * counter (TSC or similar) and look for potential hardware latencies.
+ */
+static int start_kthread(struct trace_array *tr)
+{
+	struct task_struct *kthread;
+
+	kthread = kthread_create(kthread_fn, NULL, "hwlatd");
+	if (IS_ERR(kthread)) {
+		pr_err(BANNER "could not start sampling thread\n");
+		return -ENOMEM;
+	}
+	hwlat_kthread = kthread;
+	wake_up_process(kthread);
+
+	return 0;
+}
+
+/**
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ */
+static void stop_kthread(void)
+{
+	if (!hwlat_kthread)
+		return;
+	kthread_stop(hwlat_kthread);
+	hwlat_kthread = NULL;
+}
+
+/*
+ * hwlat_read - Wrapper read function for reading both window and width
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function provides a generic read implementation for the global state
+ * "hwlat_data" structure filesystem entries.
+ */
+static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
+			  size_t cnt, loff_t *ppos)
+{
+	char buf[U64STR_SIZE];
+	u64 *entry = filp->private_data;
+	u64 val;
+	int len;
+
+	if (!entry)
+		return -EFAULT;
+
+	if (cnt > sizeof(buf))
+		cnt = sizeof(buf);
+
+	val = *entry;
+
+	len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+/**
+ * hwlat_width_write - Write function for "width" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "width" interface
+ * to the hardware latency detector. It can be used to configure
+ * how many usecs of the total window we will actively sample for any
+ * hardware-induced latency periods. Obviously, it is not possible to
+ * sample constantly and have the system respond to a sample reader, or,
+ * worse, have the system appear to have gone out to lunch. It
+ * is enforced that width is less than the total window size.
+ */
+static ssize_t
+hwlat_width_write(struct file *filp, const char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	u64 val;
+	int err;
+
+	err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+	if (err)
+		return err;
+
+	mutex_lock(&hwlat_data.lock);
+	if (val < hwlat_data.sample_window)
+		hwlat_data.sample_width = val;
+	else
+		err = -EINVAL;
+	mutex_unlock(&hwlat_data.lock);
+
+	if (err)
+		return err;
+
+	return cnt;
+}
+
+/**
+ * hwlat_window_write - Write function for "window" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "window" interface
+ * to the hardware latency detector. The window is the total time
+ * in us that will be considered one sample period. Conceptually, windows
+ * occur back-to-back and contain a sample width period during which
+ * actual sampling occurs. Can be used to write a new total window size. It
+ * is enforced that any value written must be greater than the sample width
+ * size, or an error results.
+ */
+static ssize_t
+hwlat_window_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	u64 val;
+	int err;
+
+	err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+	if (err)
+		return err;
+
+	mutex_lock(&hwlat_data.lock);
+	if (hwlat_data.sample_width < val)
+		hwlat_data.sample_window = val;
+	else
+		err = -EINVAL;
+	mutex_unlock(&hwlat_data.lock);
+
+	if (err)
+		return err;
+
+	return cnt;
+}
+
+static const struct file_operations width_fops = {
+	.open		= tracing_open_generic,
+	.read		= hwlat_read,
+	.write		= hwlat_width_write,
+};
+
+static const struct file_operations window_fops = {
+	.open		= tracing_open_generic,
+	.read		= hwlat_read,
+	.write		= hwlat_window_write,
+};
+
+/**
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "hwlat_detector".
+ * It creates the hwlat_detector directory in the tracing directory,
+ * and within that directory are the width and window files used to
+ * change and view those values.
+ */
+static int init_tracefs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *top_dir;
+
+	d_tracer = tracing_init_dentry();
+	if (IS_ERR(d_tracer))
+		return -ENOMEM;
+
+	top_dir = tracefs_create_dir("hwlat_detector", d_tracer);
+	if (!top_dir)
+		return -ENOMEM;
+
+	hwlat_sample_window = tracefs_create_file("window", 0640,
+						  top_dir,
+						  &hwlat_data.sample_window,
+						  &window_fops);
+	if (!hwlat_sample_window)
+		goto err;
+
+	hwlat_sample_width = tracefs_create_file("width", 0644,
+						 top_dir,
+						 &hwlat_data.sample_width,
+						 &width_fops);
+	if (!hwlat_sample_width)
+		goto err;
+
+	return 0;
+
+ err:
+	tracefs_remove_recursive(top_dir);
+	return -ENOMEM;
+}
+
+static void hwlat_tracer_start(struct trace_array *tr)
+{
+	int err;
+
+	err = start_kthread(tr);
+	if (err)
+		pr_err(BANNER "Cannot start hwlat kthread\n");
+}
+
+static void hwlat_tracer_stop(struct trace_array *tr)
+{
+	stop_kthread();
+}
+
+static bool hwlat_busy;
+
+static int hwlat_tracer_init(struct trace_array *tr)
+{
+	/* Only allow one instance to enable this */
+	if (hwlat_busy)
+		return -EBUSY;
+
+	hwlat_trace = tr;
+
+	hwlat_data.count = 0;
+	tr->max_latency = 0;
+	save_tracing_thresh = tracing_thresh;
+
+	/* tracing_thresh is in nsecs, we speak in usecs */
+	if (!tracing_thresh)
+		tracing_thresh = last_tracing_thresh;
+
+	if (tracer_tracing_is_on(tr))
+		hwlat_tracer_start(tr);
+
+	hwlat_busy = true;
+
+	return 0;
+}
+
+static void hwlat_tracer_reset(struct trace_array *tr)
+{
+	stop_kthread();
+
+	/* the tracing threshold is static between runs */
+	last_tracing_thresh = tracing_thresh;
+
+	tracing_thresh = save_tracing_thresh;
+	hwlat_busy = false;
+}
+
+static struct tracer hwlat_tracer __read_mostly =
+{
+	.name		= "hwlat",
+	.init		= hwlat_tracer_init,
+	.reset		= hwlat_tracer_reset,
+	.start		= hwlat_tracer_start,
+	.stop		= hwlat_tracer_stop,
+	.allow_instances = true,
+};
+
+__init static int init_hwlat_tracer(void)
+{
+	int ret;
+
+	mutex_init(&hwlat_data.lock);
+
+	ret = register_tracer(&hwlat_tracer);
+	if (ret)
+		return ret;
+
+	init_tracefs();
+
+	return 0;
+}
+late_initcall(init_hwlat_tracer);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0bb9cf2d53e6..d67a562df259 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1098,6 +1098,57 @@ static struct trace_event trace_user_stack_event = {
 	.funcs		= &trace_user_stack_funcs,
 };
 
+/* TRACE_HWLAT */
+static enum print_line_t
+trace_hwlat_print(struct trace_iterator *iter, int flags,
+		  struct trace_event *event)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct hwlat_entry *field;
+
+	trace_assign_type(field, entry);
+
+	trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld\n",
+			 field->seqnum,
+			 field->duration,
+			 field->outer_duration,
+			 field->timestamp.tv_sec,
+			 field->timestamp.tv_nsec);
+
+	return trace_handle_return(s);
+}
+
+
+static enum print_line_t
+trace_hwlat_raw(struct trace_iterator *iter, int flags,
+		struct trace_event *event)
+{
+	struct hwlat_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+
+	trace_seq_printf(s, "%llu %lld %ld %09ld %u\n",
+			 field->duration,
+			 field->outer_duration,
+			 field->timestamp.tv_sec,
+			 field->timestamp.tv_nsec,
+			 field->seqnum);
+
+	return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_hwlat_funcs = {
+	.trace		= trace_hwlat_print,
+	.raw		= trace_hwlat_raw,
+};
+
+static struct trace_event trace_hwlat_event = {
+	.type		= TRACE_HWLAT,
+	.funcs		= &trace_hwlat_funcs,
+};
+
 /* TRACE_BPUTS */
 static enum print_line_t
 trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1233,6 +1284,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_bputs_event,
 	&trace_bprint_event,
 	&trace_print_event,
+	&trace_hwlat_event,
 	NULL
 };
 
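Put together, the .trace callback above renders each detected latency as one
line in the trace output. The values below are invented for illustration;
only the layout follows the format string:

  #1     inner/outer(us):   12/14    ts:1474393891.357785608
  #2     inner/outer(us):   11/9     ts:1474393892.352583718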