author     Linus Torvalds <torvalds@linux-foundation.org>  2016-10-06 14:48:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-10-06 14:48:41 -0400
commit     95107b30be68953e3a4f1c3994c2233500502ccf (patch)
tree       ed1824ce842d9813f2ee8a5fa8683e0d6b13f662
parent     541efb7632642cab55361178d73d544f025b593c (diff)
parent     a0d0c6216afad4b2b1704a72bd76bea259e07655 (diff)
Merge tag 'trace-v4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace
Pull tracing updates from Steven Rostedt:
"This release cycle is rather small. Just a few fixes to tracing.
The big change is the addition of the hwlat tracer. It not only
detects SMIs, but also other latency that's caused by the hardware. I
have detected some latency from large boxes having bus contention"
* tag 'trace-v4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace:
tracing: Call traceoff trigger after event is recorded
ftrace/scripts: Add helper script to bisect function tracing problem functions
tracing: Have max_latency be defined for HWLAT_TRACER as well
tracing: Add NMI tracing in hwlat detector
tracing: Have hwlat trace migrate across tracing_cpumask CPUs
tracing: Add documentation for hwlat_detector tracer
tracing: Added hardware latency tracer
ftrace: Access ret_stack->subtime only in the function profiler
function_graph: Handle TRACE_BPUTS in print_graph_comment
tracing/uprobe: Drop isdigit() check in create_trace_uprobe
-rw-r--r--  Documentation/trace/ftrace.txt          |  10
-rw-r--r--  Documentation/trace/hwlat_detector.txt  |  79
-rw-r--r--  arch/sh/kernel/ftrace.c                 |   4
-rw-r--r--  include/linux/ftrace.h                  |   2
-rw-r--r--  include/linux/ftrace_irq.h              |  31
-rw-r--r--  kernel/trace/Kconfig                    |  35
-rw-r--r--  kernel/trace/Makefile                   |   1
-rw-r--r--  kernel/trace/ftrace.c                   |   6
-rw-r--r--  kernel/trace/trace.c                    |   8
-rw-r--r--  kernel/trace/trace.h                    |   5
-rw-r--r--  kernel/trace/trace_entries.h            |  27
-rw-r--r--  kernel/trace/trace_events_trigger.c     |   1
-rw-r--r--  kernel/trace/trace_functions_graph.c    |   6
-rw-r--r--  kernel/trace/trace_hwlat.c              | 633
-rw-r--r--  kernel/trace/trace_output.c             |  66
-rw-r--r--  kernel/trace/trace_uprobe.c             |   4
-rwxr-xr-x  scripts/tracing/ftrace-bisect.sh        | 115
17 files changed, 1013 insertions(+), 20 deletions(-)
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index a6b3705e62a6..185c39fea2a0 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -858,11 +858,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
 	       When enabled, it will account time the task has been
 	       scheduled out as part of the function call.
 
-  graph-time - When running function graph tracer, to include the
-	       time to call nested functions. When this is not set,
-	       the time reported for the function will only include
-	       the time the function itself executed for, not the time
-	       for functions that it called.
+  graph-time - When running function profiler with function graph tracer,
+	       to include the time to call nested functions. When this is
+	       not set, the time reported for the function will only
+	       include the time the function itself executed for, not the
+	       time for functions that it called.
 
   record-cmd - When any event or tracer is enabled, a hook is enabled
 	       in the sched_switch trace point to fill comm cache
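
The reworded graph-time text applies to the function profiler. A quick way to
see the difference, assuming the standard tracefs files (options/graph-time,
function_profile_enabled and trace_stat/), is to clear the option before
profiling, so each function's reported time excludes its callees:

    # cd /sys/kernel/tracing
    # echo 0 > options/graph-time
    # echo 1 > function_profile_enabled
    # cat trace_stat/function0
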
diff --git a/Documentation/trace/hwlat_detector.txt b/Documentation/trace/hwlat_detector.txt
new file mode 100644
index 000000000000..3207717a0d1a
--- /dev/null
+++ b/Documentation/trace/hwlat_detector.txt
@@ -0,0 +1,79 @@
+Introduction:
+-------------
+
+The tracer hwlat_detector is a special purpose tracer that is used to
+detect large system latencies induced by the behavior of certain underlying
+hardware or firmware, independent of Linux itself. The code was developed
+originally to detect SMIs (System Management Interrupts) on x86 systems,
+however there is nothing x86 specific about this patchset. It was
+originally written for use by the "RT" patch since the Real Time
+kernel is highly latency sensitive.
+
+SMIs are not serviced by the Linux kernel, which means that it does not
+even know that they are occurring. SMIs are instead set up by BIOS code
+and are serviced by BIOS code, usually for "critical" events such as
+management of thermal sensors and fans. Sometimes though, SMIs are used for
+other tasks and those tasks can spend an inordinate amount of time in the
+handler (sometimes measured in milliseconds). Obviously this is a problem if
+you are trying to keep event service latencies down in the microsecond range.
+
+The hardware latency detector works by hogging one of the cpus for configurable
+amounts of time (with interrupts disabled), polling the CPU Time Stamp Counter
+for some period, then looking for gaps in the TSC data. Any gap indicates a
+time when the polling was interrupted, and since interrupts are disabled,
+the only thing that could do that would be an SMI or other hardware hiccup
+(or an NMI, but those can be tracked).
+
+Note that the hwlat detector should *NEVER* be used in a production environment.
+It is intended to be run manually to determine if the hardware platform has a
+problem with long system firmware service routines.
+
+Usage:
+------
+
+Write the ASCII text "hwlat" into the current_tracer file of the tracing system
+(mounted at /sys/kernel/tracing or /sys/kernel/debug/tracing). It is possible
+to redefine the threshold in microseconds (us) above which latency spikes will
+be taken into account.
+
+Example:
+
+	# echo hwlat > /sys/kernel/tracing/current_tracer
+	# echo 100 > /sys/kernel/tracing/tracing_thresh
+
+The /sys/kernel/tracing/hwlat_detector interface contains the following files:
+
+width	- time period to sample with CPUs held (usecs)
+	  must be less than the total window size (enforced)
+window	- total period of sampling, width being inside (usecs)
+
+By default the width is set to 500,000 and window to 1,000,000, meaning that
+for every 1,000,000 usecs (1s) the hwlat detector will spin for 500,000 usecs
+(0.5s). If tracing_thresh contains zero when the hwlat tracer is enabled, it
+will change to a default of 10 usecs. If any latency that exceeds the threshold
+is observed, the data will be written to the tracing ring buffer.
+
+The minimum sleep time between periods is 1 millisecond, even when the gap
+between width and window is smaller than that, so that the system is never
+totally starved.
+
+If tracing_thresh was zero when the hwlat detector was started, it will be set
+back to zero if another tracer is loaded. Note, the last value in
+tracing_thresh that the hwlat detector had will be saved and this value will
+be restored in tracing_thresh if it is still zero when the hwlat detector is
+started again.
+
+The following tracing directory files are used by the hwlat_detector:
+
+in /sys/kernel/tracing:
+
+ tracing_thresh		- minimum latency value to be considered (usecs)
+ tracing_max_latency	- maximum hardware latency actually observed (usecs)
+ tracing_cpumask	- the CPUs to move the hwlat thread across
+ hwlat_detector/width	- specified amount of time to spin within window (usecs)
+ hwlat_detector/window	- amount of time between (width) runs (usecs)
+
+The hwlat detector's kernel thread will migrate across each CPU specified in
+tracing_cpumask between each window. To limit the migration, either modify
+tracing_cpumask, or modify the hwlat kernel thread (named [hwlatd]) CPU
+affinity directly, and the migration will stop.
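
Putting the knobs above together, a minimal hwlat session based on this file's
instructions would look like the following (the width/window values are
illustrative, and width must stay below window):

    # cd /sys/kernel/tracing
    # echo 200000 > hwlat_detector/width
    # echo 1000000 > hwlat_detector/window
    # echo hwlat > current_tracer
    # sleep 60
    # cat trace
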
diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c
index 95eccd49672f..53783978162e 100644
--- a/arch/sh/kernel/ftrace.c
+++ b/arch/sh/kernel/ftrace.c
@@ -139,7 +139,7 @@ static void ftrace_mod_code(void)
 	clear_mod_flag();
 }
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
 	if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
 		smp_rmb();
@@ -150,7 +150,7 @@ void ftrace_nmi_enter(void)
 	smp_mb();
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
 	/* Finish all executions before clearing nmi_running */
 	smp_mb();
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 6f93ac46e7f0..b3d34d3e0e7e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -794,7 +794,9 @@ struct ftrace_ret_stack {
 	unsigned long ret;
 	unsigned long func;
 	unsigned long long calltime;
+#ifdef CONFIG_FUNCTION_PROFILER
 	unsigned long long subtime;
+#endif
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	unsigned long fp;
 #endif
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index dca7bf8cffe2..4ec2c9b205f2 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -3,11 +3,34 @@
 
 
 #ifdef CONFIG_FTRACE_NMI_ENTER
-extern void ftrace_nmi_enter(void);
-extern void ftrace_nmi_exit(void);
+extern void arch_ftrace_nmi_enter(void);
+extern void arch_ftrace_nmi_exit(void);
 #else
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
+static inline void arch_ftrace_nmi_enter(void) { }
+static inline void arch_ftrace_nmi_exit(void) { }
 #endif
 
+#ifdef CONFIG_HWLAT_TRACER
+extern bool trace_hwlat_callback_enabled;
+extern void trace_hwlat_callback(bool enter);
+#endif
+
+static inline void ftrace_nmi_enter(void)
+{
+#ifdef CONFIG_HWLAT_TRACER
+	if (trace_hwlat_callback_enabled)
+		trace_hwlat_callback(true);
+#endif
+	arch_ftrace_nmi_enter();
+}
+
+static inline void ftrace_nmi_exit(void)
+{
+	arch_ftrace_nmi_exit();
+#ifdef CONFIG_HWLAT_TRACER
+	if (trace_hwlat_callback_enabled)
+		trace_hwlat_callback(false);
+#endif
+}
+
 #endif /* _LINUX_FTRACE_IRQ_H */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ba3326785ca4..2a96b063d659 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -216,6 +216,41 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
+config HWLAT_TRACER
+	bool "Tracer to detect hardware latencies (like SMIs)"
+	select GENERIC_TRACER
+	help
+	 This tracer, when enabled, will create one or more kernel threads,
+	 depending on what the cpumask file is set to, with each thread
+	 spinning in a loop looking for interruptions caused by
+	 something other than the kernel. For example, if a
+	 System Management Interrupt (SMI) takes a noticeable amount of
+	 time, this tracer will detect it. This is useful for testing
+	 if a system is reliable for Real Time tasks.
+
+	 Some files are created in the tracing directory when this
+	 is enabled:
+
+	   hwlat_detector/width   - time in usecs for how long to spin for
+	   hwlat_detector/window  - time in usecs between the start of each
+				    iteration
+
+	 A kernel thread is created that will spin with interrupts disabled
+	 for "width" microseconds in every "window" cycle. It will not spin
+	 for "window - width" microseconds, where the system can
+	 continue to operate.
+
+	 When the tracer is not running, it has no effect on the system,
+	 but when it is running, it can cause the system to be
+	 periodically non responsive. Do not run this tracer on a
+	 production system.
+
+	 The output will appear in the trace and trace_pipe files.
+
+	 To enable this tracer, echo "hwlat" into the current_tracer
+	 file. Every time a latency is greater than tracing_thresh, it will
+	 be recorded into the ring buffer.
+
 config ENABLE_DEFAULT_TRACERS
 	bool "Trace process context switches and events"
 	depends on !GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d0a1617b52b4..992ab9d99f35 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 84752c8e28b5..2050a7652a86 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static int profile_graph_entry(struct ftrace_graph_ent *trace)
 {
+	int index = trace->depth;
+
 	function_profile_call(trace->func, 0, NULL, NULL);
+
+	if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
+		current->ret_stack[index].subtime = 0;
+
 	return 1;
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 37824d98ae71..8696ce6bf2f6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void)
  *
  * Shows real state of the ring buffer if it is enabled or not.
  */
-static int tracer_tracing_is_on(struct trace_array *tr)
+int tracer_tracing_is_on(struct trace_array *tr)
 {
 	if (tr->trace_buffer.buffer)
 		return ring_buffer_record_is_on(tr->trace_buffer.buffer);
@@ -4969,7 +4969,7 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 
 static ssize_t
 tracing_max_lat_read(struct file *filp, char __user *ubuf,
@@ -5892,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = {
 	.llseek		= generic_file_llseek,
 };
 
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 static const struct file_operations tracing_max_lat_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_max_lat_read,
@@ -7222,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 
 	create_trace_options_dir(tr);
 
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 	trace_create_file("tracing_max_latency", 0644, d_tracer,
 			&tr->max_latency, &tracing_max_lat_fops);
 #endif
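
With these #if changes, the tracing_max_latency file is created whenever either
option is set, so the hwlat tracer can report its worst observed latency even
on kernels built without CONFIG_TRACER_MAX_TRACE. After a hwlat run, it can be
read back directly (the value shown is made up):

    # cat /sys/kernel/tracing/tracing_max_latency
    18
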
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f783df416726..fd24b1f9ac43 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,6 +38,7 @@ enum trace_type {
 	TRACE_USER_STACK,
 	TRACE_BLK,
 	TRACE_BPUTS,
+	TRACE_HWLAT,
 
 	__TRACE_LAST_TYPE,
 };
@@ -213,6 +214,8 @@ struct trace_array {
 	 */
 	struct trace_buffer	max_buffer;
 	bool			allocated_snapshot;
+#endif
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 	unsigned long		max_latency;
 #endif
 	struct trace_pid_list	__rcu *filtered_pids;
@@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
 		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\
+		IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT);	\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
@@ -571,6 +575,7 @@ void tracing_reset_current(int cpu);
 void tracing_reset_all_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
+int tracer_tracing_is_on(struct trace_array *tr);
 struct dentry *trace_create_file(const char *name,
 				 umode_t mode,
 				 struct dentry *parent,
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 5c30efcda5e6..d1cc37e78f99 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch,
 	FILTER_OTHER
 );
 
+
+FTRACE_ENTRY(hwlat, hwlat_entry,
+
+	TRACE_HWLAT,
+
+	F_STRUCT(
+		__field(	u64,			duration	)
+		__field(	u64,			outer_duration	)
+		__field(	u64,			nmi_total_ts	)
+		__field_struct(	struct timespec,	timestamp	)
+		__field_desc(	long,	timestamp,	tv_sec		)
+		__field_desc(	long,	timestamp,	tv_nsec		)
+		__field(	unsigned int,		nmi_count	)
+		__field(	unsigned int,		seqnum		)
+	),
+
+	F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
+		 __entry->seqnum,
+		 __entry->tv_sec,
+		 __entry->tv_nsec,
+		 __entry->duration,
+		 __entry->outer_duration,
+		 __entry->nmi_total_ts,
+		 __entry->nmi_count),
+
+	FILTER_OTHER
+);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a975571cde24..6721a1e89f39 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = {
 static struct event_command trigger_traceoff_cmd = {
 	.name			= "traceoff",
 	.trigger_type		= ETT_TRACE_ONOFF,
+	.flags			= EVENT_CMD_FL_POST_TRIGGER,
 	.func			= event_trigger_callback,
 	.reg			= register_trigger,
 	.unreg			= unregister_trigger,
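
This is the "Call traceoff trigger after event is recorded" change from the
shortlog: with EVENT_CMD_FL_POST_TRIGGER, the event that fires the trigger is
itself written to the ring buffer before tracing is turned off. For example
(an illustrative trigger on a stock event):

    # echo traceoff > /sys/kernel/tracing/events/sched/sched_switch/trigger

The sched_switch event that stopped tracing now shows up as the final entry in
the trace instead of being lost.
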
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0cbe38a844fa..4e480e870474 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -170,7 +170,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = calltime;
-	current->ret_stack[index].subtime = 0;
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	current->ret_stack[index].fp = frame_pointer;
 #endif
@@ -1183,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
 	trace_seq_puts(s, "/* ");
 
 	switch (iter->ent->type) {
+	case TRACE_BPUTS:
+		ret = trace_print_bputs_msg_only(iter);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+		break;
 	case TRACE_BPRINT:
 		ret = trace_print_bprintk_msg_only(iter);
 		if (ret != TRACE_TYPE_HANDLED)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
new file mode 100644
index 000000000000..b97286c48735
--- /dev/null
+++ b/kernel/trace/trace_hwlat.c
@@ -0,0 +1,633 @@
+/*
+ * trace_hwlat.c - A simple Hardware Latency detector.
+ *
+ * Use this tracer to detect large system latencies induced by the behavior of
+ * certain underlying system hardware or firmware, independent of Linux itself.
+ * The code was developed originally to detect the presence of SMIs on Intel
+ * and AMD systems, although there is no dependency upon x86 herein.
+ *
+ * The classical example usage of this tracer is in detecting the presence of
+ * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
+ * somewhat special form of hardware interrupt spawned from earlier CPU debug
+ * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
+ * LPC (or other device) to generate a special interrupt under certain
+ * circumstances, for example, upon expiration of a special SMI timer device,
+ * due to certain external thermal readings, on certain I/O address accesses,
+ * and other situations. An SMI hits a special CPU pin, triggers a special
+ * SMI mode (complete with special memory map), and the OS is unaware.
+ *
+ * Although certain hardware-induced latencies are necessary (for example,
+ * a modern system often requires an SMI handler for correct thermal control
+ * and remote management) they can wreak havoc upon any OS-level performance
+ * guarantees toward low-latency, especially when the OS is not even made
+ * aware of the presence of these interrupts. For this reason, we need a
+ * somewhat brute force mechanism to detect these interrupts. In this case,
+ * we do it by hogging all of the CPU(s) for configurable timer intervals,
+ * sampling the built-in CPU timer, looking for discontiguous readings.
+ *
+ * WARNING: This implementation necessarily introduces latencies. Therefore,
+ *          you should NEVER use this tracer while running in a production
+ *          environment requiring any kind of low-latency performance
+ *          guarantee(s).
+ *
+ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ *
+ * Includes useful feedback from Clark Williams <clark@redhat.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include "trace.h"
+
+static struct trace_array	*hwlat_trace;
+
+#define U64STR_SIZE		22			/* 20 digits max */
+
+#define BANNER			"hwlat_detector: "
+#define DEFAULT_SAMPLE_WINDOW	1000000			/* 1s */
+#define DEFAULT_SAMPLE_WIDTH	500000			/* 0.5s */
+#define DEFAULT_LAT_THRESHOLD	10			/* 10us */
+
+/* sampling thread */
+static struct task_struct	*hwlat_kthread;
+
+static struct dentry		*hwlat_sample_width;	/* sample width us */
+static struct dentry		*hwlat_sample_window;	/* sample window us */
+
+/* Save the previous tracing_thresh value */
+static unsigned long		save_tracing_thresh;
+
+/* NMI timestamp counters */
+static u64			nmi_ts_start;
+static u64			nmi_total_ts;
+static int			nmi_count;
+static int			nmi_cpu;
+
+/* Tells NMIs to call back to the hwlat tracer to record timestamps */
+bool trace_hwlat_callback_enabled;
+
+/* If the user changed threshold, remember it */
+static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
+
+/* Individual latency samples are stored here when detected. */
+struct hwlat_sample {
+	u64		seqnum;		/* unique sequence */
+	u64		duration;	/* delta */
+	u64		outer_duration;	/* delta (outer loop) */
+	u64		nmi_total_ts;	/* Total time spent in NMIs */
+	struct timespec	timestamp;	/* wall time */
+	int		nmi_count;	/* # NMIs during this sample */
+};
+
+/* keep the global state somewhere. */
+static struct hwlat_data {
+
+	struct mutex lock;		/* protect changes */
+
+	u64	count;			/* total since reset */
+
+	u64	sample_window;		/* total sampling window (on+off) */
+	u64	sample_width;		/* active sampling portion of window */
+
+} hwlat_data = {
+	.sample_window		= DEFAULT_SAMPLE_WINDOW,
+	.sample_width		= DEFAULT_SAMPLE_WIDTH,
+};
+
+static void trace_hwlat_sample(struct hwlat_sample *sample)
+{
+	struct trace_array *tr = hwlat_trace;
+	struct trace_event_call *call = &event_hwlat;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
+	struct ring_buffer_event *event;
+	struct hwlat_entry *entry;
+	unsigned long flags;
+	int pc;
+
+	pc = preempt_count();
+	local_save_flags(flags);
+
+	event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
+					  flags, pc);
+	if (!event)
+		return;
+	entry = ring_buffer_event_data(event);
+	entry->seqnum			= sample->seqnum;
+	entry->duration			= sample->duration;
+	entry->outer_duration		= sample->outer_duration;
+	entry->timestamp		= sample->timestamp;
+	entry->nmi_total_ts		= sample->nmi_total_ts;
+	entry->nmi_count		= sample->nmi_count;
+
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		__buffer_unlock_commit(buffer, event);
+}
+
+/* Macros to encapsulate the time capturing infrastructure */
+#define time_type	u64
+#define time_get()	trace_clock_local()
+#define time_to_us(x)	div_u64(x, 1000)
+#define time_sub(a, b)	((a) - (b))
+#define init_time(a, b)	(a = b)
+#define time_u64(a)	a
+
+void trace_hwlat_callback(bool enter)
+{
+	if (smp_processor_id() != nmi_cpu)
+		return;
+
+	/*
+	 * Currently trace_clock_local() calls sched_clock() and the
+	 * generic version is not NMI safe.
+	 */
+	if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
+		if (enter)
+			nmi_ts_start = time_get();
+		else
+			nmi_total_ts = time_get() - nmi_ts_start;
+	}
+
+	if (enter)
+		nmi_count++;
+}
+
+/**
+ * get_sample - sample the CPU TSC and look for likely hardware latencies
+ *
+ * Used to repeatedly capture the CPU TSC (or similar), looking for potential
+ * hardware-induced latency. Called with interrupts disabled and with
+ * hwlat_data.lock held.
+ */
+static int get_sample(void)
+{
+	struct trace_array *tr = hwlat_trace;
+	time_type start, t1, t2, last_t2;
+	s64 diff, total, last_total = 0;
+	u64 sample = 0;
+	u64 thresh = tracing_thresh;
+	u64 outer_sample = 0;
+	int ret = -1;
+
+	do_div(thresh, NSEC_PER_USEC);	/* modifies interval value */
+
+	nmi_cpu = smp_processor_id();
+	nmi_total_ts = 0;
+	nmi_count = 0;
+	/* Make sure NMIs see this first */
+	barrier();
+
+	trace_hwlat_callback_enabled = true;
+
+	init_time(last_t2, 0);
+	start = time_get(); /* start timestamp */
+
+	do {
+
+		t1 = time_get();	/* we'll look for a discontinuity */
+		t2 = time_get();
+
+		if (time_u64(last_t2)) {
+			/* Check the delta from outer loop (t2 to next t1) */
+			diff = time_to_us(time_sub(t1, last_t2));
+			/* This shouldn't happen */
+			if (diff < 0) {
+				pr_err(BANNER "time running backwards\n");
+				goto out;
+			}
+			if (diff > outer_sample)
+				outer_sample = diff;
+		}
+		last_t2 = t2;
+
+		total = time_to_us(time_sub(t2, start)); /* sample width */
+
+		/* Check for possible overflows */
+		if (total < last_total) {
+			pr_err("Time total overflowed\n");
+			break;
+		}
+		last_total = total;
+
+		/* This checks the inner loop (t1 to t2) */
+		diff = time_to_us(time_sub(t2, t1));     /* current diff */
+
+		/* This shouldn't happen */
+		if (diff < 0) {
+			pr_err(BANNER "time running backwards\n");
+			goto out;
+		}
+
+		if (diff > sample)
+			sample = diff; /* only want highest value */
+
+	} while (total <= hwlat_data.sample_width);
+
+	barrier(); /* finish the above in the view for NMIs */
+	trace_hwlat_callback_enabled = false;
+	barrier(); /* Make sure nmi_total_ts is no longer updated */
+
+	ret = 0;
+
+	/* If we exceed the threshold value, we have found a hardware latency */
+	if (sample > thresh || outer_sample > thresh) {
+		struct hwlat_sample s;
+
+		ret = 1;
+
+		/* We read in microseconds */
+		if (nmi_total_ts)
+			do_div(nmi_total_ts, NSEC_PER_USEC);
+
+		hwlat_data.count++;
+		s.seqnum = hwlat_data.count;
+		s.duration = sample;
+		s.outer_duration = outer_sample;
+		s.timestamp = CURRENT_TIME;
+		s.nmi_total_ts = nmi_total_ts;
+		s.nmi_count = nmi_count;
+		trace_hwlat_sample(&s);
+
+		/* Keep a running maximum ever recorded hardware latency */
+		if (sample > tr->max_latency)
+			tr->max_latency = sample;
+	}
+
+out:
+	return ret;
+}
+
+static struct cpumask save_cpumask;
+static bool disable_migrate;
+
+static void move_to_next_cpu(void)
+{
+	static struct cpumask *current_mask;
+	int next_cpu;
+
+	if (disable_migrate)
+		return;
+
+	/* Just pick the first CPU on first iteration */
+	if (!current_mask) {
+		current_mask = &save_cpumask;
+		get_online_cpus();
+		cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+		put_online_cpus();
+		next_cpu = cpumask_first(current_mask);
+		goto set_affinity;
+	}
+
+	/*
+	 * If for some reason the user modifies the CPU affinity
+	 * of this thread, then stop migrating for the duration
+	 * of the current test.
+	 */
+	if (!cpumask_equal(current_mask, &current->cpus_allowed))
+		goto disable;
+
+	get_online_cpus();
+	cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+	next_cpu = cpumask_next(smp_processor_id(), current_mask);
+	put_online_cpus();
+
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(current_mask);
+
+set_affinity:
+	if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
+		goto disable;
+
+	cpumask_clear(current_mask);
+	cpumask_set_cpu(next_cpu, current_mask);
+
+	sched_setaffinity(0, current_mask);
+	return;
+
+ disable:
+	disable_migrate = true;
+}
+
+/*
+ * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
+ *
+ * Used to periodically sample the CPU TSC via a call to get_sample. We
+ * disable interrupts, which does (intentionally) introduce latency since we
+ * need to ensure nothing else might be running (and thus preempting).
+ * Obviously this should never be used in production environments.
+ *
+ * Currently this runs on whichever CPU it was scheduled on, but most
+ * real-world hardware latency situations occur across several CPUs,
+ * so we might later generalize this if we find there are any actual
+ * systems with alternate SMI delivery or other hardware latencies.
+ */
+static int kthread_fn(void *data)
+{
+	u64 interval;
+
+	while (!kthread_should_stop()) {
+
+		move_to_next_cpu();
+
+		local_irq_disable();
+		get_sample();
+		local_irq_enable();
+
+		mutex_lock(&hwlat_data.lock);
+		interval = hwlat_data.sample_window - hwlat_data.sample_width;
+		mutex_unlock(&hwlat_data.lock);
+
+		do_div(interval, USEC_PER_MSEC); /* modifies interval value */
+
+		/* Always sleep for at least 1ms */
+		if (interval < 1)
+			interval = 1;
+
+		if (msleep_interruptible(interval))
+			break;
+	}
+
+	return 0;
+}
+
+/**
+ * start_kthread - Kick off the hardware latency sampling/detector kthread
+ *
+ * This starts the kernel thread that will sit and sample the CPU timestamp
+ * counter (TSC or similar) and look for potential hardware latencies.
+ */
+static int start_kthread(struct trace_array *tr)
+{
+	struct task_struct *kthread;
+
+	kthread = kthread_create(kthread_fn, NULL, "hwlatd");
+	if (IS_ERR(kthread)) {
+		pr_err(BANNER "could not start sampling thread\n");
+		return -ENOMEM;
+	}
+	hwlat_kthread = kthread;
+	wake_up_process(kthread);
+
+	return 0;
+}
+
+/**
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ */
+static void stop_kthread(void)
+{
+	if (!hwlat_kthread)
+		return;
+	kthread_stop(hwlat_kthread);
+	hwlat_kthread = NULL;
+}
+
+/*
+ * hwlat_read - Wrapper read function for reading both window and width
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function provides a generic read implementation for the global state
+ * "hwlat_data" structure filesystem entries.
+ */
+static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
+			  size_t cnt, loff_t *ppos)
+{
+	char buf[U64STR_SIZE];
+	u64 *entry = filp->private_data;
+	u64 val;
+	int len;
+
+	if (!entry)
+		return -EFAULT;
+
+	if (cnt > sizeof(buf))
+		cnt = sizeof(buf);
+
+	val = *entry;
+
+	len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+/**
+ * hwlat_width_write - Write function for "width" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "width" interface
+ * to the hardware latency detector. It can be used to configure
+ * for how many us of the total window we will actively sample for any
+ * hardware-induced latency periods. Obviously, it is not possible to
+ * sample constantly and have the system respond to a sample reader, or,
+ * worse, without having the system appear to have gone out to lunch. It
+ * is enforced that width is less than the total window size.
+ */
+static ssize_t
+hwlat_width_write(struct file *filp, const char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	u64 val;
+	int err;
+
+	err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+	if (err)
+		return err;
+
+	mutex_lock(&hwlat_data.lock);
+	if (val < hwlat_data.sample_window)
+		hwlat_data.sample_width = val;
+	else
+		err = -EINVAL;
+	mutex_unlock(&hwlat_data.lock);
+
+	if (err)
+		return err;
+
+	return cnt;
+}
+
+/**
+ * hwlat_window_write - Write function for "window" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "window" interface
+ * to the hardware latency detector. The window is the total time
+ * in us that will be considered one sample period. Conceptually, windows
+ * occur back-to-back and contain a sample width period during which
+ * actual sampling occurs. Can be used to write a new total window size. It
+ * is enforced that any value written must be greater than the sample width
+ * size, or an error results.
+ */
+static ssize_t
+hwlat_window_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	u64 val;
+	int err;
+
+	err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+	if (err)
+		return err;
+
+	mutex_lock(&hwlat_data.lock);
+	if (hwlat_data.sample_width < val)
+		hwlat_data.sample_window = val;
+	else
+		err = -EINVAL;
+	mutex_unlock(&hwlat_data.lock);
+
+	if (err)
+		return err;
+
+	return cnt;
+}
+
+static const struct file_operations width_fops = {
+	.open		= tracing_open_generic,
+	.read		= hwlat_read,
+	.write		= hwlat_width_write,
+};
+
+static const struct file_operations window_fops = {
+	.open		= tracing_open_generic,
+	.read		= hwlat_read,
+	.write		= hwlat_window_write,
+};
+
+/**
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "hwlat_detector".
+ * It creates the hwlat_detector directory in the tracing directory,
+ * and within that directory is the count, width and window files to
+ * change and view those values.
+ */
+static int init_tracefs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *top_dir;
+
+	d_tracer = tracing_init_dentry();
+	if (IS_ERR(d_tracer))
+		return -ENOMEM;
+
+	top_dir = tracefs_create_dir("hwlat_detector", d_tracer);
+	if (!top_dir)
+		return -ENOMEM;
+
+	hwlat_sample_window = tracefs_create_file("window", 0640,
+						  top_dir,
+						  &hwlat_data.sample_window,
+						  &window_fops);
+	if (!hwlat_sample_window)
+		goto err;
+
+	hwlat_sample_width = tracefs_create_file("width", 0644,
+						 top_dir,
+						 &hwlat_data.sample_width,
+						 &width_fops);
+	if (!hwlat_sample_width)
+		goto err;
+
+	return 0;
+
+ err:
+	tracefs_remove_recursive(top_dir);
+	return -ENOMEM;
+}
+
+static void hwlat_tracer_start(struct trace_array *tr)
+{
+	int err;
+
+	err = start_kthread(tr);
+	if (err)
+		pr_err(BANNER "Cannot start hwlat kthread\n");
+}
+
+static void hwlat_tracer_stop(struct trace_array *tr)
+{
+	stop_kthread();
+}
+
+static bool hwlat_busy;
+
+static int hwlat_tracer_init(struct trace_array *tr)
+{
+	/* Only allow one instance to enable this */
+	if (hwlat_busy)
+		return -EBUSY;
+
+	hwlat_trace = tr;
+
+	disable_migrate = false;
+	hwlat_data.count = 0;
+	tr->max_latency = 0;
+	save_tracing_thresh = tracing_thresh;
+
+	/* tracing_thresh is in nsecs, we speak in usecs */
+	if (!tracing_thresh)
+		tracing_thresh = last_tracing_thresh;
+
+	if (tracer_tracing_is_on(tr))
+		hwlat_tracer_start(tr);
+
+	hwlat_busy = true;
+
+	return 0;
+}
+
+static void hwlat_tracer_reset(struct trace_array *tr)
+{
+	stop_kthread();
+
+	/* the tracing threshold is static between runs */
+	last_tracing_thresh = tracing_thresh;
+
+	tracing_thresh = save_tracing_thresh;
+	hwlat_busy = false;
+}
+
+static struct tracer hwlat_tracer __read_mostly =
+{
+	.name		= "hwlat",
+	.init		= hwlat_tracer_init,
+	.reset		= hwlat_tracer_reset,
+	.start		= hwlat_tracer_start,
+	.stop		= hwlat_tracer_stop,
+	.allow_instances = true,
+};
+
+__init static int init_hwlat_tracer(void)
+{
+	int ret;
+
+	mutex_init(&hwlat_data.lock);
+
+	ret = register_tracer(&hwlat_tracer);
+	if (ret)
+		return ret;
+
+	init_tracefs();
+
+	return 0;
+}
+late_initcall(init_hwlat_tracer);
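
Once this tracer is active, each detected latency is emitted through the
trace_hwlat_print() routine added to trace_output.c below, so a line in the
trace file looks roughly like this (all numbers invented for illustration;
the reporting task is the hwlatd thread):

    hwlatd-1053  [001] d...   186.093162: #1     inner/outer(us):   13/16    ts:1475000000.123456789 nmi-total:4 nmi-count:1
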
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0bb9cf2d53e6..3fc20422c166 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = {
 	.funcs		= &trace_user_stack_funcs,
 };
 
+/* TRACE_HWLAT */
+static enum print_line_t
+trace_hwlat_print(struct trace_iterator *iter, int flags,
+		  struct trace_event *event)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct hwlat_entry *field;
+
+	trace_assign_type(field, entry);
+
+	trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld",
+			 field->seqnum,
+			 field->duration,
+			 field->outer_duration,
+			 field->timestamp.tv_sec,
+			 field->timestamp.tv_nsec);
+
+	if (field->nmi_count) {
+		/*
+		 * The generic sched_clock() is not NMI safe, thus
+		 * we only record the count and not the time.
+		 */
+		if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK))
+			trace_seq_printf(s, " nmi-total:%llu",
+					 field->nmi_total_ts);
+		trace_seq_printf(s, " nmi-count:%u",
+				 field->nmi_count);
+	}
+
+	trace_seq_putc(s, '\n');
+
+	return trace_handle_return(s);
+}
+
+
+static enum print_line_t
+trace_hwlat_raw(struct trace_iterator *iter, int flags,
+		struct trace_event *event)
+{
+	struct hwlat_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+
+	trace_seq_printf(s, "%llu %lld %ld %09ld %u\n",
+			 field->duration,
+			 field->outer_duration,
+			 field->timestamp.tv_sec,
+			 field->timestamp.tv_nsec,
+			 field->seqnum);
+
+	return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_hwlat_funcs = {
+	.trace		= trace_hwlat_print,
+	.raw		= trace_hwlat_raw,
+};
+
+static struct trace_event trace_hwlat_event = {
+	.type		= TRACE_HWLAT,
+	.funcs		= &trace_hwlat_funcs,
+};
+
 /* TRACE_BPUTS */
 static enum print_line_t
 trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_bputs_event,
 	&trace_bprint_event,
 	&trace_print_event,
+	&trace_hwlat_event,
 	NULL
 };
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7a687320f867..0913693caf6e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -431,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv)
 		pr_info("Probe point is not specified.\n");
 		return -EINVAL;
 	}
-	if (isdigit(argv[1][0])) {
-		pr_info("probe point must be have a filename.\n");
-		return -EINVAL;
-	}
 	arg = strchr(argv[1], ':');
 	if (!arg) {
 		ret = -EINVAL;
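
For reference, a probe point is given as a PATH:OFFSET pair when writing to
the uprobe_events file, and the path is validated when it is actually opened
rather than by this leading-character check. An illustrative definition (path
and offset invented):

    # echo 'p:my_probe /bin/bash:0x4245c0' > /sys/kernel/tracing/uprobe_events
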
diff --git a/scripts/tracing/ftrace-bisect.sh b/scripts/tracing/ftrace-bisect.sh
new file mode 100755
index 000000000000..9ff8ac5fc53c
--- /dev/null
+++ b/scripts/tracing/ftrace-bisect.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+#
+# Here's how to use this:
+#
+# This script is used to help find functions that are being traced by function
+# tracer or function graph tracing that cause the machine to reboot, hang, or
+# crash. Here are the steps to take.
+#
+# First, determine if function tracing is working with a single function:
+#
+#   (note, if this is a problem with function_graph tracing, then simply
+#    replace "function" with "function_graph" in the following steps).
+#
+#  # cd /sys/kernel/debug/tracing
+#  # echo schedule > set_ftrace_filter
+#  # echo function > current_tracer
+#
+# If this works, then we know that something is being traced that shouldn't be.
+#
+#  # echo nop > current_tracer
+#
+#  # cat available_filter_functions > ~/full-file
+#  # ftrace-bisect ~/full-file ~/test-file ~/non-test-file
+#  # cat ~/test-file > set_ftrace_filter
+#
+# *** Note *** this will take several minutes. Setting multiple functions is
+# an O(n^2) operation, and we are dealing with thousands of functions. So go
+# have coffee, talk with your coworkers, read facebook. And eventually, this
+# operation will end.
+#
+#  # echo function > current_tracer
+#
+# If it crashes, we know that ~/test-file has a bad function.
+#
+#   Reboot back to test kernel.
+#
+#  # cd /sys/kernel/debug/tracing
+#  # mv ~/test-file ~/full-file
+#
+# If it didn't crash.
+#
+#  # echo nop > current_tracer
+#  # mv ~/non-test-file ~/full-file
+#
+# Get rid of the other test file from previous run (or save them off somewhere).
+#  # rm -f ~/test-file ~/non-test-file
+#
+# And start again:
+#
+#  # ftrace-bisect ~/full-file ~/test-file ~/non-test-file
+#
+# The good thing is, because this cuts the number of functions in ~/test-file
+# by half, the cat of it into set_ftrace_filter takes half as long each
+# iteration, so don't talk so much at the water cooler the second time.
+#
+# Eventually, if you did this correctly, you will get down to the problem
+# function, and all we need to do is to notrace it.
+#
+# To verify that the problem function is indeed the bad one, just do:
+#
+#  # echo <problem-function> > set_ftrace_notrace
+#  # echo > set_ftrace_filter
+#  # echo function > current_tracer
+#
+# And if it doesn't crash, we are done.
+#
+# If it does crash, do this again (there's more than one problem function)
+# but you need to echo the problem function(s) into set_ftrace_notrace before
+# enabling function tracing in the above steps. Or if you can compile the
+# kernel, annotate the problem functions with "notrace" and start again.
+#
+
+
+if [ $# -ne 3 ]; then
+  echo 'usage: ftrace-bisect full-file test-file non-test-file'
+  exit
+fi
+
+full=$1
+test=$2
+nontest=$3
+
+x=`cat $full | wc -l`
+if [ $x -eq 1 ]; then
+	echo "There's only one function left, must be the bad one"
+	cat $full
+	exit 0
+fi
+
+let x=$x/2
+let y=$x+1
+
+if [ ! -f $full ]; then
+	echo "$full does not exist"
+	exit 1
+fi
+
+if [ -f $test ]; then
+	echo -n "$test exists, delete it? [y/N]"
+	read a
+	if [ "$a" != "y" -a "$a" != "Y" ]; then
+		exit 1
+	fi
+fi
+
+if [ -f $nontest ]; then
+	echo -n "$nontest exists, delete it? [y/N]"
+	read a
+	if [ "$a" != "y" -a "$a" != "Y" ]; then
+		exit 1
+	fi
+fi
+
+sed -ne "1,${x}p" $full > $test
+sed -ne "$y,\$p" $full > $nontest
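
As a quick illustration of the halving the two sed commands perform (function
counts invented), one pass of the script splits full-file in two:

    # wc -l ~/full-file
    30000 /root/full-file
    # ./ftrace-bisect.sh ~/full-file ~/test-file ~/non-test-file
    # wc -l ~/test-file ~/non-test-file
    15000 /root/test-file
    15000 /root/non-test-file
    30000 total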