-rw-r--r--   Documentation/trace/ftrace.txt         |  10
-rw-r--r--   Documentation/trace/hwlat_detector.txt |  79
-rw-r--r--   arch/sh/kernel/ftrace.c                |   4
-rw-r--r--   include/linux/ftrace.h                 |   2
-rw-r--r--   include/linux/ftrace_irq.h             |  31
-rw-r--r--   kernel/trace/Kconfig                   |  35
-rw-r--r--   kernel/trace/Makefile                  |   1
-rw-r--r--   kernel/trace/ftrace.c                  |   6
-rw-r--r--   kernel/trace/trace.c                   |   8
-rw-r--r--   kernel/trace/trace.h                   |   5
-rw-r--r--   kernel/trace/trace_entries.h           |  27
-rw-r--r--   kernel/trace/trace_events_trigger.c    |   1
-rw-r--r--   kernel/trace/trace_functions_graph.c   |   6
-rw-r--r--   kernel/trace/trace_hwlat.c             | 633
-rw-r--r--   kernel/trace/trace_output.c            |  66
-rw-r--r--   kernel/trace/trace_uprobe.c            |   4
-rwxr-xr-x   scripts/tracing/ftrace-bisect.sh       | 115
17 files changed, 1013 insertions(+), 20 deletions(-)
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index a6b3705e62a6..185c39fea2a0 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -858,11 +858,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
| 858 | When enabled, it will account time the task has been | 858 | When enabled, it will account time the task has been |
| 859 | scheduled out as part of the function call. | 859 | scheduled out as part of the function call. |
| 860 | 860 | ||
| 861 | graph-time - When running function graph tracer, to include the | 861 | graph-time - When running function profiler with function graph tracer, |
| 862 | time to call nested functions. When this is not set, | 862 | to include the time to call nested functions. When this is |
| 863 | the time reported for the function will only include | 863 | not set, the time reported for the function will only |
| 864 | the time the function itself executed for, not the time | 864 | include the time the function itself executed for, not the |
| 865 | for functions that it called. | 865 | time for functions that it called. |
| 866 | 866 | ||
| 867 | record-cmd - When any event or tracer is enabled, a hook is enabled | 867 | record-cmd - When any event or tracer is enabled, a hook is enabled |
| 868 | in the sched_switch trace point to fill comm cache | 868 | in the sched_switch trace point to fill comm cache |
diff --git a/Documentation/trace/hwlat_detector.txt b/Documentation/trace/hwlat_detector.txt
new file mode 100644
index 000000000000..3207717a0d1a
--- /dev/null
+++ b/Documentation/trace/hwlat_detector.txt
@@ -0,0 +1,79 @@
| 1 | Introduction: | ||
| 2 | ------------- | ||
| 3 | |||
| 4 | The tracer hwlat_detector is a special purpose tracer that is used to | ||
| 5 | detect large system latencies induced by the behavior of certain underlying | ||
| 6 | hardware or firmware, independent of Linux itself. The code was developed | ||
| 7 | originally to detect SMIs (System Management Interrupts) on x86 systems, | ||
| 8 | however, there is nothing x86-specific about this patchset. It was | ||
| 9 | originally written for use by the "RT" patch since the Real Time | ||
| 10 | kernel is highly latency sensitive. | ||
| 11 | |||
| 12 | SMIs are not serviced by the Linux kernel, which means that it does not | ||
| 13 | even know that they are occurring. SMIs are instead set up by BIOS code | ||
| 14 | and are serviced by BIOS code, usually for "critical" events such as | ||
| 15 | management of thermal sensors and fans. Sometimes though, SMIs are used for | ||
| 16 | other tasks and those tasks can spend an inordinate amount of time in the | ||
| 17 | handler (sometimes measured in milliseconds). Obviously this is a problem if | ||
| 18 | you are trying to keep event service latencies down in the microsecond range. | ||
| 19 | |||
| 20 | The hardware latency detector works by hogging one of the cpus for configurable | ||
| 21 | amounts of time (with interrupts disabled), polling the CPU Time Stamp Counter | ||
| 22 | for some period, then looking for gaps in the TSC data. Any gap indicates a | ||
| 23 | time when the polling was interrupted and since the interrupts are disabled, | ||
| 24 | the only thing that could do that would be an SMI or other hardware hiccup | ||
| 25 | (or an NMI, but those can be tracked). | ||
| 26 | |||
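The idea is simple enough to sketch in ordinary userspace C. The following is a simplified analogue of the get_sample() function added later in this patch, not kernel code: clock_gettime() stands in for the kernel's trace_clock_local(), and userspace obviously cannot disable interrupts, so it only illustrates the two-timestamp gap detection:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t start = now_ns(), t1, t2 = 0, last_t2 = 0;
	uint64_t inner = 0, outer = 0;

	do {
		t1 = now_ns();		/* two back-to-back reads */
		t2 = now_ns();

		/* gap spanning loop iterations (last t2 to this t1) */
		if (last_t2 && t1 - last_t2 > outer)
			outer = t1 - last_t2;
		/* gap between the two adjacent reads */
		if (t2 - t1 > inner)
			inner = t2 - t1;
		last_t2 = t2;
	} while (t2 - start < 500000000ULL);	/* sample for 0.5s */

	printf("max inner/outer gap (ns): %llu/%llu\n",
	       (unsigned long long)inner, (unsigned long long)outer);
	return 0;
}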
| 27 | Note that the hwlat detector should *NEVER* be used in a production environment. | ||
| 28 | It is intended to be run manually to determine if the hardware platform has a | ||
| 29 | problem with long system firmware service routines. | ||
| 30 | |||
| 31 | Usage: | ||
| 32 | ------ | ||
| 33 | |||
| 34 | Write the ASCII text "hwlat" into the current_tracer file of the tracing system | ||
| 35 | (mounted at /sys/kernel/tracing or /sys/kernel/debug/tracing). It is possible to | ||
| 36 | redefine the threshold in microseconds (us) above which latency spikes will | ||
| 37 | be taken into account. | ||
| 38 | |||
| 39 | Example: | ||
| 40 | |||
| 41 | # echo hwlat > /sys/kernel/tracing/current_tracer | ||
| 42 | # echo 100 > /sys/kernel/tracing/tracing_thresh | ||
| 43 | |||
| 44 | The /sys/kernel/tracing/hwlat_detector interface contains the following files: | ||
| 45 | |||
| 46 | width - time period to sample with CPUs held (usecs) | ||
| 47 | must be less than the total window size (enforced) | ||
| 48 | window - total period of sampling, width being inside (usecs) | ||
| 49 | |||
| 50 | By default the width is set to 500,000 and window to 1,000,000, meaning that | ||
| 51 | for every 1,000,000 usecs (1s) the hwlat detector will spin for 500,000 usecs | ||
| 52 | (0.5s). If tracing_thresh contains zero when hwlat tracer is enabled, it will | ||
| 53 | change to a default of 10 usecs. If any latency that exceeds the threshold is | ||
| 54 | observed, the data will be written to the tracing ring buffer. | ||
| 55 | |||
| 56 | The minimum sleep time between periods is 1 millisecond. Even if the gap | ||
| 57 | between width and window is less than 1 millisecond, the thread still sleeps | ||
| 58 | for 1 millisecond, so that the system is not totally starved. | ||
| 59 | |||
| 60 | If tracing_thresh was zero when hwlat detector was started, it will be set | ||
| 61 | back to zero if another tracer is loaded. Note, the last value in | ||
| 62 | tracing_thresh that hwlat detector had will be saved and this value will | ||
| 63 | be restored in tracing_thresh if it is still zero when hwlat detector is | ||
| 64 | started again. | ||
| 65 | |||
| 66 | The following tracing directory files are used by the hwlat_detector: | ||
| 67 | |||
| 68 | in /sys/kernel/tracing: | ||
| 69 | |||
| 70 | tracing_thresh - minimum latency value to be considered (usecs) | ||
| 71 | tracing_max_latency - maximum hardware latency actually observed (usecs) | ||
| 72 | tracing_cpumask - the CPUs to move the hwlat thread across | ||
| 73 | hwlat_detector/width - specified amount of time to spin within window (usecs) | ||
| 74 | hwlat_detector/window - amount of time between (width) runs (usecs) | ||
| 75 | |||
| 76 | The hwlat detector's kernel thread will migrate across each CPU specified in | ||
| 77 | tracing_cpumask between each window. To limit the migration, either modify | ||
| 78 | tracing_cpumask, or modify the hwlat kernel thread (named [hwlatd]) CPU | ||
| 79 | affinity directly, and the migration will stop. | ||
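The same setup can be driven from C rather than the shell. The sketch below is illustrative glue, not part of this patch; the write_file() helper is hypothetical, the /sys/kernel/tracing mount point is assumed, and it must run as root:

#include <stdio.h>
#include <stdlib.h>

#define TRACEFS "/sys/kernel/tracing/"

/* hypothetical helper: write a string to a tracefs control file */
static void write_file(const char *name, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), TRACEFS "%s", name);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	char line[512];
	FILE *pipe;

	write_file("current_tracer", "hwlat");	/* enable the tracer */
	write_file("tracing_thresh", "100");	/* report latencies over 100us */

	pipe = fopen(TRACEFS "trace_pipe", "r");	/* blocks until data arrives */
	if (!pipe) {
		perror(TRACEFS "trace_pipe");
		return 1;
	}
	while (fgets(line, sizeof(line), pipe))
		fputs(line, stdout);	/* print each recorded sample */

	return 0;
}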
diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c
index 95eccd49672f..53783978162e 100644
--- a/arch/sh/kernel/ftrace.c
+++ b/arch/sh/kernel/ftrace.c
@@ -139,7 +139,7 @@ static void ftrace_mod_code(void)
| 139 | clear_mod_flag(); | 139 | clear_mod_flag(); |
| 140 | } | 140 | } |
| 141 | 141 | ||
| 142 | void ftrace_nmi_enter(void) | 142 | void arch_ftrace_nmi_enter(void) |
| 143 | { | 143 | { |
| 144 | if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { | 144 | if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { |
| 145 | smp_rmb(); | 145 | smp_rmb(); |
| @@ -150,7 +150,7 @@ void ftrace_nmi_enter(void) | |||
| 150 | smp_mb(); | 150 | smp_mb(); |
| 151 | } | 151 | } |
| 152 | 152 | ||
| 153 | void ftrace_nmi_exit(void) | 153 | void arch_ftrace_nmi_exit(void) |
| 154 | { | 154 | { |
| 155 | /* Finish all executions before clearing nmi_running */ | 155 | /* Finish all executions before clearing nmi_running */ |
| 156 | smp_mb(); | 156 | smp_mb(); |
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 6f93ac46e7f0..b3d34d3e0e7e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -794,7 +794,9 @@ struct ftrace_ret_stack {
| 794 | unsigned long ret; | 794 | unsigned long ret; |
| 795 | unsigned long func; | 795 | unsigned long func; |
| 796 | unsigned long long calltime; | 796 | unsigned long long calltime; |
| 797 | #ifdef CONFIG_FUNCTION_PROFILER | ||
| 797 | unsigned long long subtime; | 798 | unsigned long long subtime; |
| 799 | #endif | ||
| 798 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST | 800 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
| 799 | unsigned long fp; | 801 | unsigned long fp; |
| 800 | #endif | 802 | #endif |
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index dca7bf8cffe2..4ec2c9b205f2 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -3,11 +3,34 @@
| 3 | 3 | ||
| 4 | 4 | ||
| 5 | #ifdef CONFIG_FTRACE_NMI_ENTER | 5 | #ifdef CONFIG_FTRACE_NMI_ENTER |
| 6 | extern void ftrace_nmi_enter(void); | 6 | extern void arch_ftrace_nmi_enter(void); |
| 7 | extern void ftrace_nmi_exit(void); | 7 | extern void arch_ftrace_nmi_exit(void); |
| 8 | #else | 8 | #else |
| 9 | static inline void ftrace_nmi_enter(void) { } | 9 | static inline void arch_ftrace_nmi_enter(void) { } |
| 10 | static inline void ftrace_nmi_exit(void) { } | 10 | static inline void arch_ftrace_nmi_exit(void) { } |
| 11 | #endif | 11 | #endif |
| 12 | 12 | ||
| 13 | #ifdef CONFIG_HWLAT_TRACER | ||
| 14 | extern bool trace_hwlat_callback_enabled; | ||
| 15 | extern void trace_hwlat_callback(bool enter); | ||
| 16 | #endif | ||
| 17 | |||
| 18 | static inline void ftrace_nmi_enter(void) | ||
| 19 | { | ||
| 20 | #ifdef CONFIG_HWLAT_TRACER | ||
| 21 | if (trace_hwlat_callback_enabled) | ||
| 22 | trace_hwlat_callback(true); | ||
| 23 | #endif | ||
| 24 | arch_ftrace_nmi_enter(); | ||
| 25 | } | ||
| 26 | |||
| 27 | static inline void ftrace_nmi_exit(void) | ||
| 28 | { | ||
| 29 | arch_ftrace_nmi_exit(); | ||
| 30 | #ifdef CONFIG_HWLAT_TRACER | ||
| 31 | if (trace_hwlat_callback_enabled) | ||
| 32 | trace_hwlat_callback(false); | ||
| 33 | #endif | ||
| 34 | } | ||
| 35 | |||
| 13 | #endif /* _LINUX_FTRACE_IRQ_H */ | 36 | #endif /* _LINUX_FTRACE_IRQ_H */ |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ba3326785ca4..2a96b063d659 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -216,6 +216,41 @@ config SCHED_TRACER
| 216 | This tracer tracks the latency of the highest priority task | 216 | This tracer tracks the latency of the highest priority task |
| 217 | to be scheduled in, starting from the point it has woken up. | 217 | to be scheduled in, starting from the point it has woken up. |
| 218 | 218 | ||
| 219 | config HWLAT_TRACER | ||
| 220 | bool "Tracer to detect hardware latencies (like SMIs)" | ||
| 221 | select GENERIC_TRACER | ||
| 222 | help | ||
| 223 | This tracer, when enabled, will create one or more kernel threads, | ||
| 224 | depending on what the cpumask file is set to, with each thread | ||
| 225 | spinning in a loop looking for interruptions caused by | ||
| 226 | something other than the kernel. For example, if a | ||
| 227 | System Management Interrupt (SMI) takes a noticeable amount of | ||
| 228 | time, this tracer will detect it. This is useful for testing | ||
| 229 | if a system is reliable for Real Time tasks. | ||
| 230 | |||
| 231 | Some files are created in the tracing directory when this | ||
| 232 | is enabled: | ||
| 233 | |||
| 234 | hwlat_detector/width - time in usecs for how long to spin | ||
| 235 | hwlat_detector/window - time in usecs between the start of each | ||
| 236 | iteration | ||
| 237 | |||
| 238 | A kernel thread is created that will spin with interrupts disabled | ||
| 239 | for "width" microseconds in every "widow" cycle. It will not spin | ||
| 240 | for "window - width" microseconds, where the system can | ||
| 241 | continue to operate. | ||
| 242 | |||
| 243 | The output will appear in the trace and trace_pipe files. | ||
| 244 | |||
| 245 | When the tracer is not running, it has no effect on the system, | ||
| 246 | but when it is running, it can cause the system to be | ||
| 247 | periodically non-responsive. Do not run this tracer on a | ||
| 248 | production system. | ||
| 249 | |||
| 250 | To enable this tracer, echo "hwlat" into the current_tracer | ||
| 251 | file. Every time a latency is greater than tracing_thresh, it will | ||
| 252 | be recorded into the ring buffer. | ||
| 253 | |||
| 219 | config ENABLE_DEFAULT_TRACERS | 254 | config ENABLE_DEFAULT_TRACERS |
| 220 | bool "Trace process context switches and events" | 255 | bool "Trace process context switches and events" |
| 221 | depends on !GENERIC_TRACER | 256 | depends on !GENERIC_TRACER |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d0a1617b52b4..992ab9d99f35 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
| 41 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o | 41 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o |
| 42 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o | 42 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o |
| 43 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o | 43 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o |
| 44 | obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o | ||
| 44 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o | 45 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o |
| 45 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o | 46 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o |
| 46 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 47 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 84752c8e28b5..2050a7652a86 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
| 872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) | 873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) |
| 874 | { | 874 | { |
| 875 | int index = trace->depth; | ||
| 876 | |||
| 875 | function_profile_call(trace->func, 0, NULL, NULL); | 877 | function_profile_call(trace->func, 0, NULL, NULL); |
| 878 | |||
| 879 | if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) | ||
| 880 | current->ret_stack[index].subtime = 0; | ||
| 881 | |||
| 876 | return 1; | 882 | return 1; |
| 877 | } | 883 | } |
| 878 | 884 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 37824d98ae71..8696ce6bf2f6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void)
| 1047 | * | 1047 | * |
| 1048 | * Shows real state of the ring buffer if it is enabled or not. | 1048 | * Shows real state of the ring buffer if it is enabled or not. |
| 1049 | */ | 1049 | */ |
| 1050 | static int tracer_tracing_is_on(struct trace_array *tr) | 1050 | int tracer_tracing_is_on(struct trace_array *tr) |
| 1051 | { | 1051 | { |
| 1052 | if (tr->trace_buffer.buffer) | 1052 | if (tr->trace_buffer.buffer) |
| 1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); | 1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); |
| @@ -4969,7 +4969,7 @@ out: | |||
| 4969 | return ret; | 4969 | return ret; |
| 4970 | } | 4970 | } |
| 4971 | 4971 | ||
| 4972 | #ifdef CONFIG_TRACER_MAX_TRACE | 4972 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 4973 | 4973 | ||
| 4974 | static ssize_t | 4974 | static ssize_t |
| 4975 | tracing_max_lat_read(struct file *filp, char __user *ubuf, | 4975 | tracing_max_lat_read(struct file *filp, char __user *ubuf, |
| @@ -5892,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = { | |||
| 5892 | .llseek = generic_file_llseek, | 5892 | .llseek = generic_file_llseek, |
| 5893 | }; | 5893 | }; |
| 5894 | 5894 | ||
| 5895 | #ifdef CONFIG_TRACER_MAX_TRACE | 5895 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 5896 | static const struct file_operations tracing_max_lat_fops = { | 5896 | static const struct file_operations tracing_max_lat_fops = { |
| 5897 | .open = tracing_open_generic, | 5897 | .open = tracing_open_generic, |
| 5898 | .read = tracing_max_lat_read, | 5898 | .read = tracing_max_lat_read, |
| @@ -7222,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 7222 | 7222 | ||
| 7223 | create_trace_options_dir(tr); | 7223 | create_trace_options_dir(tr); |
| 7224 | 7224 | ||
| 7225 | #ifdef CONFIG_TRACER_MAX_TRACE | 7225 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 7226 | trace_create_file("tracing_max_latency", 0644, d_tracer, | 7226 | trace_create_file("tracing_max_latency", 0644, d_tracer, |
| 7227 | &tr->max_latency, &tracing_max_lat_fops); | 7227 | &tr->max_latency, &tracing_max_lat_fops); |
| 7228 | #endif | 7228 | #endif |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f783df416726..fd24b1f9ac43 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,6 +38,7 @@ enum trace_type {
| 38 | TRACE_USER_STACK, | 38 | TRACE_USER_STACK, |
| 39 | TRACE_BLK, | 39 | TRACE_BLK, |
| 40 | TRACE_BPUTS, | 40 | TRACE_BPUTS, |
| 41 | TRACE_HWLAT, | ||
| 41 | 42 | ||
| 42 | __TRACE_LAST_TYPE, | 43 | __TRACE_LAST_TYPE, |
| 43 | }; | 44 | }; |
| @@ -213,6 +214,8 @@ struct trace_array { | |||
| 213 | */ | 214 | */ |
| 214 | struct trace_buffer max_buffer; | 215 | struct trace_buffer max_buffer; |
| 215 | bool allocated_snapshot; | 216 | bool allocated_snapshot; |
| 217 | #endif | ||
| 218 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) | ||
| 216 | unsigned long max_latency; | 219 | unsigned long max_latency; |
| 217 | #endif | 220 | #endif |
| 218 | struct trace_pid_list __rcu *filtered_pids; | 221 | struct trace_pid_list __rcu *filtered_pids; |
| @@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void); | |||
| 326 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 329 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
| 327 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 330 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
| 328 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ | 331 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ |
| 332 | IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ | ||
| 329 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 333 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
| 330 | TRACE_MMIO_RW); \ | 334 | TRACE_MMIO_RW); \ |
| 331 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 335 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
| @@ -571,6 +575,7 @@ void tracing_reset_current(int cpu); | |||
| 571 | void tracing_reset_all_online_cpus(void); | 575 | void tracing_reset_all_online_cpus(void); |
| 572 | int tracing_open_generic(struct inode *inode, struct file *filp); | 576 | int tracing_open_generic(struct inode *inode, struct file *filp); |
| 573 | bool tracing_is_disabled(void); | 577 | bool tracing_is_disabled(void); |
| 578 | int tracer_tracing_is_on(struct trace_array *tr); | ||
| 574 | struct dentry *trace_create_file(const char *name, | 579 | struct dentry *trace_create_file(const char *name, |
| 575 | umode_t mode, | 580 | umode_t mode, |
| 576 | struct dentry *parent, | 581 | struct dentry *parent, |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 5c30efcda5e6..d1cc37e78f99 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch,
| 322 | FILTER_OTHER | 322 | FILTER_OTHER |
| 323 | ); | 323 | ); |
| 324 | 324 | ||
| 325 | |||
| 326 | FTRACE_ENTRY(hwlat, hwlat_entry, | ||
| 327 | |||
| 328 | TRACE_HWLAT, | ||
| 329 | |||
| 330 | F_STRUCT( | ||
| 331 | __field( u64, duration ) | ||
| 332 | __field( u64, outer_duration ) | ||
| 333 | __field( u64, nmi_total_ts ) | ||
| 334 | __field_struct( struct timespec, timestamp ) | ||
| 335 | __field_desc( long, timestamp, tv_sec ) | ||
| 336 | __field_desc( long, timestamp, tv_nsec ) | ||
| 337 | __field( unsigned int, nmi_count ) | ||
| 338 | __field( unsigned int, seqnum ) | ||
| 339 | ), | ||
| 340 | |||
| 341 | F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n", | ||
| 342 | __entry->seqnum, | ||
| 343 | __entry->tv_sec, | ||
| 344 | __entry->tv_nsec, | ||
| 345 | __entry->duration, | ||
| 346 | __entry->outer_duration, | ||
| 347 | __entry->nmi_total_ts, | ||
| 348 | __entry->nmi_count), | ||
| 349 | |||
| 350 | FILTER_OTHER | ||
| 351 | ); | ||
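For readers who do not want to page through the FTRACE_ENTRY() machinery: the invocation above generates, roughly, the record layout below. This is a sketch of the expansion, not literal kernel source; the struct trace_entry header is prepended by the macro, and the __field_desc() lines only describe members of timestamp rather than adding new fields:

struct hwlat_entry {
	struct trace_entry	ent;		/* common header: type, flags, pid */
	u64			duration;	/* worst inner-loop delta (us) */
	u64			outer_duration;	/* worst outer-loop delta (us) */
	u64			nmi_total_ts;	/* total time spent in NMIs */
	struct timespec		timestamp;	/* wall time of the sample */
	unsigned int		nmi_count;	/* NMIs seen while sampling */
	unsigned int		seqnum;		/* sample sequence number */
};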
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a975571cde24..6721a1e89f39 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = {
| 1028 | static struct event_command trigger_traceoff_cmd = { | 1028 | static struct event_command trigger_traceoff_cmd = { |
| 1029 | .name = "traceoff", | 1029 | .name = "traceoff", |
| 1030 | .trigger_type = ETT_TRACE_ONOFF, | 1030 | .trigger_type = ETT_TRACE_ONOFF, |
| 1031 | .flags = EVENT_CMD_FL_POST_TRIGGER, | ||
| 1031 | .func = event_trigger_callback, | 1032 | .func = event_trigger_callback, |
| 1032 | .reg = register_trigger, | 1033 | .reg = register_trigger, |
| 1033 | .unreg = unregister_trigger, | 1034 | .unreg = unregister_trigger, |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0cbe38a844fa..4e480e870474 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -170,7 +170,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
| 170 | current->ret_stack[index].ret = ret; | 170 | current->ret_stack[index].ret = ret; |
| 171 | current->ret_stack[index].func = func; | 171 | current->ret_stack[index].func = func; |
| 172 | current->ret_stack[index].calltime = calltime; | 172 | current->ret_stack[index].calltime = calltime; |
| 173 | current->ret_stack[index].subtime = 0; | ||
| 174 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST | 173 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
| 175 | current->ret_stack[index].fp = frame_pointer; | 174 | current->ret_stack[index].fp = frame_pointer; |
| 176 | #endif | 175 | #endif |
| @@ -1183,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1183 | trace_seq_puts(s, "/* "); | 1182 | trace_seq_puts(s, "/* "); |
| 1184 | 1183 | ||
| 1185 | switch (iter->ent->type) { | 1184 | switch (iter->ent->type) { |
| 1185 | case TRACE_BPUTS: | ||
| 1186 | ret = trace_print_bputs_msg_only(iter); | ||
| 1187 | if (ret != TRACE_TYPE_HANDLED) | ||
| 1188 | return ret; | ||
| 1189 | break; | ||
| 1186 | case TRACE_BPRINT: | 1190 | case TRACE_BPRINT: |
| 1187 | ret = trace_print_bprintk_msg_only(iter); | 1191 | ret = trace_print_bprintk_msg_only(iter); |
| 1188 | if (ret != TRACE_TYPE_HANDLED) | 1192 | if (ret != TRACE_TYPE_HANDLED) |
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
new file mode 100644
index 000000000000..b97286c48735
--- /dev/null
+++ b/kernel/trace/trace_hwlat.c
@@ -0,0 +1,633 @@
| 1 | /* | ||
| 2 | * trace_hwlatdetect.c - A simple Hardware Latency detector. | ||
| 3 | * | ||
| 4 | * Use this tracer to detect large system latencies induced by the behavior of | ||
| 5 | * certain underlying system hardware or firmware, independent of Linux itself. | ||
| 6 | * The code was developed originally to detect the presence of SMIs on Intel | ||
| 7 | * and AMD systems, although there is no dependency upon x86 herein. | ||
| 8 | * | ||
| 9 | * The classical example usage of this tracer is in detecting the presence of | ||
| 10 | * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a | ||
| 11 | * somewhat special form of hardware interrupt spawned from earlier CPU debug | ||
| 12 | * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge | ||
| 13 | * LPC (or other device) to generate a special interrupt under certain | ||
| 14 | * circumstances, for example, upon expiration of a special SMI timer device, | ||
| 15 | * due to certain external thermal readings, on certain I/O address accesses, | ||
| 16 | * and other situations. An SMI hits a special CPU pin, triggers a special | ||
| 17 | * SMI mode (complete with special memory map), and the OS is unaware. | ||
| 18 | * | ||
| 19 | * Although certain hardware-inducing latencies are necessary (for example, | ||
| 20 | * a modern system often requires an SMI handler for correct thermal control | ||
| 21 | * and remote management) they can wreak havoc upon any OS-level performance | ||
| 22 | * guarantees toward low-latency, especially when the OS is not even made | ||
| 23 | * aware of the presence of these interrupts. For this reason, we need a | ||
| 24 | * somewhat brute force mechanism to detect these interrupts. In this case, | ||
| 25 | * we do it by hogging all of the CPU(s) for configurable timer intervals, | ||
| 26 | * sampling the built-in CPU timer, looking for discontiguous readings. | ||
| 27 | * | ||
| 28 | * WARNING: This implementation necessarily introduces latencies. Therefore, | ||
| 29 | * you should NEVER use this tracer while running in a production | ||
| 30 | * environment requiring any kind of low-latency performance | ||
| 31 | * guarantee(s). | ||
| 32 | * | ||
| 33 | * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> | ||
| 34 | * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com> | ||
| 35 | * | ||
| 36 | * Includes useful feedback from Clark Williams <clark@redhat.com> | ||
| 37 | * | ||
| 38 | * This file is licensed under the terms of the GNU General Public | ||
| 39 | * License version 2. This program is licensed "as is" without any | ||
| 40 | * warranty of any kind, whether express or implied. | ||
| 41 | */ | ||
| 42 | #include <linux/kthread.h> | ||
| 43 | #include <linux/tracefs.h> | ||
| 44 | #include <linux/uaccess.h> | ||
| 45 | #include <linux/cpumask.h> | ||
| 46 | #include <linux/delay.h> | ||
| 47 | #include "trace.h" | ||
| 48 | |||
| 49 | static struct trace_array *hwlat_trace; | ||
| 50 | |||
| 51 | #define U64STR_SIZE 22 /* 20 digits max */ | ||
| 52 | |||
| 53 | #define BANNER "hwlat_detector: " | ||
| 54 | #define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ | ||
| 55 | #define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ | ||
| 56 | #define DEFAULT_LAT_THRESHOLD 10 /* 10us */ | ||
| 57 | |||
| 58 | /* sampling thread */ | ||
| 59 | static struct task_struct *hwlat_kthread; | ||
| 60 | |||
| 61 | static struct dentry *hwlat_sample_width; /* sample width us */ | ||
| 62 | static struct dentry *hwlat_sample_window; /* sample window us */ | ||
| 63 | |||
| 64 | /* Save the previous tracing_thresh value */ | ||
| 65 | static unsigned long save_tracing_thresh; | ||
| 66 | |||
| 67 | /* NMI timestamp counters */ | ||
| 68 | static u64 nmi_ts_start; | ||
| 69 | static u64 nmi_total_ts; | ||
| 70 | static int nmi_count; | ||
| 71 | static int nmi_cpu; | ||
| 72 | |||
| 73 | /* Tells NMIs to call back to the hwlat tracer to record timestamps */ | ||
| 74 | bool trace_hwlat_callback_enabled; | ||
| 75 | |||
| 76 | /* If the user changed threshold, remember it */ | ||
| 77 | static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; | ||
| 78 | |||
| 79 | /* Individual latency samples are stored here when detected. */ | ||
| 80 | struct hwlat_sample { | ||
| 81 | u64 seqnum; /* unique sequence */ | ||
| 82 | u64 duration; /* delta */ | ||
| 83 | u64 outer_duration; /* delta (outer loop) */ | ||
| 84 | u64 nmi_total_ts; /* Total time spent in NMIs */ | ||
| 85 | struct timespec timestamp; /* wall time */ | ||
| 86 | int nmi_count; /* # NMIs during this sample */ | ||
| 87 | }; | ||
| 88 | |||
| 89 | /* keep the global state somewhere. */ | ||
| 90 | static struct hwlat_data { | ||
| 91 | |||
| 92 | struct mutex lock; /* protect changes */ | ||
| 93 | |||
| 94 | u64 count; /* total since reset */ | ||
| 95 | |||
| 96 | u64 sample_window; /* total sampling window (on+off) */ | ||
| 97 | u64 sample_width; /* active sampling portion of window */ | ||
| 98 | |||
| 99 | } hwlat_data = { | ||
| 100 | .sample_window = DEFAULT_SAMPLE_WINDOW, | ||
| 101 | .sample_width = DEFAULT_SAMPLE_WIDTH, | ||
| 102 | }; | ||
| 103 | |||
| 104 | static void trace_hwlat_sample(struct hwlat_sample *sample) | ||
| 105 | { | ||
| 106 | struct trace_array *tr = hwlat_trace; | ||
| 107 | struct trace_event_call *call = &event_hwlat; | ||
| 108 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 109 | struct ring_buffer_event *event; | ||
| 110 | struct hwlat_entry *entry; | ||
| 111 | unsigned long flags; | ||
| 112 | int pc; | ||
| 113 | |||
| 114 | pc = preempt_count(); | ||
| 115 | local_save_flags(flags); | ||
| 116 | |||
| 117 | event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), | ||
| 118 | flags, pc); | ||
| 119 | if (!event) | ||
| 120 | return; | ||
| 121 | entry = ring_buffer_event_data(event); | ||
| 122 | entry->seqnum = sample->seqnum; | ||
| 123 | entry->duration = sample->duration; | ||
| 124 | entry->outer_duration = sample->outer_duration; | ||
| 125 | entry->timestamp = sample->timestamp; | ||
| 126 | entry->nmi_total_ts = sample->nmi_total_ts; | ||
| 127 | entry->nmi_count = sample->nmi_count; | ||
| 128 | |||
| 129 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 130 | __buffer_unlock_commit(buffer, event); | ||
| 131 | } | ||
| 132 | |||
| 133 | /* Macros to encapsulate the time capturing infrastructure */ | ||
| 134 | #define time_type u64 | ||
| 135 | #define time_get() trace_clock_local() | ||
| 136 | #define time_to_us(x) div_u64(x, 1000) | ||
| 137 | #define time_sub(a, b) ((a) - (b)) | ||
| 138 | #define init_time(a, b) (a = b) | ||
| 139 | #define time_u64(a) a | ||
| 140 | |||
| 141 | void trace_hwlat_callback(bool enter) | ||
| 142 | { | ||
| 143 | if (smp_processor_id() != nmi_cpu) | ||
| 144 | return; | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Currently trace_clock_local() calls sched_clock() and the | ||
| 148 | * generic version is not NMI safe. | ||
| 149 | */ | ||
| 150 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { | ||
| 151 | if (enter) | ||
| 152 | nmi_ts_start = time_get(); | ||
| 153 | else | ||
| 154 | nmi_total_ts = time_get() - nmi_ts_start; | ||
| 155 | } | ||
| 156 | |||
| 157 | if (enter) | ||
| 158 | nmi_count++; | ||
| 159 | } | ||
| 160 | |||
| 161 | /** | ||
| 162 | * get_sample - sample the CPU TSC and look for likely hardware latencies | ||
| 163 | * | ||
| 164 | * Used to repeatedly capture the CPU TSC (or similar), looking for potential | ||
| 165 | * hardware-induced latency. Called by the sampling kthread with | ||
| 166 | * interrupts disabled. | ||
| 167 | */ | ||
| 168 | static int get_sample(void) | ||
| 169 | { | ||
| 170 | struct trace_array *tr = hwlat_trace; | ||
| 171 | time_type start, t1, t2, last_t2; | ||
| 172 | s64 diff, total, last_total = 0; | ||
| 173 | u64 sample = 0; | ||
| 174 | u64 thresh = tracing_thresh; | ||
| 175 | u64 outer_sample = 0; | ||
| 176 | int ret = -1; | ||
| 177 | |||
| 178 | do_div(thresh, NSEC_PER_USEC); /* modifies thresh value */ | ||
| 179 | |||
| 180 | nmi_cpu = smp_processor_id(); | ||
| 181 | nmi_total_ts = 0; | ||
| 182 | nmi_count = 0; | ||
| 183 | /* Make sure NMIs see this first */ | ||
| 184 | barrier(); | ||
| 185 | |||
| 186 | trace_hwlat_callback_enabled = true; | ||
| 187 | |||
| 188 | init_time(last_t2, 0); | ||
| 189 | start = time_get(); /* start timestamp */ | ||
| 190 | |||
| 191 | do { | ||
| 192 | |||
| 193 | t1 = time_get(); /* we'll look for a discontinuity */ | ||
| 194 | t2 = time_get(); | ||
| 195 | |||
| 196 | if (time_u64(last_t2)) { | ||
| 197 | /* Check the delta from outer loop (t2 to next t1) */ | ||
| 198 | diff = time_to_us(time_sub(t1, last_t2)); | ||
| 199 | /* This shouldn't happen */ | ||
| 200 | if (diff < 0) { | ||
| 201 | pr_err(BANNER "time running backwards\n"); | ||
| 202 | goto out; | ||
| 203 | } | ||
| 204 | if (diff > outer_sample) | ||
| 205 | outer_sample = diff; | ||
| 206 | } | ||
| 207 | last_t2 = t2; | ||
| 208 | |||
| 209 | total = time_to_us(time_sub(t2, start)); /* sample width */ | ||
| 210 | |||
| 211 | /* Check for possible overflows */ | ||
| 212 | if (total < last_total) { | ||
| 213 | pr_err("Time total overflowed\n"); | ||
| 214 | break; | ||
| 215 | } | ||
| 216 | last_total = total; | ||
| 217 | |||
| 218 | /* This checks the inner loop (t1 to t2) */ | ||
| 219 | diff = time_to_us(time_sub(t2, t1)); /* current diff */ | ||
| 220 | |||
| 221 | /* This shouldn't happen */ | ||
| 222 | if (diff < 0) { | ||
| 223 | pr_err(BANNER "time running backwards\n"); | ||
| 224 | goto out; | ||
| 225 | } | ||
| 226 | |||
| 227 | if (diff > sample) | ||
| 228 | sample = diff; /* only want highest value */ | ||
| 229 | |||
| 230 | } while (total <= hwlat_data.sample_width); | ||
| 231 | |||
| 232 | barrier(); /* finish the above in the view for NMIs */ | ||
| 233 | trace_hwlat_callback_enabled = false; | ||
| 234 | barrier(); /* Make sure nmi_total_ts is no longer updated */ | ||
| 235 | |||
| 236 | ret = 0; | ||
| 237 | |||
| 238 | /* If we exceed the threshold value, we have found a hardware latency */ | ||
| 239 | if (sample > thresh || outer_sample > thresh) { | ||
| 240 | struct hwlat_sample s; | ||
| 241 | |||
| 242 | ret = 1; | ||
| 243 | |||
| 244 | /* We read in microseconds */ | ||
| 245 | if (nmi_total_ts) | ||
| 246 | do_div(nmi_total_ts, NSEC_PER_USEC); | ||
| 247 | |||
| 248 | hwlat_data.count++; | ||
| 249 | s.seqnum = hwlat_data.count; | ||
| 250 | s.duration = sample; | ||
| 251 | s.outer_duration = outer_sample; | ||
| 252 | s.timestamp = CURRENT_TIME; | ||
| 253 | s.nmi_total_ts = nmi_total_ts; | ||
| 254 | s.nmi_count = nmi_count; | ||
| 255 | trace_hwlat_sample(&s); | ||
| 256 | |||
| 257 | /* Keep a running maximum ever recorded hardware latency */ | ||
| 258 | if (sample > tr->max_latency) | ||
| 259 | tr->max_latency = sample; | ||
| 260 | } | ||
| 261 | |||
| 262 | out: | ||
| 263 | return ret; | ||
| 264 | } | ||
| 265 | |||
| 266 | static struct cpumask save_cpumask; | ||
| 267 | static bool disable_migrate; | ||
| 268 | |||
| 269 | static void move_to_next_cpu(void) | ||
| 270 | { | ||
| 271 | static struct cpumask *current_mask; | ||
| 272 | int next_cpu; | ||
| 273 | |||
| 274 | if (disable_migrate) | ||
| 275 | return; | ||
| 276 | |||
| 277 | /* Just pick the first CPU on first iteration */ | ||
| 278 | if (!current_mask) { | ||
| 279 | current_mask = &save_cpumask; | ||
| 280 | get_online_cpus(); | ||
| 281 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
| 282 | put_online_cpus(); | ||
| 283 | next_cpu = cpumask_first(current_mask); | ||
| 284 | goto set_affinity; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | * If for some reason the user modifies the CPU affinity | ||
| 289 | * of this thread, then stop migrating for the duration | ||
| 290 | * of the current test. | ||
| 291 | */ | ||
| 292 | if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) | ||
| 293 | goto disable; | ||
| 294 | |||
| 295 | get_online_cpus(); | ||
| 296 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
| 297 | next_cpu = cpumask_next(smp_processor_id(), current_mask); | ||
| 298 | put_online_cpus(); | ||
| 299 | |||
| 300 | if (next_cpu >= nr_cpu_ids) | ||
| 301 | next_cpu = cpumask_first(current_mask); | ||
| 302 | |||
| 303 | set_affinity: | ||
| 304 | if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ | ||
| 305 | goto disable; | ||
| 306 | |||
| 307 | cpumask_clear(current_mask); | ||
| 308 | cpumask_set_cpu(next_cpu, current_mask); | ||
| 309 | |||
| 310 | sched_setaffinity(0, current_mask); | ||
| 311 | return; | ||
| 312 | |||
| 313 | disable: | ||
| 314 | disable_migrate = true; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | ||
| 318 | * kthread_fn - The CPU time sampling/hardware latency detection kernel thread | ||
| 319 | * | ||
| 320 | * Used to periodically sample the CPU TSC via a call to get_sample. We | ||
| 321 | * disable interrupts, which does (intentionally) introduce latency since we | ||
| 322 | * need to ensure nothing else might be running (and thus preempting). | ||
| 323 | * Obviously this should never be used in production environments. | ||
| 324 | * | ||
| 325 | * Currently this runs on whichever CPU it was scheduled on, but most | ||
| 326 | * real-world hardware latency situations occur across several CPUs | ||
| 327 | * anyway. We might later generalize this if we find there are any actual | ||
| 328 | * systems with alternate SMI delivery or other hardware latencies. | ||
| 329 | */ | ||
| 330 | static int kthread_fn(void *data) | ||
| 331 | { | ||
| 332 | u64 interval; | ||
| 333 | |||
| 334 | while (!kthread_should_stop()) { | ||
| 335 | |||
| 336 | move_to_next_cpu(); | ||
| 337 | |||
| 338 | local_irq_disable(); | ||
| 339 | get_sample(); | ||
| 340 | local_irq_enable(); | ||
| 341 | |||
| 342 | mutex_lock(&hwlat_data.lock); | ||
| 343 | interval = hwlat_data.sample_window - hwlat_data.sample_width; | ||
| 344 | mutex_unlock(&hwlat_data.lock); | ||
| 345 | |||
| 346 | do_div(interval, USEC_PER_MSEC); /* modifies interval value */ | ||
| 347 | |||
| 348 | /* Always sleep for at least 1ms */ | ||
| 349 | if (interval < 1) | ||
| 350 | interval = 1; | ||
| 351 | |||
| 352 | if (msleep_interruptible(interval)) | ||
| 353 | break; | ||
| 354 | } | ||
| 355 | |||
| 356 | return 0; | ||
| 357 | } | ||
| 358 | |||
| 359 | /** | ||
| 360 | * start_kthread - Kick off the hardware latency sampling/detector kthread | ||
| 361 | * | ||
| 362 | * This starts the kernel thread that will sit and sample the CPU timestamp | ||
| 363 | * counter (TSC or similar) and look for potential hardware latencies. | ||
| 364 | */ | ||
| 365 | static int start_kthread(struct trace_array *tr) | ||
| 366 | { | ||
| 367 | struct task_struct *kthread; | ||
| 368 | |||
| 369 | kthread = kthread_create(kthread_fn, NULL, "hwlatd"); | ||
| 370 | if (IS_ERR(kthread)) { | ||
| 371 | pr_err(BANNER "could not start sampling thread\n"); | ||
| 372 | return -ENOMEM; | ||
| 373 | } | ||
| 374 | hwlat_kthread = kthread; | ||
| 375 | wake_up_process(kthread); | ||
| 376 | |||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | /** | ||
| 381 | * stop_kthread - Inform the hardware latency sampling/detector kthread to stop | ||
| 382 | * | ||
| 383 | * This kicks the running hardware latency sampling/detector kernel thread and | ||
| 384 | * tells it to stop sampling now. Use this on unload and at system shutdown. | ||
| 385 | */ | ||
| 386 | static void stop_kthread(void) | ||
| 387 | { | ||
| 388 | if (!hwlat_kthread) | ||
| 389 | return; | ||
| 390 | kthread_stop(hwlat_kthread); | ||
| 391 | hwlat_kthread = NULL; | ||
| 392 | } | ||
| 393 | |||
| 394 | /* | ||
| 395 | * hwlat_read - Wrapper read function for reading both window and width | ||
| 396 | * @filp: The active open file structure | ||
| 397 | * @ubuf: The userspace provided buffer to read value into | ||
| 398 | * @cnt: The maximum number of bytes to read | ||
| 399 | * @ppos: The current "file" position | ||
| 400 | * | ||
| 401 | * This function provides a generic read implementation for the global state | ||
| 402 | * "hwlat_data" structure filesystem entries. | ||
| 403 | */ | ||
| 404 | static ssize_t hwlat_read(struct file *filp, char __user *ubuf, | ||
| 405 | size_t cnt, loff_t *ppos) | ||
| 406 | { | ||
| 407 | char buf[U64STR_SIZE]; | ||
| 408 | u64 *entry = filp->private_data; | ||
| 409 | u64 val; | ||
| 410 | int len; | ||
| 411 | |||
| 412 | if (!entry) | ||
| 413 | return -EFAULT; | ||
| 414 | |||
| 415 | if (cnt > sizeof(buf)) | ||
| 416 | cnt = sizeof(buf); | ||
| 417 | |||
| 418 | val = *entry; | ||
| 419 | |||
| 420 | len = snprintf(buf, sizeof(buf), "%llu\n", val); | ||
| 421 | |||
| 422 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); | ||
| 423 | } | ||
| 424 | |||
| 425 | /** | ||
| 426 | * hwlat_width_write - Write function for "width" entry | ||
| 427 | * @filp: The active open file structure | ||
| 428 | * @ubuf: The user buffer that contains the value to write | ||
| 429 | * @cnt: The maximum number of bytes to write to "file" | ||
| 430 | * @ppos: The current position in @file | ||
| 431 | * | ||
| 432 | * This function provides a write implementation for the "width" interface | ||
| 433 | * to the hardware latency detector. It can be used to configure | ||
| 434 | * how many usecs of the total window we will actively sample for any | ||
| 435 | * hardware-induced latency periods. Obviously, it is not possible to | ||
| 436 | * sample constantly and still have the system respond to a sample | ||
| 437 | * reader, or, worse, have the system appear to have gone out to lunch. | ||
| 438 | * It is enforced that width is less than the total window size. | ||
| 439 | */ | ||
| 440 | static ssize_t | ||
| 441 | hwlat_width_write(struct file *filp, const char __user *ubuf, | ||
| 442 | size_t cnt, loff_t *ppos) | ||
| 443 | { | ||
| 444 | u64 val; | ||
| 445 | int err; | ||
| 446 | |||
| 447 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
| 448 | if (err) | ||
| 449 | return err; | ||
| 450 | |||
| 451 | mutex_lock(&hwlat_data.lock); | ||
| 452 | if (val < hwlat_data.sample_window) | ||
| 453 | hwlat_data.sample_width = val; | ||
| 454 | else | ||
| 455 | err = -EINVAL; | ||
| 456 | mutex_unlock(&hwlat_data.lock); | ||
| 457 | |||
| 458 | if (err) | ||
| 459 | return err; | ||
| 460 | |||
| 461 | return cnt; | ||
| 462 | } | ||
| 463 | |||
| 464 | /** | ||
| 465 | * hwlat_window_write - Write function for "window" entry | ||
| 466 | * @filp: The active open file structure | ||
| 467 | * @ubuf: The user buffer that contains the value to write | ||
| 468 | * @cnt: The maximum number of bytes to write to "file" | ||
| 469 | * @ppos: The current position in @file | ||
| 470 | * | ||
| 471 | * This function provides a write implementation for the "window" interface | ||
| 472 | * to the hardware latency detector. The window is the total time | ||
| 473 | * in us that will be considered one sample period. Conceptually, windows | ||
| 474 | * occur back-to-back and contain a sample width period during which | ||
| 475 | * actual sampling occurs. Can be used to write a new total window size. It | ||
| 476 | * is enforced that any value written must be greater than the sample width | ||
| 477 | * size, or an error results. | ||
| 478 | */ | ||
| 479 | static ssize_t | ||
| 480 | hwlat_window_write(struct file *filp, const char __user *ubuf, | ||
| 481 | size_t cnt, loff_t *ppos) | ||
| 482 | { | ||
| 483 | u64 val; | ||
| 484 | int err; | ||
| 485 | |||
| 486 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
| 487 | if (err) | ||
| 488 | return err; | ||
| 489 | |||
| 490 | mutex_lock(&hwlat_data.lock); | ||
| 491 | if (hwlat_data.sample_width < val) | ||
| 492 | hwlat_data.sample_window = val; | ||
| 493 | else | ||
| 494 | err = -EINVAL; | ||
| 495 | mutex_unlock(&hwlat_data.lock); | ||
| 496 | |||
| 497 | if (err) | ||
| 498 | return err; | ||
| 499 | |||
| 500 | return cnt; | ||
| 501 | } | ||
| 502 | |||
| 503 | static const struct file_operations width_fops = { | ||
| 504 | .open = tracing_open_generic, | ||
| 505 | .read = hwlat_read, | ||
| 506 | .write = hwlat_width_write, | ||
| 507 | }; | ||
| 508 | |||
| 509 | static const struct file_operations window_fops = { | ||
| 510 | .open = tracing_open_generic, | ||
| 511 | .read = hwlat_read, | ||
| 512 | .write = hwlat_window_write, | ||
| 513 | }; | ||
| 514 | |||
| 515 | /** | ||
| 516 | * init_tracefs - A function to initialize the tracefs interface files | ||
| 517 | * | ||
| 518 | * This function creates entries in tracefs for "hwlat_detector". | ||
| 519 | * It creates the hwlat_detector directory in the tracing directory, | ||
| 520 | * and within that directory are the width and window files used to | ||
| 521 | * change and view those values. | ||
| 522 | */ | ||
| 523 | static int init_tracefs(void) | ||
| 524 | { | ||
| 525 | struct dentry *d_tracer; | ||
| 526 | struct dentry *top_dir; | ||
| 527 | |||
| 528 | d_tracer = tracing_init_dentry(); | ||
| 529 | if (IS_ERR(d_tracer)) | ||
| 530 | return -ENOMEM; | ||
| 531 | |||
| 532 | top_dir = tracefs_create_dir("hwlat_detector", d_tracer); | ||
| 533 | if (!top_dir) | ||
| 534 | return -ENOMEM; | ||
| 535 | |||
| 536 | hwlat_sample_window = tracefs_create_file("window", 0640, | ||
| 537 | top_dir, | ||
| 538 | &hwlat_data.sample_window, | ||
| 539 | &window_fops); | ||
| 540 | if (!hwlat_sample_window) | ||
| 541 | goto err; | ||
| 542 | |||
| 543 | hwlat_sample_width = tracefs_create_file("width", 0644, | ||
| 544 | top_dir, | ||
| 545 | &hwlat_data.sample_width, | ||
| 546 | &width_fops); | ||
| 547 | if (!hwlat_sample_width) | ||
| 548 | goto err; | ||
| 549 | |||
| 550 | return 0; | ||
| 551 | |||
| 552 | err: | ||
| 553 | tracefs_remove_recursive(top_dir); | ||
| 554 | return -ENOMEM; | ||
| 555 | } | ||
| 556 | |||
| 557 | static void hwlat_tracer_start(struct trace_array *tr) | ||
| 558 | { | ||
| 559 | int err; | ||
| 560 | |||
| 561 | err = start_kthread(tr); | ||
| 562 | if (err) | ||
| 563 | pr_err(BANNER "Cannot start hwlat kthread\n"); | ||
| 564 | } | ||
| 565 | |||
| 566 | static void hwlat_tracer_stop(struct trace_array *tr) | ||
| 567 | { | ||
| 568 | stop_kthread(); | ||
| 569 | } | ||
| 570 | |||
| 571 | static bool hwlat_busy; | ||
| 572 | |||
| 573 | static int hwlat_tracer_init(struct trace_array *tr) | ||
| 574 | { | ||
| 575 | /* Only allow one instance to enable this */ | ||
| 576 | if (hwlat_busy) | ||
| 577 | return -EBUSY; | ||
| 578 | |||
| 579 | hwlat_trace = tr; | ||
| 580 | |||
| 581 | disable_migrate = false; | ||
| 582 | hwlat_data.count = 0; | ||
| 583 | tr->max_latency = 0; | ||
| 584 | save_tracing_thresh = tracing_thresh; | ||
| 585 | |||
| 586 | /* tracing_thresh is in nsecs, we speak in usecs */ | ||
| 587 | if (!tracing_thresh) | ||
| 588 | tracing_thresh = last_tracing_thresh; | ||
| 589 | |||
| 590 | if (tracer_tracing_is_on(tr)) | ||
| 591 | hwlat_tracer_start(tr); | ||
| 592 | |||
| 593 | hwlat_busy = true; | ||
| 594 | |||
| 595 | return 0; | ||
| 596 | } | ||
| 597 | |||
| 598 | static void hwlat_tracer_reset(struct trace_array *tr) | ||
| 599 | { | ||
| 600 | stop_kthread(); | ||
| 601 | |||
| 602 | /* the tracing threshold is static between runs */ | ||
| 603 | last_tracing_thresh = tracing_thresh; | ||
| 604 | |||
| 605 | tracing_thresh = save_tracing_thresh; | ||
| 606 | hwlat_busy = false; | ||
| 607 | } | ||
| 608 | |||
| 609 | static struct tracer hwlat_tracer __read_mostly = | ||
| 610 | { | ||
| 611 | .name = "hwlat", | ||
| 612 | .init = hwlat_tracer_init, | ||
| 613 | .reset = hwlat_tracer_reset, | ||
| 614 | .start = hwlat_tracer_start, | ||
| 615 | .stop = hwlat_tracer_stop, | ||
| 616 | .allow_instances = true, | ||
| 617 | }; | ||
| 618 | |||
| 619 | __init static int init_hwlat_tracer(void) | ||
| 620 | { | ||
| 621 | int ret; | ||
| 622 | |||
| 623 | mutex_init(&hwlat_data.lock); | ||
| 624 | |||
| 625 | ret = register_tracer(&hwlat_tracer); | ||
| 626 | if (ret) | ||
| 627 | return ret; | ||
| 628 | |||
| 629 | init_tracefs(); | ||
| 630 | |||
| 631 | return 0; | ||
| 632 | } | ||
| 633 | late_initcall(init_hwlat_tracer); | ||
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0bb9cf2d53e6..3fc20422c166 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = {
| 1098 | .funcs = &trace_user_stack_funcs, | 1098 | .funcs = &trace_user_stack_funcs, |
| 1099 | }; | 1099 | }; |
| 1100 | 1100 | ||
| 1101 | /* TRACE_HWLAT */ | ||
| 1102 | static enum print_line_t | ||
| 1103 | trace_hwlat_print(struct trace_iterator *iter, int flags, | ||
| 1104 | struct trace_event *event) | ||
| 1105 | { | ||
| 1106 | struct trace_entry *entry = iter->ent; | ||
| 1107 | struct trace_seq *s = &iter->seq; | ||
| 1108 | struct hwlat_entry *field; | ||
| 1109 | |||
| 1110 | trace_assign_type(field, entry); | ||
| 1111 | |||
| 1112 | trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", | ||
| 1113 | field->seqnum, | ||
| 1114 | field->duration, | ||
| 1115 | field->outer_duration, | ||
| 1116 | field->timestamp.tv_sec, | ||
| 1117 | field->timestamp.tv_nsec); | ||
| 1118 | |||
| 1119 | if (field->nmi_count) { | ||
| 1120 | /* | ||
| 1121 | * The generic sched_clock() is not NMI safe, thus | ||
| 1122 | * we only record the count and not the time. | ||
| 1123 | */ | ||
| 1124 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) | ||
| 1125 | trace_seq_printf(s, " nmi-total:%llu", | ||
| 1126 | field->nmi_total_ts); | ||
| 1127 | trace_seq_printf(s, " nmi-count:%u", | ||
| 1128 | field->nmi_count); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | trace_seq_putc(s, '\n'); | ||
| 1132 | |||
| 1133 | return trace_handle_return(s); | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | |||
| 1137 | static enum print_line_t | ||
| 1138 | trace_hwlat_raw(struct trace_iterator *iter, int flags, | ||
| 1139 | struct trace_event *event) | ||
| 1140 | { | ||
| 1141 | struct hwlat_entry *field; | ||
| 1142 | struct trace_seq *s = &iter->seq; | ||
| 1143 | |||
| 1144 | trace_assign_type(field, iter->ent); | ||
| 1145 | |||
| 1146 | trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", | ||
| 1147 | field->duration, | ||
| 1148 | field->outer_duration, | ||
| 1149 | field->timestamp.tv_sec, | ||
| 1150 | field->timestamp.tv_nsec, | ||
| 1151 | field->seqnum); | ||
| 1152 | |||
| 1153 | return trace_handle_return(s); | ||
| 1154 | } | ||
| 1155 | |||
| 1156 | static struct trace_event_functions trace_hwlat_funcs = { | ||
| 1157 | .trace = trace_hwlat_print, | ||
| 1158 | .raw = trace_hwlat_raw, | ||
| 1159 | }; | ||
| 1160 | |||
| 1161 | static struct trace_event trace_hwlat_event = { | ||
| 1162 | .type = TRACE_HWLAT, | ||
| 1163 | .funcs = &trace_hwlat_funcs, | ||
| 1164 | }; | ||
| 1165 | |||
| 1101 | /* TRACE_BPUTS */ | 1166 | /* TRACE_BPUTS */ |
| 1102 | static enum print_line_t | 1167 | static enum print_line_t |
| 1103 | trace_bputs_print(struct trace_iterator *iter, int flags, | 1168 | trace_bputs_print(struct trace_iterator *iter, int flags, |
| @@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = { | |||
| 1233 | &trace_bputs_event, | 1298 | &trace_bputs_event, |
| 1234 | &trace_bprint_event, | 1299 | &trace_bprint_event, |
| 1235 | &trace_print_event, | 1300 | &trace_print_event, |
| 1301 | &trace_hwlat_event, | ||
| 1236 | NULL | 1302 | NULL |
| 1237 | }; | 1303 | }; |
| 1238 | 1304 | ||
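Given the format string in trace_hwlat_print() above, a recorded sample that saw one NMI comes out in the trace file along these lines (the numbers here are invented for illustration):

#12    inner/outer(us):   14/22    ts:1474913179.787813674 nmi-total:7 nmi-count:1

The .raw callback emits the same fields as bare space-separated numbers for the raw output mode.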
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7a687320f867..0913693caf6e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -431,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv)
| 431 | pr_info("Probe point is not specified.\n"); | 431 | pr_info("Probe point is not specified.\n"); |
| 432 | return -EINVAL; | 432 | return -EINVAL; |
| 433 | } | 433 | } |
| 434 | if (isdigit(argv[1][0])) { | ||
| 435 | pr_info("probe point must be have a filename.\n"); | ||
| 436 | return -EINVAL; | ||
| 437 | } | ||
| 438 | arg = strchr(argv[1], ':'); | 434 | arg = strchr(argv[1], ':'); |
| 439 | if (!arg) { | 435 | if (!arg) { |
| 440 | ret = -EINVAL; | 436 | ret = -EINVAL; |
diff --git a/scripts/tracing/ftrace-bisect.sh b/scripts/tracing/ftrace-bisect.sh
new file mode 100755
index 000000000000..9ff8ac5fc53c
--- /dev/null
+++ b/scripts/tracing/ftrace-bisect.sh
@@ -0,0 +1,115 @@
| 1 | #!/bin/bash | ||
| 2 | # | ||
| 3 | # Here's how to use this: | ||
| 4 | # | ||
| 5 | # This script is used to help find functions that are being traced by function | ||
| 6 | # tracer or function graph tracing that cause the machine to reboot, hang, or | ||
| 7 | # crash. Here are the steps to take. | ||
| 8 | # | ||
| 9 | # First, determine if function tracing is working with a single function: | ||
| 10 | # | ||
| 11 | # (note, if this is a problem with function_graph tracing, then simply | ||
| 12 | # replace "function" with "function_graph" in the following steps). | ||
| 13 | # | ||
| 14 | # # cd /sys/kernel/debug/tracing | ||
| 15 | # # echo schedule > set_ftrace_filter | ||
| 16 | # # echo function > current_tracer | ||
| 17 | # | ||
| 18 | # If this works, then we know that something is being traced that shouldn't be. | ||
| 19 | # | ||
| 20 | # # echo nop > current_tracer | ||
| 21 | # | ||
| 22 | # # cat available_filter_functions > ~/full-file | ||
| 23 | # # ftrace-bisect ~/full-file ~/test-file ~/non-test-file | ||
| 24 | # # cat ~/test-file > set_ftrace_filter | ||
| 25 | # | ||
| 26 | # *** Note *** this will take several minutes. Setting multiple functions is | ||
| 27 | # an O(n^2) operation, and we are dealing with thousands of functions. So go | ||
| 28 | # have coffee, talk with your coworkers, read facebook. And eventually, this | ||
| 29 | # operation will end. | ||
| 30 | # | ||
| 31 | # # echo function > current_tracer | ||
| 32 | # | ||
| 33 | # If it crashes, we know that ~/test-file has a bad function. | ||
| 34 | # | ||
| 35 | # Reboot back to test kernel. | ||
| 36 | # | ||
| 37 | # # cd /sys/kernel/debug/tracing | ||
| 38 | # # mv ~/test-file ~/full-file | ||
| 39 | # | ||
| 40 | # If it didn't crash. | ||
| 41 | # | ||
| 42 | # # echo nop > current_tracer | ||
| 43 | # # mv ~/non-test-file ~/full-file | ||
| 44 | # | ||
| 45 | # Get rid of the other test file from the previous run (or save it off somewhere). | ||
| 46 | # # rm -f ~/test-file ~/non-test-file | ||
| 47 | # | ||
| 48 | # And start again: | ||
| 49 | # | ||
| 50 | # # ftrace-bisect ~/full-file ~/test-file ~/non-test-file | ||
| 51 | # | ||
| 52 | # The good thing is, because this cuts the number of functions in ~/test-file | ||
| 53 | # by half, the cat of it into set_ftrace_filter takes half as long each | ||
| 54 | # iteration, so don't talk so much at the water cooler the second time. | ||
| 55 | # | ||
| 56 | # Eventually, if you did this correctly, you will get down to the problem | ||
| 57 | # function, and all we need to do is to notrace it. | ||
| 58 | # | ||
| 59 | # To verify that the problem function is in fact the bad one, just do: | ||
| 60 | # | ||
| 61 | # # echo <problem-function> > set_ftrace_notrace | ||
| 62 | # # echo > set_ftrace_filter | ||
| 63 | # # echo function > current_tracer | ||
| 64 | # | ||
| 65 | # And if it doesn't crash, we are done. | ||
| 66 | # | ||
| 67 | # If it does crash, do this again (there's more than one problem function) | ||
| 68 | # but you need to echo the problem function(s) into set_ftrace_notrace before | ||
| 69 | # enabling function tracing in the above steps. Or if you can compile the | ||
| 70 | # kernel, annotate the problem functions with "notrace" and start again. | ||
| 71 | # | ||
| 72 | |||
| 73 | |||
| 74 | if [ $# -ne 3 ]; then | ||
| 75 | echo 'usage: ftrace-bisect full-file test-file non-test-file' | ||
| 76 | exit | ||
| 77 | fi | ||
| 78 | |||
| 79 | full=$1 | ||
| 80 | test=$2 | ||
| 81 | nontest=$3 | ||
| 82 | |||
| 83 | x=`cat $full | wc -l` | ||
| 84 | if [ $x -eq 1 ]; then | ||
| 85 | echo "There's only one function left, must be the bad one" | ||
| 86 | cat $full | ||
| 87 | exit 0 | ||
| 88 | fi | ||
| 89 | |||
| 90 | let x=$x/2 | ||
| 91 | let y=$x+1 | ||
| 92 | |||
| 93 | if [ ! -f $full ]; then | ||
| 94 | echo "$full does not exist" | ||
| 95 | exit 1 | ||
| 96 | fi | ||
| 97 | |||
| 98 | if [ -f $test ]; then | ||
| 99 | echo -n "$test exists, delete it? [y/N]" | ||
| 100 | read a | ||
| 101 | if [ "$a" != "y" -a "$a" != "Y" ]; then | ||
| 102 | exit 1 | ||
| 103 | fi | ||
| 104 | fi | ||
| 105 | |||
| 106 | if [ -f $nontest ]; then | ||
| 107 | echo -n "$nontest exists, delete it? [y/N]" | ||
| 108 | read a | ||
| 109 | if [ "$a" != "y" -a "$a" != "Y" ]; then | ||
| 110 | exit 1 | ||
| 111 | fi | ||
| 112 | fi | ||
| 113 | |||
| 114 | sed -ne "1,${x}p" $full > $test | ||
| 115 | sed -ne "$y,\$p" $full > $nontest | ||
