-rw-r--r--  Documentation/tracers/mmiotrace.txt        164
-rw-r--r--  Makefile                                     4
-rw-r--r--  arch/powerpc/Kconfig                         4
-rw-r--r--  arch/powerpc/kernel/Makefile                14
-rw-r--r--  arch/powerpc/kernel/entry_32.S             130
-rw-r--r--  arch/powerpc/kernel/entry_64.S              62
-rw-r--r--  arch/powerpc/kernel/ftrace.c               165
-rw-r--r--  arch/powerpc/kernel/io.c                     3
-rw-r--r--  arch/powerpc/kernel/irq.c                    6
-rw-r--r--  arch/powerpc/kernel/setup_32.c              11
-rw-r--r--  arch/powerpc/kernel/setup_64.c               5
-rw-r--r--  arch/powerpc/platforms/powermac/Makefile     5
-rw-r--r--  arch/sparc64/Kconfig                         2
-rw-r--r--  arch/sparc64/Kconfig.debug                   2
-rw-r--r--  arch/sparc64/kernel/Makefile                 1
-rw-r--r--  arch/sparc64/kernel/ftrace.c                99
-rw-r--r--  arch/sparc64/lib/mcount.S                   58
-rw-r--r--  arch/x86/Kconfig                             2
-rw-r--r--  arch/x86/Kconfig.debug                      28
-rw-r--r--  arch/x86/kernel/Makefile                     8
-rw-r--r--  arch/x86/kernel/alternative.c               22
-rw-r--r--  arch/x86/kernel/entry_32.S                  68
-rw-r--r--  arch/x86/kernel/entry_64.S                 102
-rw-r--r--  arch/x86/kernel/ftrace.c                   159
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c              9
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c           4
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c           4
-rw-r--r--  arch/x86/kernel/process_32.c                 3
-rw-r--r--  arch/x86/kernel/process_64.c                 3
-rw-r--r--  arch/x86/kernel/vsyscall_64.c                3
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c            11
-rw-r--r--  arch/x86/lib/Makefile                        1
-rw-r--r--  arch/x86/lib/thunk_32.S                     47
-rw-r--r--  arch/x86/lib/thunk_64.S                     19
-rw-r--r--  arch/x86/mm/Makefile                         5
-rw-r--r--  arch/x86/mm/fault.c                         13
-rw-r--r--  arch/x86/mm/init_32.c                        4
-rw-r--r--  arch/x86/mm/init_64.c                       10
-rw-r--r--  arch/x86/mm/ioremap.c                       11
-rw-r--r--  arch/x86/mm/kmmio.c                        510
-rw-r--r--  arch/x86/mm/mmio-mod.c                     515
-rw-r--r--  arch/x86/mm/pageattr.c                       1
-rw-r--r--  arch/x86/mm/pf_in.c                        489
-rw-r--r--  arch/x86/mm/pf_in.h                         39
-rw-r--r--  arch/x86/mm/testmmiotrace.c                 71
-rw-r--r--  arch/x86/vdso/vclock_gettime.c              15
-rw-r--r--  arch/x86/vdso/vgetcpu.c                      3
-rw-r--r--  include/asm-powerpc/hw_irq.h                10
-rw-r--r--  include/asm-x86/alternative.h                2
-rw-r--r--  include/asm-x86/irqflags.h                  24
-rw-r--r--  include/asm-x86/vsyscall.h                   3
-rw-r--r--  include/linux/ftrace.h                     132
-rw-r--r--  include/linux/irqflags.h                    13
-rw-r--r--  include/linux/linkage.h                      2
-rw-r--r--  include/linux/marker.h                      40
-rw-r--r--  include/linux/mmiotrace.h                   85
-rw-r--r--  include/linux/preempt.h                     34
-rw-r--r--  include/linux/sched.h                       16
-rw-r--r--  include/linux/writeback.h                    2
-rw-r--r--  kernel/Makefile                             14
-rw-r--r--  kernel/fork.c                                2
-rw-r--r--  kernel/lockdep.c                            33
-rw-r--r--  kernel/marker.c                             30
-rw-r--r--  kernel/printk.c                              2
-rw-r--r--  kernel/sched.c                              55
-rw-r--r--  kernel/semaphore.c                           2
-rw-r--r--  kernel/spinlock.c                            2
-rw-r--r--  kernel/sysctl.c                             11
-rw-r--r--  kernel/trace/Kconfig                       127
-rw-r--r--  kernel/trace/Makefile                       23
-rw-r--r--  kernel/trace/ftrace.c                     1398
-rw-r--r--  kernel/trace/trace.c                      3073
-rw-r--r--  kernel/trace/trace.h                       327
-rw-r--r--  kernel/trace/trace_functions.c              78
-rw-r--r--  kernel/trace/trace_irqsoff.c               502
-rw-r--r--  kernel/trace/trace_mmiotrace.c             295
-rw-r--r--  kernel/trace/trace_sched_switch.c          301
-rw-r--r--  kernel/trace/trace_sched_wakeup.c          382
-rw-r--r--  kernel/trace/trace_selftest.c              539
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c        7
-rw-r--r--  lib/Kconfig.debug                            2
-rw-r--r--  lib/Makefile                                 9
-rw-r--r--  lib/smp_processor_id.c                       6
-rw-r--r--  mm/page-writeback.c                         10
-rw-r--r--  scripts/Makefile.lib                         3
85 files changed, 10398 insertions, 112 deletions
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt
new file mode 100644
index 000000000000..a4afb560a45b
--- /dev/null
+++ b/Documentation/tracers/mmiotrace.txt
@@ -0,0 +1,164 @@
In-kernel memory-mapped I/O tracing


Home page and links to optional user space tools:

	http://nouveau.freedesktop.org/wiki/MmioTrace

MMIO tracing was originally developed by Intel around 2003 for their Fault
Injection Test Harness. In Dec 2006 - Jan 2007, using the code from Intel,
Jeff Muizelaar created a tool for tracing MMIO accesses with the Nouveau
project in mind. Since then many people have contributed.

Mmiotrace was built for reverse engineering any memory-mapped IO device,
with the Nouveau project as the first real user. Only the x86 and x86_64
architectures are supported.

The out-of-tree mmiotrace was modified for mainline inclusion and the ftrace
framework by Pekka Paalanen <pq@iki.fi>.


Preparation
-----------

The mmiotrace feature is compiled in via the CONFIG_MMIOTRACE option. Tracing
is disabled by default, so it is safe to have this set to yes. SMP systems are
supported, but tracing is unreliable and may miss events if more than one CPU
is on-line; therefore mmiotrace takes all but one CPU off-line during run-time
activation. You can re-enable the CPUs by hand, but be warned: there is no way
to automatically detect whether you are losing events due to CPUs racing.


Usage Quick Reference
---------------------

$ mount -t debugfs debugfs /debug
$ echo mmiotrace > /debug/tracing/current_tracer
$ cat /debug/tracing/trace_pipe > mydump.txt &
Start X or whatever.
$ echo "X is up" > /debug/tracing/marker
$ echo none > /debug/tracing/current_tracer
Check for lost events.


Usage
-----

Make sure debugfs is mounted to /debug. If it is not, mount it (requires root
privileges):
$ mount -t debugfs debugfs /debug

Check that the driver you are about to trace is not loaded.

Activate mmiotrace (requires root privileges):
$ echo mmiotrace > /debug/tracing/current_tracer

Start storing the trace:
$ cat /debug/tracing/trace_pipe > mydump.txt &
The 'cat' process should stay running (sleeping) in the background.

Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
accesses to areas that are ioremapped while mmiotrace is active.

[Unimplemented feature:]
During tracing you can place comments (markers) into the trace by
$ echo "X is up" > /debug/tracing/marker
This makes it easier to see which part of the (huge) trace corresponds to
which action. Placing descriptive markers about what you do is recommended.

Shut down mmiotrace (requires root privileges):
$ echo none > /debug/tracing/current_tracer
The 'cat' process exits. If it does not, kill it by issuing the 'fg' command
and pressing ctrl+c.

Check that mmiotrace did not lose events due to a buffer filling up. Either
$ grep -i lost mydump.txt
which tells you exactly how many events were lost, or use
$ dmesg
to view your kernel log and look for a "mmiotrace has lost events" warning. If
events were lost, the trace is incomplete. You should enlarge the buffers and
try again. Buffers are enlarged by first seeing how large the current buffers
are:
$ cat /debug/tracing/trace_entries
gives you a number. Approximately double this number and write it back, for
instance:
$ echo 128000 > /debug/tracing/trace_entries
Then start again from the top.

If you are doing a trace for a driver project, e.g. Nouveau, you should also
do the following before sending your results:
$ lspci -vvv > lspci.txt
$ dmesg > dmesg.txt
$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt
and then send the .tar.gz file. The trace compresses considerably. Replace
"pciid" and "nick" with the PCI ID or model name of the piece of hardware
under investigation and your nickname.


How Mmiotrace Works
-------------------

Access to hardware IO-memory is gained by mapping addresses from the PCI bus
by calling one of the ioremap_*() functions. Mmiotrace is hooked into the
__ioremap() function and gets called whenever a mapping is created. A mapping
is an event that is recorded into the trace log. Note that ISA range mappings
are not caught, since the mapping always exists and is returned directly.

MMIO accesses are recorded via page faults. Just before __ioremap() returns,
the mapped pages are marked as not present. Any access to the pages causes a
fault. The page fault handler calls mmiotrace to handle the fault. Mmiotrace
marks the page present, sets the TF flag to achieve single stepping and exits
the fault handler. The instruction that faulted is executed and the debug trap
is entered. Here mmiotrace again marks the page as not present. The
instruction is decoded to get the type of operation (read/write), the data
width and the value read or written. These are stored to the trace log.

Setting the page present in the page fault handler has a race condition on SMP
machines. During single stepping, other CPUs may run freely on that page and
events can be missed without notice. Re-enabling other CPUs during tracing is
discouraged.

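The trap-on-access half of this scheme can be mimicked in user space with
mprotect() and a SIGSEGV handler. The sketch below is an illustration only and
is not the kernel mechanism: it assumes POSIX mmap/mprotect/sigaction, logs the
faulting address and re-enables access so the load completes, and omits the
single-step-and-re-arm step that the kernel performs with the TF flag.

/*
 * Minimal user-space sketch of the trap-on-access idea described above.
 * Calling mprotect()/snprintf() from a signal handler is tolerated here
 * only because this is a demo.
 */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *page;
static long page_size;

static void segv_handler(int sig, siginfo_t *info, void *ctx)
{
	char msg[64];
	int n;

	(void)sig; (void)ctx;
	n = snprintf(msg, sizeof(msg), "trapped access at %p\n", info->si_addr);
	write(STDERR_FILENO, msg, n);
	/* Let the faulting access retry and succeed. */
	mprotect(page, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	page_size = sysconf(_SC_PAGESIZE);
	page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(page, 0xab, page_size);

	/* "Arm" the page, like marking a PTE not present. */
	mprotect(page, page_size, PROT_NONE);

	printf("read back: 0x%02x\n", (unsigned char)page[16]);
	return 0;
}
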
Trace Log Format
----------------

The raw log is text and easily filtered with e.g. grep and awk. One record is
one line in the log. A record starts with a keyword, followed by
keyword-dependent arguments. Arguments are separated by a space, or continue
until the end of line. The format for version 20070824 is as follows:

Explanation		Keyword		Space separated arguments
---------------------------------------------------------------------------

read event		R		width, timestamp, map id, physical, value, PC, PID
write event		W		width, timestamp, map id, physical, value, PC, PID
ioremap event		MAP		timestamp, map id, physical, virtual, length, PC, PID
iounmap event		UNMAP		timestamp, map id, PC, PID
marker			MARK		timestamp, text
version			VERSION		the string "20070824"
info for reader		LSPCI		one line from lspci -v
PCI address map		PCIDEV		space separated /proc/bus/pci/devices data
unk. opcode		UNKNOWN		timestamp, map id, physical, data, PC, PID

Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual
is a kernel virtual address. Width is the data width in bytes and value is the
data value. Map id is an arbitrary id number identifying the mapping that was
used in an operation. PC is the program counter and PID is the process id. PC
is zero if it is not recorded. PID is always zero, as tracing MMIO accesses
originating in user space memory is not yet supported.

For instance, the following awk filter will pass all 32-bit writes that target
physical addresses in the range [0xfb73ce40, 0xfb800000):

$ awk '/W 4 / { adr=strtonum($5); if (adr >= 0xfb73ce40 &&
adr < 0xfb800000) print; }'


Tools for Developers
--------------------

The user space tools include utilities for:
- replacing numeric addresses and values with hardware register names
- replaying MMIO logs, i.e., re-executing the recorded writes

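As a companion to the awk one-liner above, here is a small stand-alone C filter
for R/W records, written against the record layout documented in this file. It
is a sketch only; the assumption that physical addresses and values appear as
hexadecimal numbers and the PID as decimal reflects typical logs, not something
this document guarantees.

/*
 * Parse R/W records and, like the awk example, pass 32-bit writes whose
 * physical address falls in [0xfb73ce40, 0xfb800000).
 */
#include <stdio.h>

struct mmio_rw {
	char op;			/* 'R' or 'W' */
	unsigned int width;		/* access width in bytes */
	double ts;			/* timestamp in seconds */
	unsigned long map_id;
	unsigned long long phys;	/* PCI bus address */
	unsigned long long value;
	unsigned long long pc;
	unsigned long pid;
};

static int parse_rw(const char *line, struct mmio_rw *r)
{
	int n = sscanf(line, "%c %u %lf %lu %llx %llx %llx %lu",
		       &r->op, &r->width, &r->ts, &r->map_id,
		       &r->phys, &r->value, &r->pc, &r->pid);

	return (n == 8 && (r->op == 'R' || r->op == 'W')) ? 0 : -1;
}

int main(void)
{
	char line[256];
	struct mmio_rw r;

	while (fgets(line, sizeof(line), stdin)) {
		if (parse_rw(line, &r) == 0 && r.op == 'W' && r.width == 4 &&
		    r.phys >= 0xfb73ce40ULL && r.phys < 0xfb800000ULL)
			fputs(line, stdout);
	}
	return 0;
}
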
diff --git a/Makefile b/Makefile
index 6923d669a4f6..db164f69cffa 100644
--- a/Makefile
+++ b/Makefile
@@ -528,6 +528,10 @@ KBUILD_CFLAGS += -g
528KBUILD_AFLAGS += -gdwarf-2 528KBUILD_AFLAGS += -gdwarf-2
529endif 529endif
530 530
531ifdef CONFIG_FTRACE
532KBUILD_CFLAGS += -pg
533endif
534
531# We trigger additional mismatches with less inlining 535# We trigger additional mismatches with less inlining
532ifdef CONFIG_DEBUG_SECTION_MISMATCH 536ifdef CONFIG_DEBUG_SECTION_MISMATCH
533KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once) 537KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3934e2659407..a5e9912e2d37 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -105,11 +105,13 @@ config ARCH_NO_VIRT_TO_BUS
105config PPC 105config PPC
106 bool 106 bool
107 default y 107 default y
108 select HAVE_DYNAMIC_FTRACE
109 select HAVE_FTRACE
108 select HAVE_IDE 110 select HAVE_IDE
109 select HAVE_OPROFILE
110 select HAVE_KPROBES 111 select HAVE_KPROBES
111 select HAVE_KRETPROBES 112 select HAVE_KRETPROBES
112 select HAVE_LMB 113 select HAVE_LMB
114 select HAVE_OPROFILE
113 115
114config EARLY_PRINTK 116config EARLY_PRINTK
115 bool 117 bool
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 2346d271fbfd..f3f5e2641432 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -12,6 +12,18 @@ CFLAGS_prom_init.o += -fPIC
12CFLAGS_btext.o += -fPIC 12CFLAGS_btext.o += -fPIC
13endif 13endif
14 14
15ifdef CONFIG_FTRACE
16# Do not trace early boot code
17CFLAGS_REMOVE_cputable.o = -pg
18CFLAGS_REMOVE_prom_init.o = -pg
19
20ifdef CONFIG_DYNAMIC_FTRACE
21# dynamic ftrace setup.
22CFLAGS_REMOVE_ftrace.o = -pg
23endif
24
25endif
26
15obj-y := cputable.o ptrace.o syscalls.o \ 27obj-y := cputable.o ptrace.o syscalls.o \
16 irq.o align.o signal_32.o pmc.o vdso.o \ 28 irq.o align.o signal_32.o pmc.o vdso.o \
17 init_task.o process.o systbl.o idle.o \ 29 init_task.o process.o systbl.o idle.o \
@@ -78,6 +90,8 @@ obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o \
78obj-$(CONFIG_AUDIT) += audit.o 90obj-$(CONFIG_AUDIT) += audit.o
79obj64-$(CONFIG_AUDIT) += compat_audit.o 91obj64-$(CONFIG_AUDIT) += compat_audit.o
80 92
93obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
94
81obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 95obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
82 96
83ifneq ($(CONFIG_PPC_INDIRECT_IO),y) 97ifneq ($(CONFIG_PPC_INDIRECT_IO),y)
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 0c8614d9875c..0e6221889ca9 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -1035,3 +1035,133 @@ machine_check_in_rtas:
1035 /* XXX load up BATs and panic */ 1035 /* XXX load up BATs and panic */
1036 1036
1037#endif /* CONFIG_PPC_RTAS */ 1037#endif /* CONFIG_PPC_RTAS */
1038
1039#ifdef CONFIG_FTRACE
1040#ifdef CONFIG_DYNAMIC_FTRACE
1041_GLOBAL(mcount)
1042_GLOBAL(_mcount)
1043 stwu r1,-48(r1)
1044 stw r3, 12(r1)
1045 stw r4, 16(r1)
1046 stw r5, 20(r1)
1047 stw r6, 24(r1)
1048 mflr r3
1049 stw r7, 28(r1)
1050 mfcr r5
1051 stw r8, 32(r1)
1052 stw r9, 36(r1)
1053 stw r10,40(r1)
1054 stw r3, 44(r1)
1055 stw r5, 8(r1)
1056 .globl mcount_call
1057mcount_call:
1058 bl ftrace_stub
1059 nop
1060 lwz r6, 8(r1)
1061 lwz r0, 44(r1)
1062 lwz r3, 12(r1)
1063 mtctr r0
1064 lwz r4, 16(r1)
1065 mtcr r6
1066 lwz r5, 20(r1)
1067 lwz r6, 24(r1)
1068 lwz r0, 52(r1)
1069 lwz r7, 28(r1)
1070 lwz r8, 32(r1)
1071 mtlr r0
1072 lwz r9, 36(r1)
1073 lwz r10,40(r1)
1074 addi r1, r1, 48
1075 bctr
1076
1077_GLOBAL(ftrace_caller)
1078	/* Based on objdump output from glibc */
1079 stwu r1,-48(r1)
1080 stw r3, 12(r1)
1081 stw r4, 16(r1)
1082 stw r5, 20(r1)
1083 stw r6, 24(r1)
1084 mflr r3
1085 lwz r4, 52(r1)
1086 mfcr r5
1087 stw r7, 28(r1)
1088 stw r8, 32(r1)
1089 stw r9, 36(r1)
1090 stw r10,40(r1)
1091 stw r3, 44(r1)
1092 stw r5, 8(r1)
1093.globl ftrace_call
1094ftrace_call:
1095 bl ftrace_stub
1096 nop
1097 lwz r6, 8(r1)
1098 lwz r0, 44(r1)
1099 lwz r3, 12(r1)
1100 mtctr r0
1101 lwz r4, 16(r1)
1102 mtcr r6
1103 lwz r5, 20(r1)
1104 lwz r6, 24(r1)
1105 lwz r0, 52(r1)
1106 lwz r7, 28(r1)
1107 lwz r8, 32(r1)
1108 mtlr r0
1109 lwz r9, 36(r1)
1110 lwz r10,40(r1)
1111 addi r1, r1, 48
1112 bctr
1113#else
1114_GLOBAL(mcount)
1115_GLOBAL(_mcount)
1116 stwu r1,-48(r1)
1117 stw r3, 12(r1)
1118 stw r4, 16(r1)
1119 stw r5, 20(r1)
1120 stw r6, 24(r1)
1121 mflr r3
1122 lwz r4, 52(r1)
1123 mfcr r5
1124 stw r7, 28(r1)
1125 stw r8, 32(r1)
1126 stw r9, 36(r1)
1127 stw r10,40(r1)
1128 stw r3, 44(r1)
1129 stw r5, 8(r1)
1130
1131 LOAD_REG_ADDR(r5, ftrace_trace_function)
1132#if 0
1133 mtctr r3
1134 mr r1, r5
1135 bctrl
1136#endif
1137 lwz r5,0(r5)
1138#if 1
1139 mtctr r5
1140 bctrl
1141#else
1142 bl ftrace_stub
1143#endif
1144 nop
1145
1146 lwz r6, 8(r1)
1147 lwz r0, 44(r1)
1148 lwz r3, 12(r1)
1149 mtctr r0
1150 lwz r4, 16(r1)
1151 mtcr r6
1152 lwz r5, 20(r1)
1153 lwz r6, 24(r1)
1154 lwz r0, 52(r1)
1155 lwz r7, 28(r1)
1156 lwz r8, 32(r1)
1157 mtlr r0
1158 lwz r9, 36(r1)
1159 lwz r10,40(r1)
1160 addi r1, r1, 48
1161 bctr
1162#endif
1163
1164_GLOBAL(ftrace_stub)
1165 blr
1166
1167#endif /* CONFIG_FTRACE */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index c0db5b769e55..2c4d9e056ead 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -870,3 +870,65 @@ _GLOBAL(enter_prom)
870 ld r0,16(r1) 870 ld r0,16(r1)
871 mtlr r0 871 mtlr r0
872 blr 872 blr
873
874#ifdef CONFIG_FTRACE
875#ifdef CONFIG_DYNAMIC_FTRACE
876_GLOBAL(mcount)
877_GLOBAL(_mcount)
878 /* Taken from output of objdump from lib64/glibc */
879 mflr r3
880 stdu r1, -112(r1)
881 std r3, 128(r1)
882 .globl mcount_call
883mcount_call:
884 bl ftrace_stub
885 nop
886 ld r0, 128(r1)
887 mtlr r0
888 addi r1, r1, 112
889 blr
890
891_GLOBAL(ftrace_caller)
892 /* Taken from output of objdump from lib64/glibc */
893 mflr r3
894 ld r11, 0(r1)
895 stdu r1, -112(r1)
896 std r3, 128(r1)
897 ld r4, 16(r11)
898.globl ftrace_call
899ftrace_call:
900 bl ftrace_stub
901 nop
902 ld r0, 128(r1)
903 mtlr r0
904 addi r1, r1, 112
905_GLOBAL(ftrace_stub)
906 blr
907#else
908_GLOBAL(mcount)
909 blr
910
911_GLOBAL(_mcount)
912 /* Taken from output of objdump from lib64/glibc */
913 mflr r3
914 ld r11, 0(r1)
915 stdu r1, -112(r1)
916 std r3, 128(r1)
917 ld r4, 16(r11)
918
919
920 LOAD_REG_ADDR(r5,ftrace_trace_function)
921 ld r5,0(r5)
922 ld r5,0(r5)
923 mtctr r5
924 bctrl
925
926 nop
927 ld r0, 128(r1)
928 mtlr r0
929 addi r1, r1, 112
930_GLOBAL(ftrace_stub)
931 blr
932
933#endif
934#endif
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
new file mode 100644
index 000000000000..5a4993fefa45
--- /dev/null
+++ b/arch/powerpc/kernel/ftrace.c
@@ -0,0 +1,165 @@
1/*
2 * Code for replacing ftrace calls with jumps.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 *
6 * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box.
7 *
8 */
9
10#include <linux/spinlock.h>
11#include <linux/hardirq.h>
12#include <linux/ftrace.h>
13#include <linux/percpu.h>
14#include <linux/init.h>
15#include <linux/list.h>
16
17#include <asm/cacheflush.h>
18
19#define CALL_BACK 4
20
21static unsigned int ftrace_nop = 0x60000000;
22
23#ifdef CONFIG_PPC32
24# define GET_ADDR(addr) addr
25#else
26/* PowerPC64's functions are data that points to the functions */
27# define GET_ADDR(addr) *(unsigned long *)addr
28#endif
29
30notrace int ftrace_ip_converted(unsigned long ip)
31{
32 unsigned int save;
33
34 ip -= CALL_BACK;
35 save = *(unsigned int *)ip;
36
37 return save == ftrace_nop;
38}
39
40static unsigned int notrace ftrace_calc_offset(long ip, long addr)
41{
42 return (int)((addr + CALL_BACK) - ip);
43}
44
45notrace unsigned char *ftrace_nop_replace(void)
46{
47 return (char *)&ftrace_nop;
48}
49
50notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
51{
52 static unsigned int op;
53
54 addr = GET_ADDR(addr);
55
56 /* Set to "bl addr" */
57 op = 0x48000001 | (ftrace_calc_offset(ip, addr) & 0x03fffffe);
58
59 /*
60 * No locking needed, this must be called via kstop_machine
61 * which in essence is like running on a uniprocessor machine.
62 */
63 return (unsigned char *)&op;
64}
65
66#ifdef CONFIG_PPC64
67# define _ASM_ALIGN " .align 3 "
68# define _ASM_PTR " .llong "
69#else
70# define _ASM_ALIGN " .align 2 "
71# define _ASM_PTR " .long "
72#endif
73
74notrace int
75ftrace_modify_code(unsigned long ip, unsigned char *old_code,
76 unsigned char *new_code)
77{
78 unsigned replaced;
79 unsigned old = *(unsigned *)old_code;
80 unsigned new = *(unsigned *)new_code;
81 int faulted = 0;
82
83 /* move the IP back to the start of the call */
84 ip -= CALL_BACK;
85
86 /*
87 * Note: Due to modules and __init, code can
88 * disappear and change, we need to protect against faulting
89 * as well as code changing.
90 *
91 * No real locking needed, this code is run through
92 * kstop_machine.
93 */
94 asm volatile (
95 "1: lwz %1, 0(%2)\n"
96 " cmpw %1, %5\n"
97 " bne 2f\n"
98 " stwu %3, 0(%2)\n"
99 "2:\n"
100 ".section .fixup, \"ax\"\n"
101 "3: li %0, 1\n"
102 " b 2b\n"
103 ".previous\n"
104 ".section __ex_table,\"a\"\n"
105 _ASM_ALIGN "\n"
106 _ASM_PTR "1b, 3b\n"
107 ".previous"
108 : "=r"(faulted), "=r"(replaced)
109 : "r"(ip), "r"(new),
110 "0"(faulted), "r"(old)
111 : "memory");
112
113 if (replaced != old && replaced != new)
114 faulted = 2;
115
116 if (!faulted)
117 flush_icache_range(ip, ip + 8);
118
119 return faulted;
120}
121
122notrace int ftrace_update_ftrace_func(ftrace_func_t func)
123{
124 unsigned long ip = (unsigned long)(&ftrace_call);
125 unsigned char old[4], *new;
126 int ret;
127
128 ip += CALL_BACK;
129
130 memcpy(old, &ftrace_call, 4);
131 new = ftrace_call_replace(ip, (unsigned long)func);
132 ret = ftrace_modify_code(ip, old, new);
133
134 return ret;
135}
136
137notrace int ftrace_mcount_set(unsigned long *data)
138{
139 unsigned long ip = (long)(&mcount_call);
140 unsigned long *addr = data;
141 unsigned char old[4], *new;
142
143	/* ip is at the location, but modify code will subtract this */
144 ip += CALL_BACK;
145
146 /*
147 * Replace the mcount stub with a pointer to the
148 * ip recorder function.
149 */
150 memcpy(old, &mcount_call, 4);
151 new = ftrace_call_replace(ip, *addr);
152 *addr = ftrace_modify_code(ip, old, new);
153
154 return 0;
155}
156
157int __init ftrace_dyn_arch_init(void *data)
158{
159 /* This is running in kstop_machine */
160
161 ftrace_mcount_set(data);
162
163 return 0;
164}
165
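The ftrace_call_replace() above builds a PowerPC bl by OR-ing the branch-and-link
opcode with a masked relative offset. The stand-alone sketch below shows the same
idea for illustration only; the 0x03fffffc LI mask, the sign extension and the
"displacement is relative to the branch instruction itself" rule follow the Power
ISA I-form layout and are assumptions made here, not code taken from the patch.

/*
 * Encode/decode a PowerPC I-form branch-and-link: opcode 18 in the top six
 * bits, a 24-bit LI word displacement, AA=0, LK=1.  Not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define PPC_INST_BL	0x48000001u	/* opcode 18, AA=0, LK=1 */
#define PPC_LI_MASK	0x03fffffcu	/* LI field, low two bits zero */

static uint32_t encode_bl(uint32_t ip, uint32_t target)
{
	uint32_t offset = target - ip;	/* must fit in +/- 32 MB */

	return PPC_INST_BL | (offset & PPC_LI_MASK);
}

static int32_t decode_bl_offset(uint32_t insn)
{
	uint32_t off = insn & PPC_LI_MASK;

	if (off & 0x02000000u)		/* sign-extend the 26-bit value */
		off |= 0xfc000000u;
	return (int32_t)off;
}

int main(void)
{
	uint32_t ip = 0xc0001234, target = 0xc0405678;
	uint32_t insn = encode_bl(ip, target);

	printf("bl from 0x%08x to 0x%08x -> insn 0x%08x, offset %+d\n",
	       ip, target, insn, decode_bl_offset(insn));
	return 0;
}
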
diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c
index e31aca9208eb..1882bf419fa6 100644
--- a/arch/powerpc/kernel/io.c
+++ b/arch/powerpc/kernel/io.c
@@ -120,7 +120,8 @@ EXPORT_SYMBOL(_outsl_ns);
120 120
121#define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0) 121#define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
122 122
123void _memset_io(volatile void __iomem *addr, int c, unsigned long n) 123notrace void
124_memset_io(volatile void __iomem *addr, int c, unsigned long n)
124{ 125{
125 void *p = (void __force *)addr; 126 void *p = (void __force *)addr;
126 u32 lc = c; 127 u32 lc = c;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 2f73f705d564..6e01eb0a3315 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_desc);
98 98
99int distribute_irqs = 1; 99int distribute_irqs = 1;
100 100
101static inline unsigned long get_hard_enabled(void) 101static inline notrace unsigned long get_hard_enabled(void)
102{ 102{
103 unsigned long enabled; 103 unsigned long enabled;
104 104
@@ -108,13 +108,13 @@ static inline unsigned long get_hard_enabled(void)
108 return enabled; 108 return enabled;
109} 109}
110 110
111static inline void set_soft_enabled(unsigned long enable) 111static inline notrace void set_soft_enabled(unsigned long enable)
112{ 112{
113 __asm__ __volatile__("stb %0,%1(13)" 113 __asm__ __volatile__("stb %0,%1(13)"
114 : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); 114 : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
115} 115}
116 116
117void raw_local_irq_restore(unsigned long en) 117notrace void raw_local_irq_restore(unsigned long en)
118{ 118{
119 /* 119 /*
120 * get_paca()->soft_enabled = en; 120 * get_paca()->soft_enabled = en;
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 5112a4aa801d..22f8e2bacd32 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -47,6 +47,11 @@
47#include <asm/kgdb.h> 47#include <asm/kgdb.h>
48#endif 48#endif
49 49
50#ifdef CONFIG_FTRACE
51extern void _mcount(void);
52EXPORT_SYMBOL(_mcount);
53#endif
54
50extern void bootx_init(unsigned long r4, unsigned long phys); 55extern void bootx_init(unsigned long r4, unsigned long phys);
51 56
52int boot_cpuid; 57int boot_cpuid;
@@ -81,7 +86,7 @@ int ucache_bsize;
81 * from the address that it was linked at, so we must use RELOC/PTRRELOC 86 * from the address that it was linked at, so we must use RELOC/PTRRELOC
82 * to access static data (including strings). -- paulus 87 * to access static data (including strings). -- paulus
83 */ 88 */
84unsigned long __init early_init(unsigned long dt_ptr) 89notrace unsigned long __init early_init(unsigned long dt_ptr)
85{ 90{
86 unsigned long offset = reloc_offset(); 91 unsigned long offset = reloc_offset();
87 struct cpu_spec *spec; 92 struct cpu_spec *spec;
@@ -111,7 +116,7 @@ unsigned long __init early_init(unsigned long dt_ptr)
111 * This is called very early on the boot process, after a minimal 116 * This is called very early on the boot process, after a minimal
112 * MMU environment has been set up but before MMU_init is called. 117 * MMU environment has been set up but before MMU_init is called.
113 */ 118 */
114void __init machine_init(unsigned long dt_ptr, unsigned long phys) 119notrace void __init machine_init(unsigned long dt_ptr, unsigned long phys)
115{ 120{
116 /* Enable early debugging if any specified (see udbg.h) */ 121 /* Enable early debugging if any specified (see udbg.h) */
117 udbg_early_init(); 122 udbg_early_init();
@@ -133,7 +138,7 @@ void __init machine_init(unsigned long dt_ptr, unsigned long phys)
133 138
134#ifdef CONFIG_BOOKE_WDT 139#ifdef CONFIG_BOOKE_WDT
135/* Checks wdt=x and wdt_period=xx command-line option */ 140/* Checks wdt=x and wdt_period=xx command-line option */
136int __init early_parse_wdt(char *p) 141notrace int __init early_parse_wdt(char *p)
137{ 142{
138 if (p && strncmp(p, "0", 1) != 0) 143 if (p && strncmp(p, "0", 1) != 0)
139 booke_wdt_enabled = 1; 144 booke_wdt_enabled = 1;
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 098fd96a394a..277bf18cbbcc 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -85,6 +85,11 @@ struct ppc64_caches ppc64_caches = {
85}; 85};
86EXPORT_SYMBOL_GPL(ppc64_caches); 86EXPORT_SYMBOL_GPL(ppc64_caches);
87 87
88#ifdef CONFIG_FTRACE
89extern void _mcount(void);
90EXPORT_SYMBOL(_mcount);
91#endif
92
88/* 93/*
89 * These are used in binfmt_elf.c to put aux entries on the stack 94 * These are used in binfmt_elf.c to put aux entries on the stack
90 * for each elf executable being started. 95 * for each elf executable being started.
diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile
index 4d72c8f72159..89774177b209 100644
--- a/arch/powerpc/platforms/powermac/Makefile
+++ b/arch/powerpc/platforms/powermac/Makefile
@@ -1,5 +1,10 @@
1CFLAGS_bootx_init.o += -fPIC 1CFLAGS_bootx_init.o += -fPIC
2 2
3ifdef CONFIG_FTRACE
4# Do not trace early boot code
5CFLAGS_REMOVE_bootx_init.o = -pg
6endif
7
3obj-y += pic.o setup.o time.o feature.o pci.o \ 8obj-y += pic.o setup.o time.o feature.o pci.o \
4 sleep.o low_i2c.o cache.o pfunc_core.o \ 9 sleep.o low_i2c.o cache.o pfunc_core.o \
5 pfunc_base.o 10 pfunc_base.o
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index eb36f3b746b8..fca9246470b1 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -11,6 +11,8 @@ config SPARC
11config SPARC64 11config SPARC64
12 bool 12 bool
13 default y 13 default y
14 select HAVE_DYNAMIC_FTRACE
15 select HAVE_FTRACE
14 select HAVE_IDE 16 select HAVE_IDE
15 select HAVE_LMB 17 select HAVE_LMB
16 select HAVE_ARCH_KGDB 18 select HAVE_ARCH_KGDB
diff --git a/arch/sparc64/Kconfig.debug b/arch/sparc64/Kconfig.debug
index 6a4d28a4076d..d6d32d178fc8 100644
--- a/arch/sparc64/Kconfig.debug
+++ b/arch/sparc64/Kconfig.debug
@@ -33,7 +33,7 @@ config DEBUG_PAGEALLOC
33 33
34config MCOUNT 34config MCOUNT
35 bool 35 bool
36 depends on STACK_DEBUG 36 depends on STACK_DEBUG || FTRACE
37 default y 37 default y
38 38
39config FRAME_POINTER 39config FRAME_POINTER
diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile
index ec4f5ebb1ca6..418b5782096e 100644
--- a/arch/sparc64/kernel/Makefile
+++ b/arch/sparc64/kernel/Makefile
@@ -14,6 +14,7 @@ obj-y := process.o setup.o cpu.o idprom.o \
14 power.o sbus.o sparc64_ksyms.o chmc.o \ 14 power.o sbus.o sparc64_ksyms.o chmc.o \
15 visemul.o prom.o of_device.o hvapi.o sstate.o mdesc.o 15 visemul.o prom.o of_device.o hvapi.o sstate.o mdesc.o
16 16
17obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
17obj-$(CONFIG_STACKTRACE) += stacktrace.o 18obj-$(CONFIG_STACKTRACE) += stacktrace.o
18obj-$(CONFIG_PCI) += ebus.o pci_common.o \ 19obj-$(CONFIG_PCI) += ebus.o pci_common.o \
19 pci_psycho.o pci_sabre.o pci_schizo.o \ 20 pci_psycho.o pci_sabre.o pci_schizo.o \
diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c
new file mode 100644
index 000000000000..f449e6df6c4a
--- /dev/null
+++ b/arch/sparc64/kernel/ftrace.c
@@ -0,0 +1,99 @@
1#include <linux/spinlock.h>
2#include <linux/hardirq.h>
3#include <linux/ftrace.h>
4#include <linux/percpu.h>
5#include <linux/init.h>
6#include <linux/list.h>
7
8static const u32 ftrace_nop = 0x01000000;
9
10notrace int ftrace_ip_converted(unsigned long ip)
11{
12 u32 insn = *(u32 *) ip;
13
14 return (insn == ftrace_nop);
15}
16
17notrace unsigned char *ftrace_nop_replace(void)
18{
19 return (char *)&ftrace_nop;
20}
21
22notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
23{
24 static u32 call;
25 s32 off;
26
27 off = ((s32)addr - (s32)ip);
28 call = 0x40000000 | ((u32)off >> 2);
29
30 return (unsigned char *) &call;
31}
32
33notrace int
34ftrace_modify_code(unsigned long ip, unsigned char *old_code,
35 unsigned char *new_code)
36{
37 u32 old = *(u32 *)old_code;
38 u32 new = *(u32 *)new_code;
39 u32 replaced;
40 int faulted;
41
42 __asm__ __volatile__(
43 "1: cas [%[ip]], %[old], %[new]\n"
44 " flush %[ip]\n"
45 " mov 0, %[faulted]\n"
46 "2:\n"
47 " .section .fixup,#alloc,#execinstr\n"
48 " .align 4\n"
49 "3: sethi %%hi(2b), %[faulted]\n"
50 " jmpl %[faulted] + %%lo(2b), %%g0\n"
51 " mov 1, %[faulted]\n"
52 " .previous\n"
53 " .section __ex_table,\"a\"\n"
54 " .align 4\n"
55 " .word 1b, 3b\n"
56 " .previous\n"
57 : "=r" (replaced), [faulted] "=r" (faulted)
58 : [new] "0" (new), [old] "r" (old), [ip] "r" (ip)
59 : "memory");
60
61 if (replaced != old && replaced != new)
62 faulted = 2;
63
64 return faulted;
65}
66
67notrace int ftrace_update_ftrace_func(ftrace_func_t func)
68{
69 unsigned long ip = (unsigned long)(&ftrace_call);
70 unsigned char old[4], *new;
71
72 memcpy(old, &ftrace_call, 4);
73 new = ftrace_call_replace(ip, (unsigned long)func);
74 return ftrace_modify_code(ip, old, new);
75}
76
77notrace int ftrace_mcount_set(unsigned long *data)
78{
79 unsigned long ip = (long)(&mcount_call);
80 unsigned long *addr = data;
81 unsigned char old[4], *new;
82
83 /*
84 * Replace the mcount stub with a pointer to the
85 * ip recorder function.
86 */
87 memcpy(old, &mcount_call, 4);
88 new = ftrace_call_replace(ip, *addr);
89 *addr = ftrace_modify_code(ip, old, new);
90
91 return 0;
92}
93
94
95int __init ftrace_dyn_arch_init(void *data)
96{
97 ftrace_mcount_set(data);
98 return 0;
99}
diff --git a/arch/sparc64/lib/mcount.S b/arch/sparc64/lib/mcount.S
index 9e4534b485c7..7735a7a60533 100644
--- a/arch/sparc64/lib/mcount.S
+++ b/arch/sparc64/lib/mcount.S
@@ -28,10 +28,13 @@ ovstack:
28 .skip OVSTACKSIZE 28 .skip OVSTACKSIZE
29#endif 29#endif
30 .text 30 .text
31 .align 32 31 .align 32
32 .globl mcount, _mcount 32 .globl _mcount
33mcount: 33 .type _mcount,#function
34 .globl mcount
35 .type mcount,#function
34_mcount: 36_mcount:
37mcount:
35#ifdef CONFIG_STACK_DEBUG 38#ifdef CONFIG_STACK_DEBUG
36 /* 39 /*
37 * Check whether %sp is dangerously low. 40 * Check whether %sp is dangerously low.
@@ -55,6 +58,53 @@ _mcount:
55 or %g3, %lo(panicstring), %o0 58 or %g3, %lo(panicstring), %o0
56 call prom_halt 59 call prom_halt
57 nop 60 nop
611:
62#endif
63#ifdef CONFIG_FTRACE
64#ifdef CONFIG_DYNAMIC_FTRACE
65 mov %o7, %o0
66 .globl mcount_call
67mcount_call:
68 call ftrace_stub
69 mov %o0, %o7
70#else
71 sethi %hi(ftrace_trace_function), %g1
72 sethi %hi(ftrace_stub), %g2
73 ldx [%g1 + %lo(ftrace_trace_function)], %g1
74 or %g2, %lo(ftrace_stub), %g2
75 cmp %g1, %g2
76 be,pn %icc, 1f
77 mov %i7, %o1
78 jmpl %g1, %g0
79 mov %o7, %o0
80 /* not reached */
811:
58#endif 82#endif
591: retl 83#endif
84 retl
60 nop 85 nop
86 .size _mcount,.-_mcount
87 .size mcount,.-mcount
88
89#ifdef CONFIG_FTRACE
90 .globl ftrace_stub
91 .type ftrace_stub,#function
92ftrace_stub:
93 retl
94 nop
95 .size ftrace_stub,.-ftrace_stub
96#ifdef CONFIG_DYNAMIC_FTRACE
97 .globl ftrace_caller
98 .type ftrace_caller,#function
99ftrace_caller:
100 mov %i7, %o1
101 mov %o7, %o0
102 .globl ftrace_call
103ftrace_call:
104 call ftrace_stub
105 mov %o0, %o7
106 retl
107 nop
108 .size ftrace_caller,.-ftrace_caller
109#endif
110#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52e18e6d2ba0..b0937c03af3c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,8 @@ config X86
23 select HAVE_OPROFILE 23 select HAVE_OPROFILE
24 select HAVE_KPROBES 24 select HAVE_KPROBES
25 select HAVE_KRETPROBES 25 select HAVE_KRETPROBES
26 select HAVE_DYNAMIC_FTRACE
27 select HAVE_FTRACE
26 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 28 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
27 select HAVE_ARCH_KGDB if !X86_VOYAGER 29 select HAVE_ARCH_KGDB if !X86_VOYAGER
28 30
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 18363374d51a..f7169edfbeab 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -172,6 +172,34 @@ config IOMMU_LEAK
172 Add a simple leak tracer to the IOMMU code. This is useful when you 172 Add a simple leak tracer to the IOMMU code. This is useful when you
173 are debugging a buggy device driver that leaks IOMMU mappings. 173 are debugging a buggy device driver that leaks IOMMU mappings.
174 174
175config MMIOTRACE_HOOKS
176 bool
177
178config MMIOTRACE
179 bool "Memory mapped IO tracing"
180 depends on DEBUG_KERNEL && PCI
181 select TRACING
182 select MMIOTRACE_HOOKS
183 default y
184 help
185 Mmiotrace traces Memory Mapped I/O access and is meant for
186 debugging and reverse engineering. It is called from the ioremap
187 implementation and works via page faults. Tracing is disabled by
188 default and can be enabled at run-time.
189
190 See Documentation/tracers/mmiotrace.txt.
191 If you are not helping to develop drivers, say N.
192
193config MMIOTRACE_TEST
194 tristate "Test module for mmiotrace"
195 depends on MMIOTRACE && m
196 help
197 This is a dumb module for testing mmiotrace. It is very dangerous
198 as it will write garbage to IO memory starting at a given address.
199 However, it should be safe to use on e.g. an unused portion of VRAM.
200
201 Say N, unless you absolutely know what you are doing.
202
175# 203#
176# IO delay types: 204# IO delay types:
177# 205#
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..739d49acd2f1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -6,6 +6,13 @@ extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
6 6
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE
10# Do not profile debug utilities
11CFLAGS_REMOVE_tsc_64.o = -pg
12CFLAGS_REMOVE_tsc_32.o = -pg
13CFLAGS_REMOVE_rtc.o = -pg
14endif
15
9# 16#
10# vsyscalls (which work on the user stack) should have 17# vsyscalls (which work on the user stack) should have
11# no stack-protector checks: 18# no stack-protector checks:
@@ -56,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
56obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o 63obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
57obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o 64obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
58obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 65obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
66obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
59obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 67obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
60obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 68obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
61obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 69obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65c7857a90dd..2763cb37b553 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,6 +1,6 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/spinlock.h> 3#include <linux/mutex.h>
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/kprobes.h> 5#include <linux/kprobes.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
143#ifdef CONFIG_X86_64 143#ifdef CONFIG_X86_64
144 144
145extern char __vsyscall_0; 145extern char __vsyscall_0;
146static inline const unsigned char*const * find_nop_table(void) 146const unsigned char *const *find_nop_table(void)
147{ 147{
148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || 148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; 149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
162 { -1, NULL } 162 { -1, NULL }
163}; 163};
164 164
165static const unsigned char*const * find_nop_table(void) 165const unsigned char *const *find_nop_table(void)
166{ 166{
167 const unsigned char *const *noptable = intel_nops; 167 const unsigned char *const *noptable = intel_nops;
168 int i; 168 int i;
@@ -279,7 +279,7 @@ struct smp_alt_module {
279 struct list_head next; 279 struct list_head next;
280}; 280};
281static LIST_HEAD(smp_alt_modules); 281static LIST_HEAD(smp_alt_modules);
282static DEFINE_SPINLOCK(smp_alt); 282static DEFINE_MUTEX(smp_alt);
283static int smp_mode = 1; /* protected by smp_alt */ 283static int smp_mode = 1; /* protected by smp_alt */
284 284
285void alternatives_smp_module_add(struct module *mod, char *name, 285void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
312 __func__, smp->locks, smp->locks_end, 312 __func__, smp->locks, smp->locks_end,
313 smp->text, smp->text_end, smp->name); 313 smp->text, smp->text_end, smp->name);
314 314
315 spin_lock(&smp_alt); 315 mutex_lock(&smp_alt);
316 list_add_tail(&smp->next, &smp_alt_modules); 316 list_add_tail(&smp->next, &smp_alt_modules);
317 if (boot_cpu_has(X86_FEATURE_UP)) 317 if (boot_cpu_has(X86_FEATURE_UP))
318 alternatives_smp_unlock(smp->locks, smp->locks_end, 318 alternatives_smp_unlock(smp->locks, smp->locks_end,
319 smp->text, smp->text_end); 319 smp->text, smp->text_end);
320 spin_unlock(&smp_alt); 320 mutex_unlock(&smp_alt);
321} 321}
322 322
323void alternatives_smp_module_del(struct module *mod) 323void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
327 if (smp_alt_once || noreplace_smp) 327 if (smp_alt_once || noreplace_smp)
328 return; 328 return;
329 329
330 spin_lock(&smp_alt); 330 mutex_lock(&smp_alt);
331 list_for_each_entry(item, &smp_alt_modules, next) { 331 list_for_each_entry(item, &smp_alt_modules, next) {
332 if (mod != item->mod) 332 if (mod != item->mod)
333 continue; 333 continue;
334 list_del(&item->next); 334 list_del(&item->next);
335 spin_unlock(&smp_alt); 335 mutex_unlock(&smp_alt);
336 DPRINTK("%s: %s\n", __func__, item->name); 336 DPRINTK("%s: %s\n", __func__, item->name);
337 kfree(item); 337 kfree(item);
338 return; 338 return;
339 } 339 }
340 spin_unlock(&smp_alt); 340 mutex_unlock(&smp_alt);
341} 341}
342 342
343void alternatives_smp_switch(int smp) 343void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
359 return; 359 return;
360 BUG_ON(!smp && (num_online_cpus() > 1)); 360 BUG_ON(!smp && (num_online_cpus() > 1));
361 361
362 spin_lock(&smp_alt); 362 mutex_lock(&smp_alt);
363 363
364 /* 364 /*
365 * Avoid unnecessary switches because it forces JIT based VMs to 365 * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
383 mod->text, mod->text_end); 383 mod->text, mod->text_end);
384 } 384 }
385 smp_mode = smp; 385 smp_mode = smp;
386 spin_unlock(&smp_alt); 386 mutex_unlock(&smp_alt);
387} 387}
388 388
389#endif 389#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c778e4fa55a2..04ea83ccb979 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1110,6 +1110,74 @@ ENDPROC(xen_failsafe_callback)
1110 1110
1111#endif /* CONFIG_XEN */ 1111#endif /* CONFIG_XEN */
1112 1112
1113#ifdef CONFIG_FTRACE
1114#ifdef CONFIG_DYNAMIC_FTRACE
1115
1116ENTRY(mcount)
1117 pushl %eax
1118 pushl %ecx
1119 pushl %edx
1120 movl 0xc(%esp), %eax
1121
1122.globl mcount_call
1123mcount_call:
1124 call ftrace_stub
1125
1126 popl %edx
1127 popl %ecx
1128 popl %eax
1129
1130 ret
1131END(mcount)
1132
1133ENTRY(ftrace_caller)
1134 pushl %eax
1135 pushl %ecx
1136 pushl %edx
1137 movl 0xc(%esp), %eax
1138 movl 0x4(%ebp), %edx
1139
1140.globl ftrace_call
1141ftrace_call:
1142 call ftrace_stub
1143
1144 popl %edx
1145 popl %ecx
1146 popl %eax
1147
1148.globl ftrace_stub
1149ftrace_stub:
1150 ret
1151END(ftrace_caller)
1152
1153#else /* ! CONFIG_DYNAMIC_FTRACE */
1154
1155ENTRY(mcount)
1156 cmpl $ftrace_stub, ftrace_trace_function
1157 jnz trace
1158.globl ftrace_stub
1159ftrace_stub:
1160 ret
1161
1162 /* taken from glibc */
1163trace:
1164 pushl %eax
1165 pushl %ecx
1166 pushl %edx
1167 movl 0xc(%esp), %eax
1168 movl 0x4(%ebp), %edx
1169
1170 call *ftrace_trace_function
1171
1172 popl %edx
1173 popl %ecx
1174 popl %eax
1175
1176 jmp ftrace_stub
1177END(mcount)
1178#endif /* CONFIG_DYNAMIC_FTRACE */
1179#endif /* CONFIG_FTRACE */
1180
1113.section .rodata,"a" 1181.section .rodata,"a"
1114#include "syscall_table_32.S" 1182#include "syscall_table_32.S"
1115 1183
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..fe25e5febca3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -54,6 +54,108 @@
54 54
55 .code64 55 .code64
56 56
57#ifdef CONFIG_FTRACE
58#ifdef CONFIG_DYNAMIC_FTRACE
59ENTRY(mcount)
60
61 subq $0x38, %rsp
62 movq %rax, (%rsp)
63 movq %rcx, 8(%rsp)
64 movq %rdx, 16(%rsp)
65 movq %rsi, 24(%rsp)
66 movq %rdi, 32(%rsp)
67 movq %r8, 40(%rsp)
68 movq %r9, 48(%rsp)
69
70 movq 0x38(%rsp), %rdi
71
72.globl mcount_call
73mcount_call:
74 call ftrace_stub
75
76 movq 48(%rsp), %r9
77 movq 40(%rsp), %r8
78 movq 32(%rsp), %rdi
79 movq 24(%rsp), %rsi
80 movq 16(%rsp), %rdx
81 movq 8(%rsp), %rcx
82 movq (%rsp), %rax
83 addq $0x38, %rsp
84
85 retq
86END(mcount)
87
88ENTRY(ftrace_caller)
89
90 /* taken from glibc */
91 subq $0x38, %rsp
92 movq %rax, (%rsp)
93 movq %rcx, 8(%rsp)
94 movq %rdx, 16(%rsp)
95 movq %rsi, 24(%rsp)
96 movq %rdi, 32(%rsp)
97 movq %r8, 40(%rsp)
98 movq %r9, 48(%rsp)
99
100 movq 0x38(%rsp), %rdi
101 movq 8(%rbp), %rsi
102
103.globl ftrace_call
104ftrace_call:
105 call ftrace_stub
106
107 movq 48(%rsp), %r9
108 movq 40(%rsp), %r8
109 movq 32(%rsp), %rdi
110 movq 24(%rsp), %rsi
111 movq 16(%rsp), %rdx
112 movq 8(%rsp), %rcx
113 movq (%rsp), %rax
114 addq $0x38, %rsp
115
116.globl ftrace_stub
117ftrace_stub:
118 retq
119END(ftrace_caller)
120
121#else /* ! CONFIG_DYNAMIC_FTRACE */
122ENTRY(mcount)
123 cmpq $ftrace_stub, ftrace_trace_function
124 jnz trace
125.globl ftrace_stub
126ftrace_stub:
127 retq
128
129trace:
130 /* taken from glibc */
131 subq $0x38, %rsp
132 movq %rax, (%rsp)
133 movq %rcx, 8(%rsp)
134 movq %rdx, 16(%rsp)
135 movq %rsi, 24(%rsp)
136 movq %rdi, 32(%rsp)
137 movq %r8, 40(%rsp)
138 movq %r9, 48(%rsp)
139
140 movq 0x38(%rsp), %rdi
141 movq 8(%rbp), %rsi
142
143 call *ftrace_trace_function
144
145 movq 48(%rsp), %r9
146 movq 40(%rsp), %r8
147 movq 32(%rsp), %rdi
148 movq 24(%rsp), %rsi
149 movq 16(%rsp), %rdx
150 movq 8(%rsp), %rcx
151 movq (%rsp), %rax
152 addq $0x38, %rsp
153
154 jmp ftrace_stub
155END(mcount)
156#endif /* CONFIG_DYNAMIC_FTRACE */
157#endif /* CONFIG_FTRACE */
158
57#ifndef CONFIG_PREEMPT 159#ifndef CONFIG_PREEMPT
58#define retint_kernel retint_restore_args 160#define retint_kernel retint_restore_args
59#endif 161#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 000000000000..498608c015fb
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,159 @@
1/*
2 * Code for replacing ftrace calls with jumps.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 *
6 * Thanks goes to Ingo Molnar, for suggesting the idea.
7 * Mathieu Desnoyers, for suggesting postponing the modifications.
8 * Arjan van de Ven, for keeping me straight, and explaining to me
9 * the dangers of modifying code on the run.
10 */
11
12#include <linux/spinlock.h>
13#include <linux/hardirq.h>
14#include <linux/ftrace.h>
15#include <linux/percpu.h>
16#include <linux/init.h>
17#include <linux/list.h>
18
19#include <asm/alternative.h>
20
21#define CALL_BACK 5
22
23/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop;
25
26union ftrace_code_union {
27 char code[5];
28 struct {
29 char e8;
30 int offset;
31 } __attribute__((packed));
32};
33
34notrace int ftrace_ip_converted(unsigned long ip)
35{
36 unsigned long save;
37
38 ip -= CALL_BACK;
39 save = *(long *)ip;
40
41 return save == *ftrace_nop;
42}
43
44static int notrace ftrace_calc_offset(long ip, long addr)
45{
46 return (int)(addr - ip);
47}
48
49notrace unsigned char *ftrace_nop_replace(void)
50{
51 return (char *)ftrace_nop;
52}
53
54notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
55{
56 static union ftrace_code_union calc;
57
58 calc.e8 = 0xe8;
59 calc.offset = ftrace_calc_offset(ip, addr);
60
61 /*
62 * No locking needed, this must be called via kstop_machine
63 * which in essence is like running on a uniprocessor machine.
64 */
65 return calc.code;
66}
67
68notrace int
69ftrace_modify_code(unsigned long ip, unsigned char *old_code,
70 unsigned char *new_code)
71{
72 unsigned replaced;
73 unsigned old = *(unsigned *)old_code; /* 4 bytes */
74 unsigned new = *(unsigned *)new_code; /* 4 bytes */
75 unsigned char newch = new_code[4];
76 int faulted = 0;
77
78 /* move the IP back to the start of the call */
79 ip -= CALL_BACK;
80
81 /*
82 * Note: Due to modules and __init, code can
83 * disappear and change, we need to protect against faulting
84 * as well as code changing.
85 *
86 * No real locking needed, this code is run through
87 * kstop_machine.
88 */
89 asm volatile (
90 "1: lock\n"
91 " cmpxchg %3, (%2)\n"
92 " jnz 2f\n"
93 " movb %b4, 4(%2)\n"
94 "2:\n"
95 ".section .fixup, \"ax\"\n"
96 "3: movl $1, %0\n"
97 " jmp 2b\n"
98 ".previous\n"
99 _ASM_EXTABLE(1b, 3b)
100 : "=r"(faulted), "=a"(replaced)
101 : "r"(ip), "r"(new), "r"(newch),
102 "0"(faulted), "a"(old)
103 : "memory");
104 sync_core();
105
106 if (replaced != old && replaced != new)
107 faulted = 2;
108
109 return faulted;
110}
111
112notrace int ftrace_update_ftrace_func(ftrace_func_t func)
113{
114 unsigned long ip = (unsigned long)(&ftrace_call);
115 unsigned char old[5], *new;
116 int ret;
117
118 ip += CALL_BACK;
119
120 memcpy(old, &ftrace_call, 5);
121 new = ftrace_call_replace(ip, (unsigned long)func);
122 ret = ftrace_modify_code(ip, old, new);
123
124 return ret;
125}
126
127notrace int ftrace_mcount_set(unsigned long *data)
128{
129 unsigned long ip = (long)(&mcount_call);
130 unsigned long *addr = data;
131 unsigned char old[5], *new;
132
133	/* ip is at the location, but modify code will subtract this */
134 ip += CALL_BACK;
135
136 /*
137 * Replace the mcount stub with a pointer to the
138 * ip recorder function.
139 */
140 memcpy(old, &mcount_call, 5);
141 new = ftrace_call_replace(ip, *addr);
142 *addr = ftrace_modify_code(ip, old, new);
143
144 return 0;
145}
146
147int __init ftrace_dyn_arch_init(void *data)
148{
149 const unsigned char *const *noptable = find_nop_table();
150
151 /* This is running in kstop_machine */
152
153 ftrace_mcount_set(data);
154
155 ftrace_nop = (unsigned long *)noptable[CALL_BACK];
156
157 return 0;
158}
159
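For reference, the five bytes that ftrace_call_replace() assembles above form a
plain x86 near call: opcode 0xe8 followed by a 32-bit displacement measured from
the end of the instruction (the code above accounts for this by advancing ip by
CALL_BACK before building the replacement). The following is a stand-alone
illustration of that encoding only, not kernel code.

/*
 * Build the 5-byte "call rel32" sequence for a given call site and target,
 * mirroring the ftrace_code_union layout used above.
 */
#include <stdint.h>
#include <stdio.h>

union call_insn {
	unsigned char code[5];
	struct {
		unsigned char e8;
		int32_t offset;
	} __attribute__((packed));
};

static void encode_call(union call_insn *c, uint32_t call_site, uint32_t target)
{
	c->e8 = 0xe8;
	/* rel32 is relative to the instruction following the call. */
	c->offset = (int32_t)(target - (call_site + 5));
}

int main(void)
{
	union call_insn c;
	uint32_t call_site = 0xc0100000u, target = 0xc01234abu;
	int i;

	encode_call(&c, call_site, target);
	printf("call 0x%08x at 0x%08x:", target, call_site);
	for (i = 0; i < 5; i++)
		printf(" %02x", c.code[i]);
	printf("\n");
	return 0;
}
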
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index deb43785e923..29999dbb754c 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,7 +1,14 @@
1#include <linux/ftrace.h>
1#include <linux/module.h> 2#include <linux/module.h>
3
2#include <asm/checksum.h> 4#include <asm/checksum.h>
3#include <asm/desc.h>
4#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/desc.h>
7
8#ifdef CONFIG_FTRACE
9/* mcount is defined in assembly */
10EXPORT_SYMBOL(mcount);
11#endif
5 12
6/* Networking helper routines. */ 13/* Networking helper routines. */
7EXPORT_SYMBOL(csum_partial_copy_generic); 14EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc31..88923fd7a6fc 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -11,6 +11,8 @@
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15
14#include <asm/pgtable.h> 16#include <asm/pgtable.h>
15#include <asm/pgalloc.h> 17#include <asm/pgalloc.h>
16#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
@@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
107 unsigned long page_list[PAGES_NR]; 109 unsigned long page_list[PAGES_NR];
108 void *control_page; 110 void *control_page;
109 111
112 tracer_disable();
113
110 /* Interrupts aren't acceptable while we reboot */ 114 /* Interrupts aren't acceptable while we reboot */
111 local_irq_disable(); 115 local_irq_disable();
112 116
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 576a03db4511..1558fdc174f9 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -11,6 +11,8 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15
14#include <asm/pgtable.h> 16#include <asm/pgtable.h>
15#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
16#include <asm/mmu_context.h> 18#include <asm/mmu_context.h>
@@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
184 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
185 void *control_page; 187 void *control_page;
186 188
189 tracer_disable();
190
187 /* Interrupts aren't acceptable while we reboot */ 191 /* Interrupts aren't acceptable while we reboot */
188 local_irq_disable(); 192 local_irq_disable();
189 193
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6d5483356e74..61f7481c31dd 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -185,7 +185,10 @@ void cpu_idle(void)
185 185
186 local_irq_disable(); 186 local_irq_disable();
187 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 187 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
188 /* Don't trace irqs off for idle */
189 stop_critical_timings();
188 idle(); 190 idle();
191 start_critical_timings();
189 } 192 }
190 tick_nohz_restart_sched_tick(); 193 tick_nohz_restart_sched_tick();
191 preempt_enable_no_resched(); 194 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ac54ff56df80..dc534f40c8d3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -165,7 +165,10 @@ void cpu_idle(void)
165 */ 165 */
166 local_irq_disable(); 166 local_irq_disable();
167 enter_idle(); 167 enter_idle();
168 /* Don't trace irqs off for idle */
169 stop_critical_timings();
168 idle(); 170 idle();
171 start_critical_timings();
169 /* In many cases the interrupt that ended idle 172 /* In many cases the interrupt that ended idle
170 has already called exit_idle. But some idle 173 has already called exit_idle. But some idle
171 loops can be woken up without interrupt. */ 174 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 61efa2f7d564..4063dfa2a02d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -42,7 +42,8 @@
42#include <asm/topology.h> 42#include <asm/topology.h>
43#include <asm/vgtod.h> 43#include <asm/vgtod.h>
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) \
46 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
46#define __syscall_clobber "r11","cx","memory" 47#define __syscall_clobber "r11","cx","memory"
47 48
48/* 49/*
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index f6c05d0410fb..122885bc5f3b 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -1,15 +1,22 @@
1/* Exports for assembly files. 1/* Exports for assembly files.
2 All C exports should go in the respective C files. */ 2 All C exports should go in the respective C files. */
3 3
4#include <linux/ftrace.h>
4#include <linux/module.h> 5#include <linux/module.h>
5#include <net/checksum.h>
6#include <linux/smp.h> 6#include <linux/smp.h>
7 7
8#include <net/checksum.h>
9
8#include <asm/processor.h> 10#include <asm/processor.h>
9#include <asm/uaccess.h>
10#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/uaccess.h>
11#include <asm/desc.h> 13#include <asm/desc.h>
12 14
15#ifdef CONFIG_FTRACE
16/* mcount is defined in assembly */
17EXPORT_SYMBOL(mcount);
18#endif
19
13EXPORT_SYMBOL(kernel_thread); 20EXPORT_SYMBOL(kernel_thread);
14 21
15EXPORT_SYMBOL(__get_user_1); 22EXPORT_SYMBOL(__get_user_1);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 76f60f52a885..84aa2883fe15 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -5,6 +5,7 @@
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr-on-cpu.o
6 6
7lib-y := delay_$(BITS).o 7lib-y := delay_$(BITS).o
8lib-y += thunk_$(BITS).o
8lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o 9lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
9lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
10 11
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644
index 000000000000..650b11e00ecc
--- /dev/null
+++ b/arch/x86/lib/thunk_32.S
@@ -0,0 +1,47 @@
1/*
2 * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
3 * Copyright 2008 by Steven Rostedt, Red Hat, Inc
4 * (inspired by Andi Kleen's thunk_64.S)
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */
7
8 #include <linux/linkage.h>
9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func
31 .globl \name
32\name:
33 pushl %eax
34 pushl %ecx
35 pushl %edx
36 /* Place EIP in the arg1 */
37 movl 3*4(%esp), %eax
38 call \func
39 popl %edx
40 popl %ecx
41 popl %eax
42 ret
43 .endm
44
45 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
46 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index e009251d4e9f..bf9a7d5a5428 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -2,6 +2,7 @@
2 * Save registers before calling assembly functions. This avoids 2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs. 3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs. 4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
5 * Subject to the GNU public license, v.2. No warranty of any kind. 6 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */ 7 */
7 8
@@ -42,8 +43,22 @@
42#endif 43#endif
43 44
44#ifdef CONFIG_TRACE_IRQFLAGS 45#ifdef CONFIG_TRACE_IRQFLAGS
45 thunk trace_hardirqs_on_thunk,trace_hardirqs_on 46 /* put return address in rdi (arg1) */
46 thunk trace_hardirqs_off_thunk,trace_hardirqs_off 47 .macro thunk_ra name,func
48 .globl \name
49\name:
50 CFI_STARTPROC
51 SAVE_ARGS
52	/* SAVE_ARGS pushes 9 elements */
53 /* the next element would be the rip */
54 movq 9*8(%rsp), %rdi
55 call \func
56 jmp restore
57 CFI_ENDPROC
58 .endm
59
60 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
61 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif 62#endif
48 63
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..07dab503c9e3 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,6 +8,11 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15
11ifeq ($(CONFIG_X86_32),y) 16ifeq ($(CONFIG_X86_32),y)
12obj-$(CONFIG_NUMA) += discontig_32.o 17obj-$(CONFIG_NUMA) += discontig_32.o
13else 18else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb6..0a778e3c43ee 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -49,6 +50,16 @@
49#define PF_RSVD (1<<3) 50#define PF_RSVD (1<<3)
50#define PF_INSTR (1<<4) 51#define PF_INSTR (1<<4)
51 52
53static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
54{
55#ifdef CONFIG_MMIOTRACE_HOOKS
56 if (unlikely(is_kmmio_active()))
57 if (kmmio_handler(regs, addr) == 1)
58 return -1;
59#endif
60 return 0;
61}
62
52static inline int notify_page_fault(struct pt_regs *regs) 63static inline int notify_page_fault(struct pt_regs *regs)
53{ 64{
54#ifdef CONFIG_KPROBES 65#ifdef CONFIG_KPROBES
@@ -606,6 +617,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
606 617
607 if (notify_page_fault(regs)) 618 if (notify_page_fault(regs))
608 return; 619 return;
620 if (unlikely(kmmio_fault(regs, address)))
621 return;
609 622
610 /* 623 /*
611 * We fault-in kernel-space virtual memory on-demand. The 624 * We fault-in kernel-space virtual memory on-demand. The
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..f96eca21ad8f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -710,6 +710,8 @@ void mark_rodata_ro(void)
710 unsigned long start = PFN_ALIGN(_text); 710 unsigned long start = PFN_ALIGN(_text);
711 unsigned long size = PFN_ALIGN(_etext) - start; 711 unsigned long size = PFN_ALIGN(_etext) - start;
712 712
713#ifndef CONFIG_DYNAMIC_FTRACE
714 /* Dynamic tracing modifies the kernel text section */
713 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 715 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
714 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 716 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
715 size >> 10); 717 size >> 10);
@@ -722,6 +724,8 @@ void mark_rodata_ro(void)
722 printk(KERN_INFO "Testing CPA: write protecting again\n"); 724 printk(KERN_INFO "Testing CPA: write protecting again\n");
723 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 725 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
724#endif 726#endif
727#endif /* CONFIG_DYNAMIC_FTRACE */
728
725 start += size; 729 start += size;
726 size = (unsigned long)__end_rodata - start; 730 size = (unsigned long)__end_rodata - start;
727 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 731 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 156e6d7b0e32..a5fd2e06f5c9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -766,6 +766,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
766void mark_rodata_ro(void) 766void mark_rodata_ro(void)
767{ 767{
768 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 768 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
769 unsigned long rodata_start =
770 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
771
772#ifdef CONFIG_DYNAMIC_FTRACE
773 /* Dynamic tracing modifies the kernel text section */
774 start = rodata_start;
775#endif
769 776
770 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 777 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
771 (end - start) >> 10); 778 (end - start) >> 10);
@@ -775,8 +782,7 @@ void mark_rodata_ro(void)
775 * The rodata section (but not the kernel text!) should also be 782 * The rodata section (but not the kernel text!) should also be
776 * not-executable. 783 * not-executable.
777 */ 784 */
778 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 785 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
779 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
780 786
781 rodata_test(); 787 rodata_test();
782 788
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2b2bb3f9b683..e92aa461f4d6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -122,10 +123,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
122{ 123{
123 unsigned long pfn, offset, vaddr; 124 unsigned long pfn, offset, vaddr;
124 resource_size_t last_addr; 125 resource_size_t last_addr;
126 const resource_size_t unaligned_phys_addr = phys_addr;
127 const unsigned long unaligned_size = size;
125 struct vm_struct *area; 128 struct vm_struct *area;
126 unsigned long new_prot_val; 129 unsigned long new_prot_val;
127 pgprot_t prot; 130 pgprot_t prot;
128 int retval; 131 int retval;
132 void __iomem *ret_addr;
129 133
130 /* Don't allow wraparound or zero size */ 134 /* Don't allow wraparound or zero size */
131 last_addr = phys_addr + size - 1; 135 last_addr = phys_addr + size - 1;
@@ -233,7 +237,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
233 return NULL; 237 return NULL;
234 } 238 }
235 239
236 return (void __iomem *) (vaddr + offset); 240 ret_addr = (void __iomem *) (vaddr + offset);
241 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
242
243 return ret_addr;
237} 244}
238 245
239/** 246/**
@@ -325,6 +332,8 @@ void iounmap(volatile void __iomem *addr)
325 addr = (volatile void __iomem *) 332 addr = (volatile void __iomem *)
326 (PAGE_MASK & (unsigned long __force)addr); 333 (PAGE_MASK & (unsigned long __force)addr);
327 334
335 mmiotrace_iounmap(addr);
336
328 /* Use the vm area unlocked, assuming the caller 337 /* Use the vm area unlocked, assuming the caller
329 ensures there isn't another iounmap for the same address 338 ensures there isn't another iounmap for the same address
330 in parallel. Reuse of the virtual address is prevented by 339 in parallel. Reuse of the virtual address is prevented by
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
 2 * Benefits from much of the kprobes code
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
 75 * This is basically a dynamic stabbing problem. We could use the
 76 * existing prio tree code, or one of these possibly better
 77 * implementations:
78 * The Interval Skip List: A Data Structure for Finding All Intervals That
79 * Overlap a Point (might be simple)
80 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
164 * We may be in an interrupt or a critical section. Also prefetching may
165 * trigger a page fault. We may be in the middle of a process switch.
166 * We cannot take any locks, because we could already be executing
167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
173 * Interrupts are disabled on entry, as the page fault handler is an
174 * interrupt gate, and they remain disabled throughout this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
269 * and they remain disabled throughout this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
298 * if somebody else is singlestepping across a probe point, flags
299 * will have TF set, in which case, continue the remaining processing
300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. Such events can result only from
360 * programming mistakes, i.e. accessing addresses before the beginning or
361 * past the end of a mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
443 * Actually free the kmmio_fault_page structs via RCU.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
486
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
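
For orientation, here is a minimal sketch of a kmmio client; it is not part of this patch. The struct kmmio_probe fields, the register/unregister calls and the synchronize_rcu() requirement are taken from the kmmio.c code above and its use in mmio-mod.c below; the handler bodies, names and the probed address are placeholders.

#include <linux/module.h>
#include <linux/mmiotrace.h>
#include <linux/rcupdate.h>

static void sample_pre(struct kmmio_probe *p, struct pt_regs *regs,
			unsigned long addr)
{
	/* Called with the faulting page disarmed, before single-stepping. */
}

static void sample_post(struct kmmio_probe *p, unsigned long condition,
			struct pt_regs *regs)
{
	/* Called from the debug trap, after the faulting access completed. */
}

static struct kmmio_probe sample_probe = {
	.addr		= 0,		/* placeholder: an ioremapped virtual address */
	.len		= PAGE_SIZE,
	.pre_handler	= sample_pre,
	.post_handler	= sample_post,
};

static int __init sample_init(void)
{
	return register_kmmio_probe(&sample_probe);	/* -EEXIST if already probed */
}

static void __exit sample_exit(void)
{
	unregister_kmmio_probe(&sample_probe);
	synchronize_rcu();	/* callbacks may run until a grace period has passed */
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");
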
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..e7397e108beb
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,515 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
59#if 0 /* XXX: no way to gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
81
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
150 pr_emerg(NAME "4MB pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
258 /* this should always bring the active_traces count back to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433static void leave_uniprocessor(void)
434{
435 int cpu;
436 int err;
437
438 if (cpus_weight(downed_cpus) == 0)
439 return;
440 pr_notice(NAME "Re-enabling CPUs...\n");
441 for_each_cpu_mask(cpu, downed_cpus) {
442 err = cpu_up(cpu);
443 if (!err)
444 pr_info(NAME "enabled CPU%d.\n", cpu);
445 else
446 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
447 }
448}
449
450#else /* !CONFIG_HOTPLUG_CPU */
451static void enter_uniprocessor(void)
452{
453 if (num_online_cpus() > 1)
454 pr_warning(NAME "multiple CPUs are online, may miss events. "
455 "Suggest booting with maxcpus=1 kernel argument.\n");
456}
457
458static void leave_uniprocessor(void)
459{
460}
461#endif
462
463#if 0 /* XXX: out of order */
464static struct file_operations fops_marker = {
465 .owner = THIS_MODULE,
466 .write = write_marker
467};
468#endif
469
470void enable_mmiotrace(void)
471{
472 mutex_lock(&mmiotrace_mutex);
473 if (is_enabled())
474 goto out;
475
476#if 0 /* XXX: tracing does not support text entries */
477 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
478 &fops_marker);
479 if (!marker_file)
480 pr_err(NAME "marker file creation failed.\n");
481#endif
482
483 if (nommiotrace)
484 pr_info(NAME "MMIO tracing disabled.\n");
485 enter_uniprocessor();
486 spin_lock_irq(&trace_lock);
487 atomic_inc(&mmiotrace_enabled);
488 spin_unlock_irq(&trace_lock);
489 pr_info(NAME "enabled.\n");
490out:
491 mutex_unlock(&mmiotrace_mutex);
492}
493
494void disable_mmiotrace(void)
495{
496 mutex_lock(&mmiotrace_mutex);
497 if (!is_enabled())
498 goto out;
499
500 spin_lock_irq(&trace_lock);
501 atomic_dec(&mmiotrace_enabled);
502 BUG_ON(is_enabled());
503 spin_unlock_irq(&trace_lock);
504
505 clear_trace_list(); /* guarantees: no more kmmio callbacks */
506 leave_uniprocessor();
507 if (marker_file) {
508 debugfs_remove(marker_file);
509 marker_file = NULL;
510 }
511
512 pr_info(NAME "disabled.\n");
513out:
514 mutex_unlock(&mmiotrace_mutex);
515}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..57970f2935c0 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -227,6 +227,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
227 227
228 return pte_offset_kernel(pmd, address); 228 return pte_offset_kernel(pmd, address);
229} 229}
230EXPORT_SYMBOL_GPL(lookup_address);
230 231
231/* 232/*
232 * Set the new pmd in all the pgds we know about: 233 * Set the new pmd in all the pgds we know about:
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
23 * Copyright by Intel Corp., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F introduces a two-byte (extension) opcode */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define the register identifiers encoded in the mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32-bit displacement (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in the traced regions */
27 NOTHING, /* access to another point in the regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
31 OTHERS /* other instructions we cannot intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
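
As an illustration (not part of the patch) of how mmio-mod.c drives this decoder: the faulting instruction pointer is classified with get_ins_type(), and for writes the width and stored value can be recovered immediately, while for reads the value only exists after single-stepping. Everything except the pf_in.h API names is hypothetical.

#include <linux/kernel.h>
#include <linux/ptrace.h>
#include "pf_in.h"

static void sample_decode(struct pt_regs *regs)
{
	unsigned long ip = instruction_pointer(regs);

	switch (get_ins_type(ip)) {
	case REG_WRITE:
		/* the value is still in the source register at pre-handler time */
		pr_info("write, width %u, value 0x%lx\n",
			get_ins_mem_width(ip), get_ins_reg_val(ip, regs));
		break;
	case IMM_WRITE:
		pr_info("write, width %u, value 0x%lx\n",
			get_ins_mem_width(ip), get_ins_imm_val(ip));
		break;
	case REG_READ:
		/* the read value is known only after single-stepping (see post()) */
		break;
	default:
		break;
	}
}
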
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index efa2ba7c6005..1ef0f90813d6 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -23,7 +23,7 @@
23 23
24#define gtod vdso_vsyscall_gtod_data 24#define gtod vdso_vsyscall_gtod_data
25 25
26static long vdso_fallback_gettime(long clock, struct timespec *ts) 26notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
27{ 27{
28 long ret; 28 long ret;
29 asm("syscall" : "=a" (ret) : 29 asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts)
31 return ret; 31 return ret;
32} 32}
33 33
34static inline long vgetns(void) 34notrace static inline long vgetns(void)
35{ 35{
36 long v; 36 long v;
37 cycles_t (*vread)(void); 37 cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
40 return (v * gtod->clock.mult) >> gtod->clock.shift; 40 return (v * gtod->clock.mult) >> gtod->clock.shift;
41} 41}
42 42
43static noinline int do_realtime(struct timespec *ts) 43notrace static noinline int do_realtime(struct timespec *ts)
44{ 44{
45 unsigned long seq, ns; 45 unsigned long seq, ns;
46 do { 46 do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts)
54} 54}
55 55
56/* Copy of the version in kernel/time.c which we cannot directly access */ 56/* Copy of the version in kernel/time.c which we cannot directly access */
57static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) 57notrace static void
58vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
58{ 59{
59 while (nsec >= NSEC_PER_SEC) { 60 while (nsec >= NSEC_PER_SEC) {
60 nsec -= NSEC_PER_SEC; 61 nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
68 ts->tv_nsec = nsec; 69 ts->tv_nsec = nsec;
69} 70}
70 71
71static noinline int do_monotonic(struct timespec *ts) 72notrace static noinline int do_monotonic(struct timespec *ts)
72{ 73{
73 unsigned long seq, ns, secs; 74 unsigned long seq, ns, secs;
74 do { 75 do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts)
82 return 0; 83 return 0;
83} 84}
84 85
85int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 86notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
86{ 87{
87 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) 88 if (likely(gtod->sysctl_enabled && gtod->clock.vread))
88 switch (clock) { 89 switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
96int clock_gettime(clockid_t, struct timespec *) 97int clock_gettime(clockid_t, struct timespec *)
97 __attribute__((weak, alias("__vdso_clock_gettime"))); 98 __attribute__((weak, alias("__vdso_clock_gettime")));
98 99
99int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 100notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
100{ 101{
101 long ret; 102 long ret;
102 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { 103 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index c8097f17f8a9..9fbc6b20026b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -13,7 +13,8 @@
13#include <asm/vgtod.h> 13#include <asm/vgtod.h>
14#include "vextern.h" 14#include "vextern.h"
15 15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16notrace long
17__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
17{ 18{
18 unsigned int p; 19 unsigned int p;
19 20
diff --git a/include/asm-powerpc/hw_irq.h b/include/asm-powerpc/hw_irq.h
index ad8c9f7fd0e3..f75a5fc64d2e 100644
--- a/include/asm-powerpc/hw_irq.h
+++ b/include/asm-powerpc/hw_irq.h
@@ -59,6 +59,11 @@ extern void iseries_handle_interrupts(void);
59 get_paca()->hard_enabled = 0; \ 59 get_paca()->hard_enabled = 0; \
60 } while(0) 60 } while(0)
61 61
62static inline int irqs_disabled_flags(unsigned long flags)
63{
64 return flags == 0;
65}
66
62#else 67#else
63 68
64#if defined(CONFIG_BOOKE) 69#if defined(CONFIG_BOOKE)
@@ -113,6 +118,11 @@ static inline void local_irq_save_ptr(unsigned long *flags)
113#define hard_irq_enable() local_irq_enable() 118#define hard_irq_enable() local_irq_enable()
114#define hard_irq_disable() local_irq_disable() 119#define hard_irq_disable() local_irq_disable()
115 120
121static inline int irqs_disabled_flags(unsigned long flags)
122{
123 return (flags & MSR_EE) == 0;
124}
125
116#endif /* CONFIG_PPC64 */ 126#endif /* CONFIG_PPC64 */
117 127
118/* 128/*
diff --git a/include/asm-x86/alternative.h b/include/asm-x86/alternative.h
index 1f6a9ca10126..f6aa18eadf71 100644
--- a/include/asm-x86/alternative.h
+++ b/include/asm-x86/alternative.h
@@ -72,6 +72,8 @@ static inline void alternatives_smp_module_del(struct module *mod) {}
72static inline void alternatives_smp_switch(int smp) {} 72static inline void alternatives_smp_switch(int smp) {}
73#endif /* CONFIG_SMP */ 73#endif /* CONFIG_SMP */
74 74
75const unsigned char *const *find_nop_table(void);
76
75/* 77/*
76 * Alternative instructions for different CPU types or capabilities. 78 * Alternative instructions for different CPU types or capabilities.
77 * 79 *
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index c242527f970e..24d71b1eb189 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void)
179 * have a reliable stack. x86_64 only. 179 * have a reliable stack. x86_64 only.
180 */ 180 */
181#define SWAPGS_UNSAFE_STACK swapgs 181#define SWAPGS_UNSAFE_STACK swapgs
182#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
183#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
184#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk 182#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
185#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ 183#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
186 TRACE_IRQS_ON; \ 184 TRACE_IRQS_ON; \
@@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void)
192 TRACE_IRQS_OFF; 190 TRACE_IRQS_OFF;
193 191
194#else 192#else
195#define ARCH_TRACE_IRQS_ON \
196 pushl %eax; \
197 pushl %ecx; \
198 pushl %edx; \
199 call trace_hardirqs_on; \
200 popl %edx; \
201 popl %ecx; \
202 popl %eax;
203
204#define ARCH_TRACE_IRQS_OFF \
205 pushl %eax; \
206 pushl %ecx; \
207 pushl %edx; \
208 call trace_hardirqs_off; \
209 popl %edx; \
210 popl %ecx; \
211 popl %eax;
212
213#define ARCH_LOCKDEP_SYS_EXIT \ 193#define ARCH_LOCKDEP_SYS_EXIT \
214 pushl %eax; \ 194 pushl %eax; \
215 pushl %ecx; \ 195 pushl %ecx; \
@@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void)
223#endif 203#endif
224 204
225#ifdef CONFIG_TRACE_IRQFLAGS 205#ifdef CONFIG_TRACE_IRQFLAGS
226# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON 206# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
227# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF 207# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
228#else 208#else
229# define TRACE_IRQS_ON 209# define TRACE_IRQS_ON
230# define TRACE_IRQS_OFF 210# define TRACE_IRQS_OFF
diff --git a/include/asm-x86/vsyscall.h b/include/asm-x86/vsyscall.h
index 17b3700949bf..6b66ff905af0 100644
--- a/include/asm-x86/vsyscall.h
+++ b/include/asm-x86/vsyscall.h
@@ -24,7 +24,8 @@ enum vsyscall_num {
24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) 24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __section_vsyscall_clock __attribute__ \ 25#define __section_vsyscall_clock __attribute__ \
26 ((unused, __section__ (".vsyscall_clock"),aligned(16))) 26 ((unused, __section__ (".vsyscall_clock"),aligned(16)))
27#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) 27#define __vsyscall_fn \
28 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
28 29
29#define VGETCPU_RDTSCP 1 30#define VGETCPU_RDTSCP 1
30#define VGETCPU_LSL 2 31#define VGETCPU_LSL 2
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
new file mode 100644
index 000000000000..922e23d0196f
--- /dev/null
+++ b/include/linux/ftrace.h
@@ -0,0 +1,132 @@
1#ifndef _LINUX_FTRACE_H
2#define _LINUX_FTRACE_H
3
4#ifdef CONFIG_FTRACE
5
6#include <linux/linkage.h>
7#include <linux/fs.h>
8
9extern int ftrace_enabled;
10extern int
11ftrace_enable_sysctl(struct ctl_table *table, int write,
12 struct file *filp, void __user *buffer, size_t *lenp,
13 loff_t *ppos);
14
15typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
16
17struct ftrace_ops {
18 ftrace_func_t func;
19 struct ftrace_ops *next;
20};
21
22/*
23 * The ftrace_ops must be static and should also
24 * be read_mostly. These functions do modify read_mostly variables
25 * so use them sparingly. Never free an ftrace_ops or modify the
26 * next pointer after it has been registered. Even after unregistering
27 * it, the next pointer may still be used internally.
28 */
29int register_ftrace_function(struct ftrace_ops *ops);
30int unregister_ftrace_function(struct ftrace_ops *ops);
31void clear_ftrace_function(void);
32
33extern void ftrace_stub(unsigned long a0, unsigned long a1);
34extern void mcount(void);
35
36#else /* !CONFIG_FTRACE */
37# define register_ftrace_function(ops) do { } while (0)
38# define unregister_ftrace_function(ops) do { } while (0)
39# define clear_ftrace_function(ops) do { } while (0)
40#endif /* CONFIG_FTRACE */
41
42#ifdef CONFIG_DYNAMIC_FTRACE
43# define FTRACE_HASHBITS 10
44# define FTRACE_HASHSIZE (1<<FTRACE_HASHBITS)
45
46enum {
47 FTRACE_FL_FREE = (1 << 0),
48 FTRACE_FL_FAILED = (1 << 1),
49 FTRACE_FL_FILTER = (1 << 2),
50 FTRACE_FL_ENABLED = (1 << 3),
51};
52
53struct dyn_ftrace {
54 struct hlist_node node;
55 unsigned long ip;
56 unsigned long flags;
57};
58
59int ftrace_force_update(void);
60void ftrace_set_filter(unsigned char *buf, int len, int reset);
61
62/* defined in arch */
63extern int ftrace_ip_converted(unsigned long ip);
64extern unsigned char *ftrace_nop_replace(void);
65extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
66extern int ftrace_dyn_arch_init(void *data);
67extern int ftrace_mcount_set(unsigned long *data);
68extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
69 unsigned char *new_code);
70extern int ftrace_update_ftrace_func(ftrace_func_t func);
71extern void ftrace_caller(void);
72extern void ftrace_call(void);
73extern void mcount_call(void);
74#else
75# define ftrace_force_update() ({ 0; })
76# define ftrace_set_filter(buf, len, reset) do { } while (0)
77#endif
78
79/* totally disable ftrace - can not re-enable after this */
80void ftrace_kill(void);
81
82static inline void tracer_disable(void)
83{
84#ifdef CONFIG_FTRACE
85 ftrace_enabled = 0;
86#endif
87}
88
89#ifdef CONFIG_FRAME_POINTER
90/* TODO: need to fix this for ARM */
91# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
92# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
93# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
94# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
95# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
96# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
97# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
98#else
99# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
100# define CALLER_ADDR1 0UL
101# define CALLER_ADDR2 0UL
102# define CALLER_ADDR3 0UL
103# define CALLER_ADDR4 0UL
104# define CALLER_ADDR5 0UL
105# define CALLER_ADDR6 0UL
106#endif
107
108#ifdef CONFIG_IRQSOFF_TRACER
109 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
110 extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
111#else
112# define time_hardirqs_on(a0, a1) do { } while (0)
113# define time_hardirqs_off(a0, a1) do { } while (0)
114#endif
115
116#ifdef CONFIG_PREEMPT_TRACER
117 extern void trace_preempt_on(unsigned long a0, unsigned long a1);
118 extern void trace_preempt_off(unsigned long a0, unsigned long a1);
119#else
120# define trace_preempt_on(a0, a1) do { } while (0)
121# define trace_preempt_off(a0, a1) do { } while (0)
122#endif
123
124#ifdef CONFIG_CONTEXT_SWITCH_TRACER
125extern void
126ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
127#else
128static inline void
129ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
130#endif
131
132#endif /* _LINUX_FTRACE_H */
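Taken together, the header above is the whole client API: fill in a static ftrace_ops and register it. A minimal sketch, not part of this patch (all my_* names are invented; CONFIG_FTRACE=y is assumed):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ftrace.h>

static unsigned long my_hits;	/* racy on SMP; fine for a sketch */

static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* Runs on every traced function entry: keep it minimal. */
	my_hits++;
}

/* Must be static and never freed, as the header comment requires. */
static struct ftrace_ops my_ops __read_mostly = {
	.func = my_trace_func,
};

static int __init my_tracer_init(void)
{
	return register_ftrace_function(&my_ops);
}

static void __exit my_tracer_exit(void)
{
	unregister_ftrace_function(&my_ops);
	printk(KERN_INFO "my_tracer: %lu hits\n", my_hits);
}

module_init(my_tracer_init);
module_exit(my_tracer_exit);
MODULE_LICENSE("GPL");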
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index e600c4e9b8c5..2b1c2e58566e 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -12,10 +12,10 @@
12#define _LINUX_TRACE_IRQFLAGS_H 12#define _LINUX_TRACE_IRQFLAGS_H
13 13
14#ifdef CONFIG_TRACE_IRQFLAGS 14#ifdef CONFIG_TRACE_IRQFLAGS
15 extern void trace_hardirqs_on(void);
16 extern void trace_hardirqs_off(void);
17 extern void trace_softirqs_on(unsigned long ip); 15 extern void trace_softirqs_on(unsigned long ip);
18 extern void trace_softirqs_off(unsigned long ip); 16 extern void trace_softirqs_off(unsigned long ip);
17 extern void trace_hardirqs_on(void);
18 extern void trace_hardirqs_off(void);
19# define trace_hardirq_context(p) ((p)->hardirq_context) 19# define trace_hardirq_context(p) ((p)->hardirq_context)
20# define trace_softirq_context(p) ((p)->softirq_context) 20# define trace_softirq_context(p) ((p)->softirq_context)
21# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) 21# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled)
@@ -41,6 +41,15 @@
41# define INIT_TRACE_IRQFLAGS 41# define INIT_TRACE_IRQFLAGS
42#endif 42#endif
43 43
44#if defined(CONFIG_IRQSOFF_TRACER) || \
45 defined(CONFIG_PREEMPT_TRACER)
46 extern void stop_critical_timings(void);
47 extern void start_critical_timings(void);
48#else
49# define stop_critical_timings() do { } while (0)
50# define start_critical_timings() do { } while (0)
51#endif
52
44#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 53#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
45 54
46#include <asm/irqflags.h> 55#include <asm/irqflags.h>
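stop_critical_timings()/start_critical_timings() let code exclude a known-slow, uninteresting region from the irqs-off/preempt-off maximum search; the printk.c hunk further down wraps call_console_drivers() in exactly this way. A small illustrative sketch:

#include <linux/irqflags.h>

static void slow_uninteresting_section(void)
{
	stop_critical_timings();	/* pause latency measurement */
	/* ... long-running work that would otherwise skew the max ... */
	start_critical_timings();	/* resume latency measurement */
}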
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 2119610b24f8..14f329c64ba8 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -3,6 +3,8 @@
3 3
4#include <asm/linkage.h> 4#include <asm/linkage.h>
5 5
6#define notrace __attribute__((no_instrument_function))
7
6#ifdef __cplusplus 8#ifdef __cplusplus
7#define CPP_ASMLINKAGE extern "C" 9#define CPP_ASMLINKAGE extern "C"
8#else 10#else
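With notrace now defined globally in linkage.h, anything on the tracer's own path (or in the vDSO, as in the hunks above) can opt out of the compiler's -pg instrumentation. A trivial illustration, not from the patch:

#include <linux/linkage.h>

/* A helper called from a tracer callback must itself be notrace,
 * otherwise the mcount hook would recurse. */
static notrace unsigned long tracer_safe_hash(unsigned long ip)
{
	return (ip >> 4) ^ ip;	/* built without an mcount prologue */
}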
diff --git a/include/linux/marker.h b/include/linux/marker.h
index 430f6adf9762..1290653f9241 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -44,8 +44,8 @@ struct marker {
44 */ 44 */
45 char state; /* Marker state. */ 45 char state; /* Marker state. */
46 char ptype; /* probe type : 0 : single, 1 : multi */ 46 char ptype; /* probe type : 0 : single, 1 : multi */
47 void (*call)(const struct marker *mdata, /* Probe wrapper */ 47 /* Probe wrapper */
48 void *call_private, const char *fmt, ...); 48 void (*call)(const struct marker *mdata, void *call_private, ...);
49 struct marker_probe_closure single; 49 struct marker_probe_closure single;
50 struct marker_probe_closure *multi; 50 struct marker_probe_closure *multi;
51} __attribute__((aligned(8))); 51} __attribute__((aligned(8)));
@@ -58,8 +58,12 @@ struct marker {
58 * Make sure the alignment of the structure in the __markers section will 58 * Make sure the alignment of the structure in the __markers section will
59 * not add unwanted padding between the beginning of the section and the 59 * not add unwanted padding between the beginning of the section and the
60 * structure. Force alignment to the same alignment as the section start. 60 * structure. Force alignment to the same alignment as the section start.
61 *
62 * The "generic" argument controls which marker enabling mechanism must be used.
63 * If generic is true, a variable read is used.
64 * If generic is false, immediate values are used.
61 */ 65 */
62#define __trace_mark(name, call_private, format, args...) \ 66#define __trace_mark(generic, name, call_private, format, args...) \
63 do { \ 67 do { \
64 static const char __mstrtab_##name[] \ 68 static const char __mstrtab_##name[] \
65 __attribute__((section("__markers_strings"))) \ 69 __attribute__((section("__markers_strings"))) \
@@ -72,15 +76,14 @@ struct marker {
72 __mark_check_format(format, ## args); \ 76 __mark_check_format(format, ## args); \
73 if (unlikely(__mark_##name.state)) { \ 77 if (unlikely(__mark_##name.state)) { \
74 (*__mark_##name.call) \ 78 (*__mark_##name.call) \
75 (&__mark_##name, call_private, \ 79 (&__mark_##name, call_private, ## args);\
76 format, ## args); \
77 } \ 80 } \
78 } while (0) 81 } while (0)
79 82
80extern void marker_update_probe_range(struct marker *begin, 83extern void marker_update_probe_range(struct marker *begin,
81 struct marker *end); 84 struct marker *end);
82#else /* !CONFIG_MARKERS */ 85#else /* !CONFIG_MARKERS */
83#define __trace_mark(name, call_private, format, args...) \ 86#define __trace_mark(generic, name, call_private, format, args...) \
84 __mark_check_format(format, ## args) 87 __mark_check_format(format, ## args)
85static inline void marker_update_probe_range(struct marker *begin, 88static inline void marker_update_probe_range(struct marker *begin,
86 struct marker *end) 89 struct marker *end)
@@ -88,15 +91,30 @@ static inline void marker_update_probe_range(struct marker *begin,
88#endif /* CONFIG_MARKERS */ 91#endif /* CONFIG_MARKERS */
89 92
90/** 93/**
91 * trace_mark - Marker 94 * trace_mark - Marker using code patching
92 * @name: marker name, not quoted. 95 * @name: marker name, not quoted.
93 * @format: format string 96 * @format: format string
94 * @args...: variable argument list 97 * @args...: variable argument list
95 * 98 *
96 * Places a marker. 99 * Places a marker using optimized code patching technique (imv_read())
100 * to be enabled when immediate values are present.
97 */ 101 */
98#define trace_mark(name, format, args...) \ 102#define trace_mark(name, format, args...) \
99 __trace_mark(name, NULL, format, ## args) 103 __trace_mark(0, name, NULL, format, ## args)
104
105/**
106 * _trace_mark - Marker using variable read
107 * @name: marker name, not quoted.
108 * @format: format string
109 * @args...: variable argument list
110 *
111 * Places a marker using a standard memory read (_imv_read()) to be
112 * enabled. Should be used for markers in code paths where instruction
113 * modification based enabling is not welcome. (__init and __exit functions,
114 * lockdep, some traps, printk).
115 */
116#define _trace_mark(name, format, args...) \
117 __trace_mark(1, name, NULL, format, ## args)
100 118
101/** 119/**
102 * MARK_NOARGS - Format string for a marker with no argument. 120 * MARK_NOARGS - Format string for a marker with no argument.
@@ -117,9 +135,9 @@ static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...)
117extern marker_probe_func __mark_empty_function; 135extern marker_probe_func __mark_empty_function;
118 136
119extern void marker_probe_cb(const struct marker *mdata, 137extern void marker_probe_cb(const struct marker *mdata,
120 void *call_private, const char *fmt, ...); 138 void *call_private, ...);
121extern void marker_probe_cb_noarg(const struct marker *mdata, 139extern void marker_probe_cb_noarg(const struct marker *mdata,
122 void *call_private, const char *fmt, ...); 140 void *call_private, ...);
123 141
124/* 142/*
125 * Connect a probe to a marker. 143 * Connect a probe to a marker.
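At a call site the split into trace_mark() and _trace_mark() looks as follows; the marker names below are invented, while the scheduler hunks later in this patch use the plain trace_mark() form for real:

#include <linux/marker.h>

static void example_markers(int value)
{
	/* ordinary marker: may be enabled via code patching */
	trace_mark(example_event, "value %d", value);

	/* variable-read marker: for __init/__exit, lockdep, printk and
	 * similar paths where patching the instruction stream is not
	 * welcome */
	_trace_mark(example_event_safe, "value %d", value);
}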
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
new file mode 100644
index 000000000000..61d19e1b7a0b
--- /dev/null
+++ b/include/linux/mmiotrace.h
@@ -0,0 +1,85 @@
1#ifndef MMIOTRACE_H
2#define MMIOTRACE_H
3
4#include <linux/types.h>
5#include <linux/list.h>
6
7struct kmmio_probe;
8struct pt_regs;
9
10typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *,
11 struct pt_regs *, unsigned long addr);
12typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
13 unsigned long condition, struct pt_regs *);
14
15struct kmmio_probe {
16 struct list_head list; /* kmmio internal list */
17 unsigned long addr; /* start location of the probe point */
18 unsigned long len; /* length of the probe region */
19 kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */
20 kmmio_post_handler_t post_handler; /* Called after addr is executed */
21 void *private;
22};
23
24/* Is kmmio active, i.e. are any kmmio_probes registered? */
25static inline int is_kmmio_active(void)
26{
27 extern unsigned int kmmio_count;
28 return kmmio_count;
29}
30
31extern int register_kmmio_probe(struct kmmio_probe *p);
32extern void unregister_kmmio_probe(struct kmmio_probe *p);
33
34/* Called from page fault handler. */
35extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
36
37/* Called from ioremap.c */
38#ifdef CONFIG_MMIOTRACE
39extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
40 void __iomem *addr);
41extern void mmiotrace_iounmap(volatile void __iomem *addr);
42#else
43static inline void mmiotrace_ioremap(resource_size_t offset,
44 unsigned long size, void __iomem *addr)
45{
46}
47
48static inline void mmiotrace_iounmap(volatile void __iomem *addr)
49{
50}
51#endif /* CONFIG_MMIOTRACE */
52
53enum mm_io_opcode {
54 MMIO_READ = 0x1, /* struct mmiotrace_rw */
55 MMIO_WRITE = 0x2, /* struct mmiotrace_rw */
56 MMIO_PROBE = 0x3, /* struct mmiotrace_map */
57 MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */
58 MMIO_MARKER = 0x5, /* raw char data */
59 MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */
60};
61
62struct mmiotrace_rw {
63 resource_size_t phys; /* PCI address of register */
64 unsigned long value;
65 unsigned long pc; /* optional program counter */
66 int map_id;
67 unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
68 unsigned char width; /* size of register access in bytes */
69};
70
71struct mmiotrace_map {
72 resource_size_t phys; /* base address in PCI space */
73 unsigned long virt; /* base virtual address */
74 unsigned long len; /* mapping size */
75 int map_id;
76 unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */
77};
78
79/* in kernel/trace/trace_mmiotrace.c */
80extern void enable_mmiotrace(void);
81extern void disable_mmiotrace(void);
82extern void mmio_trace_rw(struct mmiotrace_rw *rw);
83extern void mmio_trace_mapping(struct mmiotrace_map *map);
84
85#endif /* MMIOTRACE_H */
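A hedged sketch of the kmmio side of this header: watch one page of an ioremap()ed window. All my_* names are invented and not part of the patch; the handlers run from the page-fault path and must not fault or sleep.

#include <linux/mm.h>
#include <linux/mmiotrace.h>

static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
		   unsigned long addr)
{
	/* called before the faulting MMIO access is single-stepped */
}

static void my_post(struct kmmio_probe *p, unsigned long condition,
		    struct pt_regs *regs)
{
	/* called once the access has completed */
}

static struct kmmio_probe my_probe = {
	.len		= PAGE_SIZE,
	.pre_handler	= my_pre,
	.post_handler	= my_post,
};

static int example_watch(void __iomem *base)
{
	my_probe.addr = (unsigned long)base;
	return register_kmmio_probe(&my_probe);
}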
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 23f0c54175cd..72b1a10a59b6 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,7 +10,7 @@
10#include <linux/linkage.h> 10#include <linux/linkage.h>
11#include <linux/list.h> 11#include <linux/list.h>
12 12
13#ifdef CONFIG_DEBUG_PREEMPT 13#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
14 extern void add_preempt_count(int val); 14 extern void add_preempt_count(int val);
15 extern void sub_preempt_count(int val); 15 extern void sub_preempt_count(int val);
16#else 16#else
@@ -52,6 +52,34 @@ do { \
52 preempt_check_resched(); \ 52 preempt_check_resched(); \
53} while (0) 53} while (0)
54 54
55/* For debugging and tracer internals only! */
56#define add_preempt_count_notrace(val) \
57 do { preempt_count() += (val); } while (0)
58#define sub_preempt_count_notrace(val) \
59 do { preempt_count() -= (val); } while (0)
60#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
61#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
62
63#define preempt_disable_notrace() \
64do { \
65 inc_preempt_count_notrace(); \
66 barrier(); \
67} while (0)
68
69#define preempt_enable_no_resched_notrace() \
70do { \
71 barrier(); \
72 dec_preempt_count_notrace(); \
73} while (0)
74
75/* preempt_check_resched is OK to trace */
76#define preempt_enable_notrace() \
77do { \
78 preempt_enable_no_resched_notrace(); \
79 barrier(); \
80 preempt_check_resched(); \
81} while (0)
82
55#else 83#else
56 84
57#define preempt_disable() do { } while (0) 85#define preempt_disable() do { } while (0)
@@ -59,6 +87,10 @@ do { \
59#define preempt_enable() do { } while (0) 87#define preempt_enable() do { } while (0)
60#define preempt_check_resched() do { } while (0) 88#define preempt_check_resched() do { } while (0)
61 89
90#define preempt_disable_notrace() do { } while (0)
91#define preempt_enable_no_resched_notrace() do { } while (0)
92#define preempt_enable_notrace() do { } while (0)
93
62#endif 94#endif
63 95
64#ifdef CONFIG_PREEMPT_NOTIFIERS 96#ifdef CONFIG_PREEMPT_NOTIFIERS
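The _notrace variants exist so that tracer-internal code can disable preemption without recursing into the (now instrumented) preempt accounting; ftrace_record_ip() in the new kernel/trace/ftrace.c below relies on this pattern. A minimal sketch:

#include <linux/preempt.h>

static void tracer_internal_work(void)
{
	preempt_disable_notrace();
	/* ... touch per-cpu tracer state without re-entering the
	 * add/sub_preempt_count tracing hooks ... */
	preempt_enable_notrace();
}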
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5d3f847ca8d..aa609858aef0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -246,6 +246,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
246extern void init_idle(struct task_struct *idle, int cpu); 246extern void init_idle(struct task_struct *idle, int cpu);
247extern void init_idle_bootup_task(struct task_struct *idle); 247extern void init_idle_bootup_task(struct task_struct *idle);
248 248
249extern int runqueue_is_locked(void);
250
249extern cpumask_t nohz_cpu_mask; 251extern cpumask_t nohz_cpu_mask;
250#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 252#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
251extern int select_nohz_load_balancer(int cpu); 253extern int select_nohz_load_balancer(int cpu);
@@ -2131,6 +2133,18 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
2131} 2133}
2132#endif 2134#endif
2133 2135
2136#ifdef CONFIG_TRACING
2137extern void
2138__trace_special(void *__tr, void *__data,
2139 unsigned long arg1, unsigned long arg2, unsigned long arg3);
2140#else
2141static inline void
2142__trace_special(void *__tr, void *__data,
2143 unsigned long arg1, unsigned long arg2, unsigned long arg3)
2144{
2145}
2146#endif
2147
2134extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); 2148extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
2135extern long sched_getaffinity(pid_t pid, cpumask_t *mask); 2149extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
2136 2150
@@ -2225,6 +2239,8 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
2225} 2239}
2226#endif /* CONFIG_MM_OWNER */ 2240#endif /* CONFIG_MM_OWNER */
2227 2241
2242#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
2243
2228#endif /* __KERNEL__ */ 2244#endif /* __KERNEL__ */
2229 2245
2230#endif 2246#endif
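runqueue_is_locked() is exported so callers in scheduler context can tell whether a wakeup (for example of the log daemon) is safe on this CPU. A hedged sketch with an invented helper name:

#include <linux/sched.h>

static int wakeup_is_safe_now(void)
{
	/* Waking a task while this cpu's runqueue lock is held could
	 * deadlock; defer the wakeup in that case. */
	return !runqueue_is_locked();
}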
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f462439cc288..bd91987c065f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable;
105extern int block_dump; 105extern int block_dump;
106extern int laptop_mode; 106extern int laptop_mode;
107 107
108extern unsigned long determine_dirtyable_memory(void);
109
108extern int dirty_ratio_handler(struct ctl_table *table, int write, 110extern int dirty_ratio_handler(struct ctl_table *table, int write,
109 struct file *filp, void __user *buffer, size_t *lenp, 111 struct file *filp, void __user *buffer, size_t *lenp,
110 loff_t *ppos); 112 loff_t *ppos);
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..ca2433e84873 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,18 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14CFLAGS_REMOVE_sched.o = -pg -mno-spe
15
16ifdef CONFIG_FTRACE
17# Do not trace debug files and internal ftrace files
18CFLAGS_REMOVE_lockdep.o = -pg
19CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24endif
25
14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 26obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15obj-$(CONFIG_STACKTRACE) += stacktrace.o 27obj-$(CONFIG_STACKTRACE) += stacktrace.o
16obj-y += time/ 28obj-y += time/
@@ -69,6 +81,8 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 81obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
70obj-$(CONFIG_MARKERS) += marker.o 82obj-$(CONFIG_MARKERS) += marker.o
71obj-$(CONFIG_LATENCYTOP) += latencytop.o 83obj-$(CONFIG_LATENCYTOP) += latencytop.o
84obj-$(CONFIG_FTRACE) += trace/
85obj-$(CONFIG_TRACING) += trace/
72 86
73ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 87ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
74# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 88# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..d66d676dc362 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
909 909
910 rt_mutex_init_task(p); 910 rt_mutex_init_task(p);
911 911
912#ifdef CONFIG_TRACE_IRQFLAGS 912#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
913 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 913 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
914 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 914 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
915#endif 915#endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..65548eff029e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -39,6 +39,7 @@
39#include <linux/irqflags.h> 39#include <linux/irqflags.h>
40#include <linux/utsname.h> 40#include <linux/utsname.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/sections.h> 44#include <asm/sections.h>
44 45
@@ -81,6 +82,8 @@ static int graph_lock(void)
81 __raw_spin_unlock(&lockdep_lock); 82 __raw_spin_unlock(&lockdep_lock);
82 return 0; 83 return 0;
83 } 84 }
85 /* prevent any recursions within lockdep from causing deadlocks */
86 current->lockdep_recursion++;
84 return 1; 87 return 1;
85} 88}
86 89
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
89 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 92 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
90 return DEBUG_LOCKS_WARN_ON(1); 93 return DEBUG_LOCKS_WARN_ON(1);
91 94
95 current->lockdep_recursion--;
92 __raw_spin_unlock(&lockdep_lock); 96 __raw_spin_unlock(&lockdep_lock);
93 return 0; 97 return 0;
94} 98}
@@ -982,7 +986,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
982 return 1; 986 return 1;
983} 987}
984 988
985#ifdef CONFIG_TRACE_IRQFLAGS 989#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
986/* 990/*
987 * Forwards and backwards subgraph searching, for the purposes of 991 * Forwards and backwards subgraph searching, for the purposes of
988 * proving that two subgraphs can be connected by a new dependency 992 * proving that two subgraphs can be connected by a new dependency
@@ -1680,7 +1684,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
1680static int mark_lock(struct task_struct *curr, struct held_lock *this, 1684static int mark_lock(struct task_struct *curr, struct held_lock *this,
1681 enum lock_usage_bit new_bit); 1685 enum lock_usage_bit new_bit);
1682 1686
1683#ifdef CONFIG_TRACE_IRQFLAGS 1687#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1684 1688
1685/* 1689/*
1686 * print irq inversion bug: 1690 * print irq inversion bug:
@@ -2013,11 +2017,13 @@ void early_boot_irqs_on(void)
2013/* 2017/*
2014 * Hardirqs will be enabled: 2018 * Hardirqs will be enabled:
2015 */ 2019 */
2016void trace_hardirqs_on(void) 2020void trace_hardirqs_on_caller(unsigned long a0)
2017{ 2021{
2018 struct task_struct *curr = current; 2022 struct task_struct *curr = current;
2019 unsigned long ip; 2023 unsigned long ip;
2020 2024
2025 time_hardirqs_on(CALLER_ADDR0, a0);
2026
2021 if (unlikely(!debug_locks || current->lockdep_recursion)) 2027 if (unlikely(!debug_locks || current->lockdep_recursion))
2022 return; 2028 return;
2023 2029
@@ -2055,16 +2061,23 @@ void trace_hardirqs_on(void)
2055 curr->hardirq_enable_event = ++curr->irq_events; 2061 curr->hardirq_enable_event = ++curr->irq_events;
2056 debug_atomic_inc(&hardirqs_on_events); 2062 debug_atomic_inc(&hardirqs_on_events);
2057} 2063}
2064EXPORT_SYMBOL(trace_hardirqs_on_caller);
2058 2065
2066void trace_hardirqs_on(void)
2067{
2068 trace_hardirqs_on_caller(CALLER_ADDR0);
2069}
2059EXPORT_SYMBOL(trace_hardirqs_on); 2070EXPORT_SYMBOL(trace_hardirqs_on);
2060 2071
2061/* 2072/*
2062 * Hardirqs were disabled: 2073 * Hardirqs were disabled:
2063 */ 2074 */
2064void trace_hardirqs_off(void) 2075void trace_hardirqs_off_caller(unsigned long a0)
2065{ 2076{
2066 struct task_struct *curr = current; 2077 struct task_struct *curr = current;
2067 2078
2079 time_hardirqs_off(CALLER_ADDR0, a0);
2080
2068 if (unlikely(!debug_locks || current->lockdep_recursion)) 2081 if (unlikely(!debug_locks || current->lockdep_recursion))
2069 return; 2082 return;
2070 2083
@@ -2082,7 +2095,12 @@ void trace_hardirqs_off(void)
2082 } else 2095 } else
2083 debug_atomic_inc(&redundant_hardirqs_off); 2096 debug_atomic_inc(&redundant_hardirqs_off);
2084} 2097}
2098EXPORT_SYMBOL(trace_hardirqs_off_caller);
2085 2099
2100void trace_hardirqs_off(void)
2101{
2102 trace_hardirqs_off_caller(CALLER_ADDR0);
2103}
2086EXPORT_SYMBOL(trace_hardirqs_off); 2104EXPORT_SYMBOL(trace_hardirqs_off);
2087 2105
2088/* 2106/*
@@ -2246,7 +2264,7 @@ static inline int separate_irq_context(struct task_struct *curr,
2246 * Mark a lock with a usage bit, and validate the state transition: 2264 * Mark a lock with a usage bit, and validate the state transition:
2247 */ 2265 */
2248static int mark_lock(struct task_struct *curr, struct held_lock *this, 2266static int mark_lock(struct task_struct *curr, struct held_lock *this,
2249 enum lock_usage_bit new_bit) 2267 enum lock_usage_bit new_bit)
2250{ 2268{
2251 unsigned int new_mask = 1 << new_bit, ret = 1; 2269 unsigned int new_mask = 1 << new_bit, ret = 1;
2252 2270
@@ -2686,7 +2704,7 @@ static void check_flags(unsigned long flags)
2686 * and also avoid lockdep recursion: 2704 * and also avoid lockdep recursion:
2687 */ 2705 */
2688void lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2706void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2689 int trylock, int read, int check, unsigned long ip) 2707 int trylock, int read, int check, unsigned long ip)
2690{ 2708{
2691 unsigned long flags; 2709 unsigned long flags;
2692 2710
@@ -2708,7 +2726,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2708 2726
2709EXPORT_SYMBOL_GPL(lock_acquire); 2727EXPORT_SYMBOL_GPL(lock_acquire);
2710 2728
2711void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) 2729void lock_release(struct lockdep_map *lock, int nested,
2730 unsigned long ip)
2712{ 2731{
2713 unsigned long flags; 2732 unsigned long flags;
2714 2733
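The point of splitting out trace_hardirqs_on_caller()/trace_hardirqs_off_caller() is that a caller which already knows the interesting return address (such as the x86 thunks) can pass it through, so both lockdep and the irqsoff tracer attribute the event to the real call site. An illustrative sketch only; the extern declaration is needed here because no header in this patch declares the _caller variants:

#include <linux/ftrace.h>
#include <linux/irqflags.h>

extern void trace_hardirqs_on_caller(unsigned long caller_addr);

static void example_irq_enable_path(void)
{
	/* report our own caller rather than this helper */
	trace_hardirqs_on_caller(CALLER_ADDR0);
	raw_local_irq_enable();
}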
diff --git a/kernel/marker.c b/kernel/marker.c
index b5a9fe1d50d5..1abfb923b761 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
55struct marker_entry { 55struct marker_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 char *format; 57 char *format;
58 void (*call)(const struct marker *mdata, /* Probe wrapper */ 58 /* Probe wrapper */
59 void *call_private, const char *fmt, ...); 59 void (*call)(const struct marker *mdata, void *call_private, ...);
60 struct marker_probe_closure single; 60 struct marker_probe_closure single;
61 struct marker_probe_closure *multi; 61 struct marker_probe_closure *multi;
62 int refcount; /* Number of times armed. 0 if disarmed. */ 62 int refcount; /* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
91 * marker_probe_cb Callback that prepares the variable argument list for probes. 91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker 92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data 93 * @call_private: caller site private data
94 * @fmt: format string
95 * @...: Variable argument list. 94 * @...: Variable argument list.
96 * 95 *
97 * Since we do not use "typical" pointer based RCU in the 1 argument case, we 96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
98 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
99 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
100 */ 99 */
101void marker_probe_cb(const struct marker *mdata, void *call_private, 100void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
102 const char *fmt, ...)
103{ 101{
104 va_list args; 102 va_list args;
105 char ptype; 103 char ptype;
@@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
120 /* Must read the ptr before private data. They are not data 118 /* Must read the ptr before private data. They are not data
121 * dependant, so we put an explicit smp_rmb() here. */ 119 * dependant, so we put an explicit smp_rmb() here. */
122 smp_rmb(); 120 smp_rmb();
123 va_start(args, fmt); 121 va_start(args, call_private);
124 func(mdata->single.probe_private, call_private, fmt, &args); 122 func(mdata->single.probe_private, call_private, mdata->format,
123 &args);
125 va_end(args); 124 va_end(args);
126 } else { 125 } else {
127 struct marker_probe_closure *multi; 126 struct marker_probe_closure *multi;
@@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
136 smp_read_barrier_depends(); 135 smp_read_barrier_depends();
137 multi = mdata->multi; 136 multi = mdata->multi;
138 for (i = 0; multi[i].func; i++) { 137 for (i = 0; multi[i].func; i++) {
139 va_start(args, fmt); 138 va_start(args, call_private);
140 multi[i].func(multi[i].probe_private, call_private, fmt, 139 multi[i].func(multi[i].probe_private, call_private,
141 &args); 140 mdata->format, &args);
142 va_end(args); 141 va_end(args);
143 } 142 }
144 } 143 }
@@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
150 * marker_probe_cb Callback that does not prepare the variable argument list. 149 * marker_probe_cb Callback that does not prepare the variable argument list.
151 * @mdata: pointer of type struct marker 150 * @mdata: pointer of type struct marker
152 * @call_private: caller site private data 151 * @call_private: caller site private data
153 * @fmt: format string
154 * @...: Variable argument list. 152 * @...: Variable argument list.
155 * 153 *
156 * Should be connected to markers "MARK_NOARGS". 154 * Should be connected to markers "MARK_NOARGS".
157 */ 155 */
158void marker_probe_cb_noarg(const struct marker *mdata, 156void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
159 void *call_private, const char *fmt, ...)
160{ 157{
161 va_list args; /* not initialized */ 158 va_list args; /* not initialized */
162 char ptype; 159 char ptype;
@@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
172 /* Must read the ptr before private data. They are not data 169 /* Must read the ptr before private data. They are not data
173 * dependant, so we put an explicit smp_rmb() here. */ 170 * dependant, so we put an explicit smp_rmb() here. */
174 smp_rmb(); 171 smp_rmb();
175 func(mdata->single.probe_private, call_private, fmt, &args); 172 func(mdata->single.probe_private, call_private, mdata->format,
173 &args);
176 } else { 174 } else {
177 struct marker_probe_closure *multi; 175 struct marker_probe_closure *multi;
178 int i; 176 int i;
@@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
186 smp_read_barrier_depends(); 184 smp_read_barrier_depends();
187 multi = mdata->multi; 185 multi = mdata->multi;
188 for (i = 0; multi[i].func; i++) 186 for (i = 0; multi[i].func; i++)
189 multi[i].func(multi[i].probe_private, call_private, fmt, 187 multi[i].func(multi[i].probe_private, call_private,
190 &args); 188 mdata->format, &args);
191 } 189 }
192 preempt_enable(); 190 preempt_enable();
193} 191}
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..ae7d5b9e535d 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1041,7 +1041,9 @@ void release_console_sem(void)
1041 _log_end = log_end; 1041 _log_end = log_end;
1042 con_start = log_end; /* Flush */ 1042 con_start = log_end; /* Flush */
1043 spin_unlock(&logbuf_lock); 1043 spin_unlock(&logbuf_lock);
1044 stop_critical_timings(); /* don't trace print latency */
1044 call_console_drivers(_con_start, _log_end); 1045 call_console_drivers(_con_start, _log_end);
1046 start_critical_timings();
1045 local_irq_restore(flags); 1047 local_irq_restore(flags);
1046 } 1048 }
1047 console_locked = 0; 1049 console_locked = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index eaf6751e7612..c994d12abbf6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,6 +70,7 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
@@ -607,6 +608,24 @@ static inline void update_rq_clock(struct rq *rq)
607# define const_debug static const 608# define const_debug static const
608#endif 609#endif
609 610
611/**
612 * runqueue_is_locked
613 *
614 * Returns true if the current cpu runqueue is locked.
615 * This interface allows printk to be called with the runqueue lock
616 * held and know whether or not it is OK to wake up the klogd.
617 */
618int runqueue_is_locked(void)
619{
620 int cpu = get_cpu();
621 struct rq *rq = cpu_rq(cpu);
622 int ret;
623
624 ret = spin_is_locked(&rq->lock);
625 put_cpu();
626 return ret;
627}
628
610/* 629/*
611 * Debugging: various feature bits 630 * Debugging: various feature bits
612 */ 631 */
@@ -2147,6 +2166,9 @@ out_activate:
2147 success = 1; 2166 success = 1;
2148 2167
2149out_running: 2168out_running:
2169 trace_mark(kernel_sched_wakeup,
2170 "pid %d state %ld ## rq %p task %p rq->curr %p",
2171 p->pid, p->state, rq, p, rq->curr);
2150 check_preempt_curr(rq, p); 2172 check_preempt_curr(rq, p);
2151 2173
2152 p->state = TASK_RUNNING; 2174 p->state = TASK_RUNNING;
@@ -2277,6 +2299,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2277 p->sched_class->task_new(rq, p); 2299 p->sched_class->task_new(rq, p);
2278 inc_nr_running(p, rq); 2300 inc_nr_running(p, rq);
2279 } 2301 }
2302 trace_mark(kernel_sched_wakeup_new,
2303 "pid %d state %ld ## rq %p task %p rq->curr %p",
2304 p->pid, p->state, rq, p, rq->curr);
2280 check_preempt_curr(rq, p); 2305 check_preempt_curr(rq, p);
2281#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
2282 if (p->sched_class->task_wake_up) 2307 if (p->sched_class->task_wake_up)
@@ -2449,6 +2474,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2449 struct mm_struct *mm, *oldmm; 2474 struct mm_struct *mm, *oldmm;
2450 2475
2451 prepare_task_switch(rq, prev, next); 2476 prepare_task_switch(rq, prev, next);
2477 trace_mark(kernel_sched_schedule,
2478 "prev_pid %d next_pid %d prev_state %ld "
2479 "## rq %p prev %p next %p",
2480 prev->pid, next->pid, prev->state,
2481 rq, prev, next);
2452 mm = next->mm; 2482 mm = next->mm;
2453 oldmm = prev->active_mm; 2483 oldmm = prev->active_mm;
2454 /* 2484 /*
@@ -4019,26 +4049,44 @@ void scheduler_tick(void)
4019#endif 4049#endif
4020} 4050}
4021 4051
4022#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4052#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4053 defined(CONFIG_PREEMPT_TRACER))
4054
4055static inline unsigned long get_parent_ip(unsigned long addr)
4056{
4057 if (in_lock_functions(addr)) {
4058 addr = CALLER_ADDR2;
4059 if (in_lock_functions(addr))
4060 addr = CALLER_ADDR3;
4061 }
4062 return addr;
4063}
4023 4064
4024void __kprobes add_preempt_count(int val) 4065void __kprobes add_preempt_count(int val)
4025{ 4066{
4067#ifdef CONFIG_DEBUG_PREEMPT
4026 /* 4068 /*
4027 * Underflow? 4069 * Underflow?
4028 */ 4070 */
4029 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4071 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4030 return; 4072 return;
4073#endif
4031 preempt_count() += val; 4074 preempt_count() += val;
4075#ifdef CONFIG_DEBUG_PREEMPT
4032 /* 4076 /*
4033 * Spinlock count overflowing soon? 4077 * Spinlock count overflowing soon?
4034 */ 4078 */
4035 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4079 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4036 PREEMPT_MASK - 10); 4080 PREEMPT_MASK - 10);
4081#endif
4082 if (preempt_count() == val)
4083 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4037} 4084}
4038EXPORT_SYMBOL(add_preempt_count); 4085EXPORT_SYMBOL(add_preempt_count);
4039 4086
4040void __kprobes sub_preempt_count(int val) 4087void __kprobes sub_preempt_count(int val)
4041{ 4088{
4089#ifdef CONFIG_DEBUG_PREEMPT
4042 /* 4090 /*
4043 * Underflow? 4091 * Underflow?
4044 */ 4092 */
@@ -4050,7 +4098,10 @@ void __kprobes sub_preempt_count(int val)
4050 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4098 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4051 !(preempt_count() & PREEMPT_MASK))) 4099 !(preempt_count() & PREEMPT_MASK)))
4052 return; 4100 return;
4101#endif
4053 4102
4103 if (preempt_count() == val)
4104 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4054 preempt_count() -= val; 4105 preempt_count() -= val;
4055} 4106}
4056EXPORT_SYMBOL(sub_preempt_count); 4107EXPORT_SYMBOL(sub_preempt_count);
@@ -5384,7 +5435,7 @@ out_unlock:
5384 return retval; 5435 return retval;
5385} 5436}
5386 5437
5387static const char stat_nam[] = "RSDTtZX"; 5438static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5388 5439
5389void sched_show_task(struct task_struct *p) 5440void sched_show_task(struct task_struct *p)
5390{ 5441{
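The consumer side of the new scheduler markers is a marker probe, roughly as the new trace_sched_switch.c does it. A hedged sketch using the standard marker API of this kernel; the probe body and pr_debug output are purely illustrative:

#include <linux/kernel.h>
#include <linux/marker.h>

static void probe_sched_switch(void *probe_private, void *call_private,
			       const char *format, va_list *args)
{
	int prev_pid = va_arg(*args, int);
	int next_pid = va_arg(*args, int);

	/* prev_state, rq, prev and next follow in the argument list */
	pr_debug("context switch %d -> %d\n", prev_pid, next_pid);
}

static int example_attach(void)
{
	return marker_probe_register("kernel_sched_schedule",
			"prev_pid %d next_pid %d prev_state %ld "
			"## rq %p prev %p next %p",
			probe_sched_switch, NULL);
}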
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..1a064adab658 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -31,6 +31,7 @@
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
34#include <linux/ftrace.h>
34 35
35static noinline void __down(struct semaphore *sem); 36static noinline void __down(struct semaphore *sem);
36static noinline int __down_interruptible(struct semaphore *sem); 37static noinline int __down_interruptible(struct semaphore *sem);
@@ -53,6 +54,7 @@ void down(struct semaphore *sem)
53{ 54{
54 unsigned long flags; 55 unsigned long flags;
55 56
57 ftrace_special(sem->count, 0, __LINE__);
56 spin_lock_irqsave(&sem->lock, flags); 58 spin_lock_irqsave(&sem->lock, flags);
57 if (likely(sem->count > 0)) 59 if (likely(sem->count > 0))
58 sem->count--; 60 sem->count--;
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index ae28c8245123..a1fb54c93cdd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
436} 436}
437EXPORT_SYMBOL(_spin_trylock_bh); 437EXPORT_SYMBOL(_spin_trylock_bh);
438 438
439int in_lock_functions(unsigned long addr) 439notrace int in_lock_functions(unsigned long addr)
440{ 440{
441 /* Linker adds these: start and end of __lockfunc functions */ 441 /* Linker adds these: start and end of __lockfunc functions */
442 extern char __lock_text_start[], __lock_text_end[]; 442 extern char __lock_text_start[], __lock_text_end[];
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..efaf7c5500e9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -46,6 +46,7 @@
46#include <linux/nfs_fs.h> 46#include <linux/nfs_fs.h>
47#include <linux/acpi.h> 47#include <linux/acpi.h>
48#include <linux/reboot.h> 48#include <linux/reboot.h>
49#include <linux/ftrace.h>
49 50
50#include <asm/uaccess.h> 51#include <asm/uaccess.h>
51#include <asm/processor.h> 52#include <asm/processor.h>
@@ -455,6 +456,16 @@ static struct ctl_table kern_table[] = {
455 .mode = 0644, 456 .mode = 0644,
456 .proc_handler = &proc_dointvec, 457 .proc_handler = &proc_dointvec,
457 }, 458 },
459#ifdef CONFIG_FTRACE
460 {
461 .ctl_name = CTL_UNNUMBERED,
462 .procname = "ftrace_enabled",
463 .data = &ftrace_enabled,
464 .maxlen = sizeof(int),
465 .mode = 0644,
466 .proc_handler = &ftrace_enable_sysctl,
467 },
468#endif
458#ifdef CONFIG_KMOD 469#ifdef CONFIG_KMOD
459 { 470 {
460 .ctl_name = KERN_MODPROBE, 471 .ctl_name = KERN_MODPROBE,
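From user space the new knob is an ordinary sysctl file; a small hedged helper, equivalent to "echo 1 > /proc/sys/kernel/ftrace_enabled":

#include <stdio.h>

static int set_ftrace_enabled(int on)
{
	FILE *f = fopen("/proc/sys/kernel/ftrace_enabled", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", on ? 1 : 0);
	return fclose(f);
}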
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..5c2295b29f2c
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,127 @@
1#
2# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
3#
4config HAVE_FTRACE
5 bool
6
7config HAVE_DYNAMIC_FTRACE
8 bool
9
10config TRACER_MAX_TRACE
11 bool
12
13config TRACING
14 bool
15 select DEBUG_FS
16 select STACKTRACE
17
18config FTRACE
19 bool "Kernel Function Tracer"
20 depends on HAVE_FTRACE
21 select FRAME_POINTER
22 select TRACING
23 select CONTEXT_SWITCH_TRACER
24 help
25 Enable the kernel to trace every kernel function. This is done
26 by using a compiler feature to insert a small, 5-byte No-Operation
27 instruction at the beginning of every kernel function; this NOP
28 sequence is then dynamically patched into a tracer call when
29 tracing is enabled by the administrator. If it is disabled at runtime
30 (the bootup default), then the overhead of the instructions is very
31 small and not measurable even in micro-benchmarks.
32
33config IRQSOFF_TRACER
34 bool "Interrupts-off Latency Tracer"
35 default n
36 depends on TRACE_IRQFLAGS_SUPPORT
37 depends on GENERIC_TIME
38 depends on HAVE_FTRACE
39 select TRACE_IRQFLAGS
40 select TRACING
41 select TRACER_MAX_TRACE
42 help
43 This option measures the time spent in irqs-off critical
44 sections, with microsecond accuracy.
45
46 The default measurement method is a maximum search, which is
47 disabled by default and can be runtime (re-)started
48 via:
49
50 echo 0 > /debugfs/tracing/tracing_max_latency
51
52 (Note that kernel size and overhead increase with this option
53 enabled. This option and the preempt-off timing option can be
54 used together or separately.)
55
56config PREEMPT_TRACER
57 bool "Preemption-off Latency Tracer"
58 default n
59 depends on GENERIC_TIME
60 depends on PREEMPT
61 depends on HAVE_FTRACE
62 select TRACING
63 select TRACER_MAX_TRACE
64 help
65 This option measures the time spent in preemption off critical
66 sections, with microsecond accuracy.
67
68 The default measurement method is a maximum search, which is
69 disabled by default and can be runtime (re-)started
70 via:
71
72 echo 0 > /debugfs/tracing/tracing_max_latency
73
74 (Note that kernel size and overhead increase with this option
75 enabled. This option and the irqs-off timing option can be
76 used together or separately.)
77
78config SCHED_TRACER
79 bool "Scheduling Latency Tracer"
80 depends on HAVE_FTRACE
81 select TRACING
82 select CONTEXT_SWITCH_TRACER
83 select TRACER_MAX_TRACE
84 help
85 This tracer tracks the latency of the highest priority task
86 to be scheduled in, starting from the point it has woken up.
87
88config CONTEXT_SWITCH_TRACER
89 bool "Trace process context switches"
90 depends on HAVE_FTRACE
91 select TRACING
92 select MARKERS
93 help
94 This tracer gets called from the context switch and records
95 all switching of tasks.
96
97config DYNAMIC_FTRACE
98 bool "enable/disable ftrace tracepoints dynamically"
99 depends on FTRACE
100 depends on HAVE_DYNAMIC_FTRACE
101 default y
102 help
103 This option will modify all the calls to ftrace dynamically
104 (will patch them out of the binary image and replace them
105 with a No-Op instruction) as they are called. A table is
106 created to dynamically enable them again.
107
108 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
109 has native performance as long as no tracing is active.
110
111 The changes to the code are done by a kernel thread that
112 wakes up once a second and checks to see if any ftrace calls
113 were made. If so, it runs stop_machine (stops all CPUs)
114 and modifies the code to jump over the call to ftrace.
115
116config FTRACE_SELFTEST
117 bool
118
119config FTRACE_STARTUP_TEST
120 bool "Perform a startup test on ftrace"
121 depends on TRACING
122 select FTRACE_SELFTEST
123 help
124 This option performs a series of startup tests on ftrace. On bootup
125 a series of tests are made to verify that the tracer is
126 functioning properly. It will do tests on all the configured
127 tracers of ftrace.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..c44a7dce9086
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,23 @@
1
2# Do not instrument the tracer itself:
3
4ifdef CONFIG_FTRACE
5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7
8# selftest needs instrumentation
9CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o
11endif
12
13obj-$(CONFIG_FTRACE) += libftrace.o
14
15obj-$(CONFIG_TRACING) += trace.o
16obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
17obj-$(CONFIG_FTRACE) += trace_functions.o
18obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
19obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
20obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
21obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
22
23libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..89bd9a6f52ec
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,1398 @@
1/*
2 * Infrastructure for profiling code inserted by 'gcc -pg'.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Originally ported from the -rt patch by:
8 * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
9 *
10 * Based on code in the latency_tracer, that is:
11 *
12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III
14 */
15
16#include <linux/stop_machine.h>
17#include <linux/clocksource.h>
18#include <linux/kallsyms.h>
19#include <linux/seq_file.h>
20#include <linux/debugfs.h>
21#include <linux/hardirq.h>
22#include <linux/kthread.h>
23#include <linux/uaccess.h>
24#include <linux/ftrace.h>
25#include <linux/sysctl.h>
26#include <linux/ctype.h>
27#include <linux/hash.h>
28#include <linux/list.h>
29
30#include "trace.h"
31
32/* ftrace_enabled is a method to turn ftrace on or off */
33int ftrace_enabled __read_mostly;
34static int last_ftrace_enabled;
35
36/*
37 * ftrace_disabled is set when an anomaly is discovered.
38 * ftrace_disabled is much stronger than ftrace_enabled.
39 */
40static int ftrace_disabled __read_mostly;
41
42static DEFINE_SPINLOCK(ftrace_lock);
43static DEFINE_MUTEX(ftrace_sysctl_lock);
44
45static struct ftrace_ops ftrace_list_end __read_mostly =
46{
47 .func = ftrace_stub,
48};
49
50static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
51ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
52
53void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
54{
55 struct ftrace_ops *op = ftrace_list;
56
57 /* in case someone actually ports this to alpha! */
58 read_barrier_depends();
59
60 while (op != &ftrace_list_end) {
61 /* silly alpha */
62 read_barrier_depends();
63 op->func(ip, parent_ip);
64 op = op->next;
65 };
66}
67
68/**
69 * clear_ftrace_function - reset the ftrace function
70 *
71 * This NULLs the ftrace function and in essence stops
72 * tracing. There may be lag
73 */
74void clear_ftrace_function(void)
75{
76 ftrace_trace_function = ftrace_stub;
77}
78
79static int __register_ftrace_function(struct ftrace_ops *ops)
80{
81 /* Should never be called by interrupts */
82 spin_lock(&ftrace_lock);
83
84 ops->next = ftrace_list;
85 /*
86 * We are entering ops into the ftrace_list but another
87 * CPU might be walking that list. We need to make sure
88 * the ops->next pointer is valid before another CPU sees
89 * the ops pointer included into the ftrace_list.
90 */
91 smp_wmb();
92 ftrace_list = ops;
93
94 if (ftrace_enabled) {
95 /*
96 * For one func, simply call it directly.
97 * For more than one func, call the chain.
98 */
99 if (ops->next == &ftrace_list_end)
100 ftrace_trace_function = ops->func;
101 else
102 ftrace_trace_function = ftrace_list_func;
103 }
104
105 spin_unlock(&ftrace_lock);
106
107 return 0;
108}
109
110static int __unregister_ftrace_function(struct ftrace_ops *ops)
111{
112 struct ftrace_ops **p;
113 int ret = 0;
114
115 spin_lock(&ftrace_lock);
116
117 /*
118 * If we are removing the last function, then simply point
119 * to the ftrace_stub.
120 */
121 if (ftrace_list == ops && ops->next == &ftrace_list_end) {
122 ftrace_trace_function = ftrace_stub;
123 ftrace_list = &ftrace_list_end;
124 goto out;
125 }
126
127 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
128 if (*p == ops)
129 break;
130
131 if (*p != ops) {
132 ret = -1;
133 goto out;
134 }
135
136 *p = (*p)->next;
137
138 if (ftrace_enabled) {
139 /* If we only have one func left, then call that directly */
140 if (ftrace_list == &ftrace_list_end ||
141 ftrace_list->next == &ftrace_list_end)
142 ftrace_trace_function = ftrace_list->func;
143 }
144
145 out:
146 spin_unlock(&ftrace_lock);
147
148 return ret;
149}
150
151#ifdef CONFIG_DYNAMIC_FTRACE
152
153static struct task_struct *ftraced_task;
154static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters);
155static unsigned long ftraced_iteration_counter;
156
157enum {
158 FTRACE_ENABLE_CALLS = (1 << 0),
159 FTRACE_DISABLE_CALLS = (1 << 1),
160 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
161 FTRACE_ENABLE_MCOUNT = (1 << 3),
162 FTRACE_DISABLE_MCOUNT = (1 << 4),
163};
164
165static int ftrace_filtered;
166
167static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
168
169static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
170
171static DEFINE_SPINLOCK(ftrace_shutdown_lock);
172static DEFINE_MUTEX(ftraced_lock);
173static DEFINE_MUTEX(ftrace_filter_lock);
174
175struct ftrace_page {
176 struct ftrace_page *next;
177 unsigned long index;
178 struct dyn_ftrace records[];
179};
180
181#define ENTRIES_PER_PAGE \
182 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
183
184/* estimate from running different kernels */
185#define NR_TO_INIT 10000
186
187static struct ftrace_page *ftrace_pages_start;
188static struct ftrace_page *ftrace_pages;
189
190static int ftraced_trigger;
191static int ftraced_suspend;
192
193static int ftrace_record_suspend;
194
195static struct dyn_ftrace *ftrace_free_records;
196
197static inline int
198ftrace_ip_in_hash(unsigned long ip, unsigned long key)
199{
200 struct dyn_ftrace *p;
201 struct hlist_node *t;
202 int found = 0;
203
204 hlist_for_each_entry(p, t, &ftrace_hash[key], node) {
205 if (p->ip == ip) {
206 found = 1;
207 break;
208 }
209 }
210
211 return found;
212}
213
214static inline void
215ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
216{
217 hlist_add_head(&node->node, &ftrace_hash[key]);
218}
219
220static void ftrace_free_rec(struct dyn_ftrace *rec)
221{
222 /* no locking, only called from kstop_machine */
223
224 rec->ip = (unsigned long)ftrace_free_records;
225 ftrace_free_records = rec;
226 rec->flags |= FTRACE_FL_FREE;
227}
228
229static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
230{
231 struct dyn_ftrace *rec;
232
233 /* First check for freed records */
234 if (ftrace_free_records) {
235 rec = ftrace_free_records;
236
237 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
238 WARN_ON_ONCE(1);
239 ftrace_free_records = NULL;
240 ftrace_disabled = 1;
241 ftrace_enabled = 0;
242 return NULL;
243 }
244
245 ftrace_free_records = (void *)rec->ip;
246 memset(rec, 0, sizeof(*rec));
247 return rec;
248 }
249
250 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
251 if (!ftrace_pages->next)
252 return NULL;
253 ftrace_pages = ftrace_pages->next;
254 }
255
256 return &ftrace_pages->records[ftrace_pages->index++];
257}
258
259static void
260ftrace_record_ip(unsigned long ip)
261{
262 struct dyn_ftrace *node;
263 unsigned long flags;
264 unsigned long key;
265 int resched;
266 int atomic;
267 int cpu;
268
269 if (!ftrace_enabled || ftrace_disabled)
270 return;
271
272 resched = need_resched();
273 preempt_disable_notrace();
274
275 /*
276 * We simply need to protect against recursion.
277 * Use the raw version of smp_processor_id and not
278 * __get_cpu_var which can call debug hooks that can
279 * cause a recursive crash here.
280 */
281 cpu = raw_smp_processor_id();
282 per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
283 if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
284 goto out;
285
286 if (unlikely(ftrace_record_suspend))
287 goto out;
288
289 key = hash_long(ip, FTRACE_HASHBITS);
290
291 WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
292
293 if (ftrace_ip_in_hash(ip, key))
294 goto out;
295
296 atomic = irqs_disabled();
297
298 spin_lock_irqsave(&ftrace_shutdown_lock, flags);
299
300 /* This ip may have hit the hash before the lock */
301 if (ftrace_ip_in_hash(ip, key))
302 goto out_unlock;
303
304 /*
305 * There's a slight race that the ftraced will update the
306 * hash and reset here. If it is already converted, skip it.
307 */
308 if (ftrace_ip_converted(ip))
309 goto out_unlock;
310
311 node = ftrace_alloc_dyn_node(ip);
312 if (!node)
313 goto out_unlock;
314
315 node->ip = ip;
316
317 ftrace_add_hash(node, key);
318
319 ftraced_trigger = 1;
320
321 out_unlock:
322 spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
323 out:
324 per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
325
326 /* prevent recursion with scheduler */
327 if (resched)
328 preempt_enable_no_resched_notrace();
329 else
330 preempt_enable_notrace();
331}
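
ftrace_record_ip() protects itself against recursion (the hash and allocation code it calls may themselves be traced) by bumping a per-CPU depth counter and bailing out unless it is the outermost call. A simplified, single-threaded sketch of that reentrancy guard; the names and the plain static counter are illustrative stand-ins for the per-CPU variable:

#include <stdio.h>

static int record_depth;            /* stands in for the per-CPU counter */

static void do_record(unsigned long ip);

static void record_ip(unsigned long ip)
{
	record_depth++;
	if (record_depth != 1)
		goto out;               /* already inside: avoid recursing */

	do_record(ip);
out:
	record_depth--;
}

static void do_record(unsigned long ip)
{
	printf("recording %#lx\n", ip);
	record_ip(ip + 4);              /* a nested call is silently dropped */
}

int main(void)
{
	record_ip(0x1000);
	return 0;
}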
332
333#define FTRACE_ADDR ((long)(ftrace_caller))
334#define MCOUNT_ADDR ((long)(mcount))
335
336static void
337__ftrace_replace_code(struct dyn_ftrace *rec,
338 unsigned char *old, unsigned char *new, int enable)
339{
340 unsigned long ip;
341 int failed;
342
343 ip = rec->ip;
344
345 if (ftrace_filtered && enable) {
346 unsigned long fl;
347 /*
348 * If filtering is on:
349 *
350 * If this record is set to be filtered and
351 * is enabled then do nothing.
352 *
353 * If this record is set to be filtered and
354 * it is not enabled, enable it.
355 *
356 * If this record is not set to be filtered
357 * and it is not enabled do nothing.
358 *
359 * If this record is not set to be filtered and
360 * it is enabled, disable it.
361 */
362 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
363
364 if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
365 (fl == 0))
366 return;
367
368 /*
369 * If it is enabled disable it,
370 * otherwise enable it!
371 */
372 if (fl == FTRACE_FL_ENABLED) {
373 /* swap new and old */
374 new = old;
375 old = ftrace_call_replace(ip, FTRACE_ADDR);
376 rec->flags &= ~FTRACE_FL_ENABLED;
377 } else {
378 new = ftrace_call_replace(ip, FTRACE_ADDR);
379 rec->flags |= FTRACE_FL_ENABLED;
380 }
381 } else {
382
383 if (enable)
384 new = ftrace_call_replace(ip, FTRACE_ADDR);
385 else
386 old = ftrace_call_replace(ip, FTRACE_ADDR);
387
388 if (enable) {
389 if (rec->flags & FTRACE_FL_ENABLED)
390 return;
391 rec->flags |= FTRACE_FL_ENABLED;
392 } else {
393 if (!(rec->flags & FTRACE_FL_ENABLED))
394 return;
395 rec->flags &= ~FTRACE_FL_ENABLED;
396 }
397 }
398
399 failed = ftrace_modify_code(ip, old, new);
400 if (failed) {
401 unsigned long key;
402 /* It is possible that the function hasn't been converted yet */
403 key = hash_long(ip, FTRACE_HASHBITS);
404 if (!ftrace_ip_in_hash(ip, key)) {
405 rec->flags |= FTRACE_FL_FAILED;
406 ftrace_free_rec(rec);
407 }
408
409 }
410}
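
The comment block in __ftrace_replace_code() describes a small truth table over the FILTER and ENABLED bits when filtering is active. A stand-alone sketch that enumerates the same four cases (the flag values are illustrative):

#include <stdio.h>

#define FL_FILTER  0x1
#define FL_ENABLED 0x2

/* Decide what to do with a record when filtering is on and enable=1. */
static const char *filter_action(unsigned flags)
{
	unsigned fl = flags & (FL_FILTER | FL_ENABLED);

	if (fl == (FL_FILTER | FL_ENABLED) || fl == 0)
		return "leave as is";
	if (fl == FL_ENABLED)
		return "disable (filtered out but currently enabled)";
	return "enable (filtered in but currently disabled)";
}

int main(void)
{
	unsigned f;

	for (f = 0; f <= (FL_FILTER | FL_ENABLED); f++)
		printf("flags=%u -> %s\n", f, filter_action(f));
	return 0;
}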
411
412static void ftrace_replace_code(int enable)
413{
414 unsigned char *new = NULL, *old = NULL;
415 struct dyn_ftrace *rec;
416 struct ftrace_page *pg;
417 int i;
418
419 if (enable)
420 old = ftrace_nop_replace();
421 else
422 new = ftrace_nop_replace();
423
424 for (pg = ftrace_pages_start; pg; pg = pg->next) {
425 for (i = 0; i < pg->index; i++) {
426 rec = &pg->records[i];
427
428 /* don't modify code that has already faulted */
429 if (rec->flags & FTRACE_FL_FAILED)
430 continue;
431
432 __ftrace_replace_code(rec, old, new, enable);
433 }
434 }
435}
436
437static void ftrace_shutdown_replenish(void)
438{
439 if (ftrace_pages->next)
440 return;
441
442 /* allocate another page */
443 ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
444}
445
446static void
447ftrace_code_disable(struct dyn_ftrace *rec)
448{
449 unsigned long ip;
450 unsigned char *nop, *call;
451 int failed;
452
453 ip = rec->ip;
454
455 nop = ftrace_nop_replace();
456 call = ftrace_call_replace(ip, MCOUNT_ADDR);
457
458 failed = ftrace_modify_code(ip, call, nop);
459 if (failed) {
460 rec->flags |= FTRACE_FL_FAILED;
461 ftrace_free_rec(rec);
462 }
463}
464
465static int __ftrace_modify_code(void *data)
466{
467 unsigned long addr;
468 int *command = data;
469
470 if (*command & FTRACE_ENABLE_CALLS)
471 ftrace_replace_code(1);
472 else if (*command & FTRACE_DISABLE_CALLS)
473 ftrace_replace_code(0);
474
475 if (*command & FTRACE_UPDATE_TRACE_FUNC)
476 ftrace_update_ftrace_func(ftrace_trace_function);
477
478 if (*command & FTRACE_ENABLE_MCOUNT) {
479 addr = (unsigned long)ftrace_record_ip;
480 ftrace_mcount_set(&addr);
481 } else if (*command & FTRACE_DISABLE_MCOUNT) {
482 addr = (unsigned long)ftrace_stub;
483 ftrace_mcount_set(&addr);
484 }
485
486 return 0;
487}
488
489static void ftrace_run_update_code(int command)
490{
491 stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
492}
493
494static ftrace_func_t saved_ftrace_func;
495
496static void ftrace_startup(void)
497{
498 int command = 0;
499
500 if (unlikely(ftrace_disabled))
501 return;
502
503 mutex_lock(&ftraced_lock);
504 ftraced_suspend++;
505 if (ftraced_suspend == 1)
506 command |= FTRACE_ENABLE_CALLS;
507
508 if (saved_ftrace_func != ftrace_trace_function) {
509 saved_ftrace_func = ftrace_trace_function;
510 command |= FTRACE_UPDATE_TRACE_FUNC;
511 }
512
513 if (!command || !ftrace_enabled)
514 goto out;
515
516 ftrace_run_update_code(command);
517 out:
518 mutex_unlock(&ftraced_lock);
519}
520
521static void ftrace_shutdown(void)
522{
523 int command = 0;
524
525 if (unlikely(ftrace_disabled))
526 return;
527
528 mutex_lock(&ftraced_lock);
529 ftraced_suspend--;
530 if (!ftraced_suspend)
531 command |= FTRACE_DISABLE_CALLS;
532
533 if (saved_ftrace_func != ftrace_trace_function) {
534 saved_ftrace_func = ftrace_trace_function;
535 command |= FTRACE_UPDATE_TRACE_FUNC;
536 }
537
538 if (!command || !ftrace_enabled)
539 goto out;
540
541 ftrace_run_update_code(command);
542 out:
543 mutex_unlock(&ftraced_lock);
544}
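
ftrace_startup() and ftrace_shutdown() treat ftraced_suspend as a reference count: the calls are patched in only on the 0 to 1 transition and patched out only on the 1 to 0 transition. A minimal sketch of that pattern; patch_calls() is a hypothetical stand-in for ftrace_run_update_code():

#include <stdio.h>

static int users;                    /* plays the role of ftraced_suspend */

static void patch_calls(int enable)  /* hypothetical stand-in */
{
	printf("%s calls\n", enable ? "enable" : "disable");
}

static void startup(void)
{
	if (++users == 1)            /* only the first user patches code */
		patch_calls(1);
}

static void shutdown(void)
{
	if (--users == 0)            /* only the last user unpatches */
		patch_calls(0);
}

int main(void)
{
	startup();   /* enable calls  */
	startup();   /* no-op         */
	shutdown();  /* no-op         */
	shutdown();  /* disable calls */
	return 0;
}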
545
546static void ftrace_startup_sysctl(void)
547{
548 int command = FTRACE_ENABLE_MCOUNT;
549
550 if (unlikely(ftrace_disabled))
551 return;
552
553 mutex_lock(&ftraced_lock);
554 /* Force update next time */
555 saved_ftrace_func = NULL;
556 /* ftraced_suspend is true if we want ftrace running */
557 if (ftraced_suspend)
558 command |= FTRACE_ENABLE_CALLS;
559
560 ftrace_run_update_code(command);
561 mutex_unlock(&ftraced_lock);
562}
563
564static void ftrace_shutdown_sysctl(void)
565{
566 int command = FTRACE_DISABLE_MCOUNT;
567
568 if (unlikely(ftrace_disabled))
569 return;
570
571 mutex_lock(&ftraced_lock);
572 /* ftraced_suspend is true if ftrace is running */
573 if (ftraced_suspend)
574 command |= FTRACE_DISABLE_CALLS;
575
576 ftrace_run_update_code(command);
577 mutex_unlock(&ftraced_lock);
578}
579
580static cycle_t ftrace_update_time;
581static unsigned long ftrace_update_cnt;
582unsigned long ftrace_update_tot_cnt;
583
584static int __ftrace_update_code(void *ignore)
585{
586 struct dyn_ftrace *p;
587 struct hlist_head head;
588 struct hlist_node *t;
589 int save_ftrace_enabled;
590 cycle_t start, stop;
591 int i;
592
593 /* Don't be recording funcs now */
594 save_ftrace_enabled = ftrace_enabled;
595 ftrace_enabled = 0;
596
597 start = ftrace_now(raw_smp_processor_id());
598 ftrace_update_cnt = 0;
599
600 /* No locks needed, the machine is stopped! */
601 for (i = 0; i < FTRACE_HASHSIZE; i++) {
602 if (hlist_empty(&ftrace_hash[i]))
603 continue;
604
605 head = ftrace_hash[i];
606 INIT_HLIST_HEAD(&ftrace_hash[i]);
607
608 /* all CPUS are stopped, we are safe to modify code */
609 hlist_for_each_entry(p, t, &head, node) {
610 ftrace_code_disable(p);
611 ftrace_update_cnt++;
612 }
613
614 }
615
616 stop = ftrace_now(raw_smp_processor_id());
617 ftrace_update_time = stop - start;
618 ftrace_update_tot_cnt += ftrace_update_cnt;
619
620 ftrace_enabled = save_ftrace_enabled;
621
622 return 0;
623}
624
625static void ftrace_update_code(void)
626{
627 if (unlikely(ftrace_disabled))
628 return;
629
630 stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
631}
632
633static int ftraced(void *ignore)
634{
635 unsigned long usecs;
636
637 while (!kthread_should_stop()) {
638
639 set_current_state(TASK_INTERRUPTIBLE);
640
641 /* check once a second */
642 schedule_timeout(HZ);
643
644 if (unlikely(ftrace_disabled))
645 continue;
646
647 mutex_lock(&ftrace_sysctl_lock);
648 mutex_lock(&ftraced_lock);
649 if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) {
650 ftrace_record_suspend++;
651 ftrace_update_code();
652 usecs = nsecs_to_usecs(ftrace_update_time);
653 if (ftrace_update_tot_cnt > 100000) {
654 ftrace_update_tot_cnt = 0;
655 pr_info("hm, dftrace overflow: %lu change%s"
656 " (%lu total) in %lu usec%s\n",
657 ftrace_update_cnt,
658 ftrace_update_cnt != 1 ? "s" : "",
659 ftrace_update_tot_cnt,
660 usecs, usecs != 1 ? "s" : "");
661 ftrace_disabled = 1;
662 WARN_ON_ONCE(1);
663 }
664 ftraced_trigger = 0;
665 ftrace_record_suspend--;
666 }
667 ftraced_iteration_counter++;
668 mutex_unlock(&ftraced_lock);
669 mutex_unlock(&ftrace_sysctl_lock);
670
671 wake_up_interruptible(&ftraced_waiters);
672
673 ftrace_shutdown_replenish();
674 }
675 __set_current_state(TASK_RUNNING);
676 return 0;
677}
678
679static int __init ftrace_dyn_table_alloc(void)
680{
681 struct ftrace_page *pg;
682 int cnt;
683 int i;
684
685 /* allocate a few pages */
686 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
687 if (!ftrace_pages_start)
688 return -1;
689
690 /*
691 * Allocate a few more pages.
692 *
693 * TODO: have some parser search vmlinux before
694 * final linking to find all calls to ftrace.
695 * Then we can:
696 * a) know how many pages to allocate.
697 * and/or
698 * b) set up the table then.
699 *
700 * The dynamic code is still necessary for
701 * modules.
702 */
703
704 pg = ftrace_pages = ftrace_pages_start;
705
706 cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
707
708 for (i = 0; i < cnt; i++) {
709 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
710
711 /* If we fail, we'll try later anyway */
712 if (!pg->next)
713 break;
714
715 pg = pg->next;
716 }
717
718 return 0;
719}
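
For a rough feel of the sizing: ENTRIES_PER_PAGE is the space left after the struct ftrace_page header divided by the record size, and ftrace_dyn_table_alloc() chains about NR_TO_INIT / ENTRIES_PER_PAGE extra pages. A small sketch of that arithmetic using assumed sizes (4 KiB pages, 16-byte header and records; not the kernel's actual values):

#include <stdio.h>

#define PAGE_SIZE   4096u          /* assumed page size          */
#define HEADER_SIZE 16u            /* assumed struct header size */
#define RECORD_SIZE 16u            /* assumed record size        */
#define NR_TO_INIT  10000u

int main(void)
{
	unsigned per_page = (PAGE_SIZE - HEADER_SIZE) / RECORD_SIZE;
	unsigned extra_pages = NR_TO_INIT / per_page;

	printf("%u records per page, about %u extra pages chained\n",
	       per_page, extra_pages);
	return 0;
}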
720
721enum {
722 FTRACE_ITER_FILTER = (1 << 0),
723 FTRACE_ITER_CONT = (1 << 1),
724};
725
726#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
727
728struct ftrace_iterator {
729 loff_t pos;
730 struct ftrace_page *pg;
731 unsigned idx;
732 unsigned flags;
733 unsigned char buffer[FTRACE_BUFF_MAX+1];
734 unsigned buffer_idx;
735 unsigned filtered;
736};
737
738static void *
739t_next(struct seq_file *m, void *v, loff_t *pos)
740{
741 struct ftrace_iterator *iter = m->private;
742 struct dyn_ftrace *rec = NULL;
743
744 (*pos)++;
745
746 retry:
747 if (iter->idx >= iter->pg->index) {
748 if (iter->pg->next) {
749 iter->pg = iter->pg->next;
750 iter->idx = 0;
751 goto retry;
752 }
753 } else {
754 rec = &iter->pg->records[iter->idx++];
755 if ((rec->flags & FTRACE_FL_FAILED) ||
756 ((iter->flags & FTRACE_ITER_FILTER) &&
757 !(rec->flags & FTRACE_FL_FILTER))) {
758 rec = NULL;
759 goto retry;
760 }
761 }
762
763 iter->pos = *pos;
764
765 return rec;
766}
767
768static void *t_start(struct seq_file *m, loff_t *pos)
769{
770 struct ftrace_iterator *iter = m->private;
771 void *p = NULL;
772 loff_t l = -1;
773
774 if (*pos != iter->pos) {
775 for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
776 ;
777 } else {
778 l = *pos;
779 p = t_next(m, p, &l);
780 }
781
782 return p;
783}
784
785static void t_stop(struct seq_file *m, void *p)
786{
787}
788
789static int t_show(struct seq_file *m, void *v)
790{
791 struct dyn_ftrace *rec = v;
792 char str[KSYM_SYMBOL_LEN];
793
794 if (!rec)
795 return 0;
796
797 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
798
799 seq_printf(m, "%s\n", str);
800
801 return 0;
802}
803
804static struct seq_operations show_ftrace_seq_ops = {
805 .start = t_start,
806 .next = t_next,
807 .stop = t_stop,
808 .show = t_show,
809};
810
811static int
812ftrace_avail_open(struct inode *inode, struct file *file)
813{
814 struct ftrace_iterator *iter;
815 int ret;
816
817 if (unlikely(ftrace_disabled))
818 return -ENODEV;
819
820 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
821 if (!iter)
822 return -ENOMEM;
823
824 iter->pg = ftrace_pages_start;
825 iter->pos = -1;
826
827 ret = seq_open(file, &show_ftrace_seq_ops);
828 if (!ret) {
829 struct seq_file *m = file->private_data;
830
831 m->private = iter;
832 } else {
833 kfree(iter);
834 }
835
836 return ret;
837}
838
839int ftrace_avail_release(struct inode *inode, struct file *file)
840{
841 struct seq_file *m = (struct seq_file *)file->private_data;
842 struct ftrace_iterator *iter = m->private;
843
844 seq_release(inode, file);
845 kfree(iter);
846
847 return 0;
848}
849
850static void ftrace_filter_reset(void)
851{
852 struct ftrace_page *pg;
853 struct dyn_ftrace *rec;
854 unsigned i;
855
856 /* keep kstop machine from running */
857 preempt_disable();
858 ftrace_filtered = 0;
859 pg = ftrace_pages_start;
860 while (pg) {
861 for (i = 0; i < pg->index; i++) {
862 rec = &pg->records[i];
863 if (rec->flags & FTRACE_FL_FAILED)
864 continue;
865 rec->flags &= ~FTRACE_FL_FILTER;
866 }
867 pg = pg->next;
868 }
869 preempt_enable();
870}
871
872static int
873ftrace_filter_open(struct inode *inode, struct file *file)
874{
875 struct ftrace_iterator *iter;
876 int ret = 0;
877
878 if (unlikely(ftrace_disabled))
879 return -ENODEV;
880
881 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
882 if (!iter)
883 return -ENOMEM;
884
885 mutex_lock(&ftrace_filter_lock);
886 if ((file->f_mode & FMODE_WRITE) &&
887 !(file->f_flags & O_APPEND))
888 ftrace_filter_reset();
889
890 if (file->f_mode & FMODE_READ) {
891 iter->pg = ftrace_pages_start;
892 iter->pos = -1;
893 iter->flags = FTRACE_ITER_FILTER;
894
895 ret = seq_open(file, &show_ftrace_seq_ops);
896 if (!ret) {
897 struct seq_file *m = file->private_data;
898 m->private = iter;
899 } else
900 kfree(iter);
901 } else
902 file->private_data = iter;
903 mutex_unlock(&ftrace_filter_lock);
904
905 return ret;
906}
907
908static ssize_t
909ftrace_filter_read(struct file *file, char __user *ubuf,
910 size_t cnt, loff_t *ppos)
911{
912 if (file->f_mode & FMODE_READ)
913 return seq_read(file, ubuf, cnt, ppos);
914 else
915 return -EPERM;
916}
917
918static loff_t
919ftrace_filter_lseek(struct file *file, loff_t offset, int origin)
920{
921 loff_t ret;
922
923 if (file->f_mode & FMODE_READ)
924 ret = seq_lseek(file, offset, origin);
925 else
926 file->f_pos = ret = 1;
927
928 return ret;
929}
930
931enum {
932 MATCH_FULL,
933 MATCH_FRONT_ONLY,
934 MATCH_MIDDLE_ONLY,
935 MATCH_END_ONLY,
936};
937
938static void
939ftrace_match(unsigned char *buff, int len)
940{
941 char str[KSYM_SYMBOL_LEN];
942 char *search = NULL;
943 struct ftrace_page *pg;
944 struct dyn_ftrace *rec;
945 int type = MATCH_FULL;
946 unsigned i, match = 0, search_len = 0;
947
948 for (i = 0; i < len; i++) {
949 if (buff[i] == '*') {
950 if (!i) {
951 search = buff + i + 1;
952 type = MATCH_END_ONLY;
953 search_len = len - (i + 1);
954 } else {
955 if (type == MATCH_END_ONLY) {
956 type = MATCH_MIDDLE_ONLY;
957 } else {
958 match = i;
959 type = MATCH_FRONT_ONLY;
960 }
961 buff[i] = 0;
962 break;
963 }
964 }
965 }
966
967 /* keep kstop machine from running */
968 preempt_disable();
969 ftrace_filtered = 1;
970 pg = ftrace_pages_start;
971 while (pg) {
972 for (i = 0; i < pg->index; i++) {
973 int matched = 0;
974 char *ptr;
975
976 rec = &pg->records[i];
977 if (rec->flags & FTRACE_FL_FAILED)
978 continue;
979 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
980 switch (type) {
981 case MATCH_FULL:
982 if (strcmp(str, buff) == 0)
983 matched = 1;
984 break;
985 case MATCH_FRONT_ONLY:
986 if (memcmp(str, buff, match) == 0)
987 matched = 1;
988 break;
989 case MATCH_MIDDLE_ONLY:
990 if (strstr(str, search))
991 matched = 1;
992 break;
993 case MATCH_END_ONLY:
994 ptr = strstr(str, search);
995 if (ptr && (ptr[search_len] == 0))
996 matched = 1;
997 break;
998 }
999 if (matched)
1000 rec->flags |= FTRACE_FL_FILTER;
1001 }
1002 pg = pg->next;
1003 }
1004 preempt_enable();
1005}
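
The loop at the top of ftrace_match() classifies the pattern by where the '*' appears: a leading '*' selects match-at-end, a trailing '*' selects match-at-front, and both select match-in-the-middle. A user-space sketch of just that classification step (the sample patterns are illustrative):

#include <stdio.h>
#include <string.h>

enum { MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY };

/* Classify a glob the same way the loop above does (sketch only). */
static int classify(char *buff)
{
	int len = strlen(buff);
	int type = MATCH_FULL;
	int i;

	for (i = 0; i < len; i++) {
		if (buff[i] != '*')
			continue;
		if (!i) {
			type = MATCH_END_ONLY;       /* "*foo"  */
		} else {
			type = (type == MATCH_END_ONLY) ?
				MATCH_MIDDLE_ONLY :   /* "*foo*" */
				MATCH_FRONT_ONLY;     /* "foo*"  */
			buff[i] = 0;                  /* truncate at the '*' */
			break;
		}
	}
	return type;
}

int main(void)
{
	char a[] = "sched*", b[] = "*lock", c[] = "*irq*", d[] = "do_fork";

	printf("%d %d %d %d\n", classify(a), classify(b), classify(c), classify(d));
	return 0;
}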
1006
1007static ssize_t
1008ftrace_filter_write(struct file *file, const char __user *ubuf,
1009 size_t cnt, loff_t *ppos)
1010{
1011 struct ftrace_iterator *iter;
1012 char ch;
1013 size_t read = 0;
1014 ssize_t ret;
1015
1016 if (!cnt || cnt < 0)
1017 return 0;
1018
1019 mutex_lock(&ftrace_filter_lock);
1020
1021 if (file->f_mode & FMODE_READ) {
1022 struct seq_file *m = file->private_data;
1023 iter = m->private;
1024 } else
1025 iter = file->private_data;
1026
1027 if (!*ppos) {
1028 iter->flags &= ~FTRACE_ITER_CONT;
1029 iter->buffer_idx = 0;
1030 }
1031
1032 ret = get_user(ch, ubuf++);
1033 if (ret)
1034 goto out;
1035 read++;
1036 cnt--;
1037
1038 if (!(iter->flags & ~FTRACE_ITER_CONT)) {
1039 /* skip white space */
1040 while (cnt && isspace(ch)) {
1041 ret = get_user(ch, ubuf++);
1042 if (ret)
1043 goto out;
1044 read++;
1045 cnt--;
1046 }
1047
1048
1049 if (isspace(ch)) {
1050 file->f_pos += read;
1051 ret = read;
1052 goto out;
1053 }
1054
1055 iter->buffer_idx = 0;
1056 }
1057
1058 while (cnt && !isspace(ch)) {
1059 if (iter->buffer_idx < FTRACE_BUFF_MAX)
1060 iter->buffer[iter->buffer_idx++] = ch;
1061 else {
1062 ret = -EINVAL;
1063 goto out;
1064 }
1065 ret = get_user(ch, ubuf++);
1066 if (ret)
1067 goto out;
1068 read++;
1069 cnt--;
1070 }
1071
1072 if (isspace(ch)) {
1073 iter->filtered++;
1074 iter->buffer[iter->buffer_idx] = 0;
1075 ftrace_match(iter->buffer, iter->buffer_idx);
1076 iter->buffer_idx = 0;
1077 } else
1078 iter->flags |= FTRACE_ITER_CONT;
1079
1080
1081 file->f_pos += read;
1082
1083 ret = read;
1084 out:
1085 mutex_unlock(&ftrace_filter_lock);
1086
1087 return ret;
1088}
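
ftrace_filter_write() consumes the user buffer one whitespace-separated token at a time, carrying a partial token across writes with FTRACE_ITER_CONT. A simplified sketch of that tokenizing loop over an in-memory string, without the continuation handling; apply_filter() is a hypothetical stand-in for ftrace_match():

#include <ctype.h>
#include <stdio.h>

#define BUFF_MAX 64

static void apply_filter(const char *tok)   /* stands in for ftrace_match() */
{
	printf("filter: '%s'\n", tok);
}

static void parse_filters(const char *input)
{
	char buffer[BUFF_MAX + 1];
	unsigned idx = 0;

	for (;; input++) {
		if (*input && !isspace((unsigned char)*input)) {
			if (idx < BUFF_MAX)
				buffer[idx++] = *input;   /* accumulate token */
			continue;
		}
		if (idx) {                            /* token finished */
			buffer[idx] = 0;
			apply_filter(buffer);
			idx = 0;
		}
		if (!*input)
			break;
	}
}

int main(void)
{
	parse_filters("  sched*  do_fork\nvfs_read ");
	return 0;
}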
1089
1090/**
1091 * ftrace_set_filter - set a function to filter on in ftrace
1092 * @buf - the string that holds the function filter text.
1093 * @len - the length of the string.
 1094 * @reset - non-zero to reset all filters before applying this filter.
1095 *
1096 * Filters denote which functions should be enabled when tracing is enabled.
1097 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
1098 */
1099void ftrace_set_filter(unsigned char *buf, int len, int reset)
1100{
1101 if (unlikely(ftrace_disabled))
1102 return;
1103
1104 mutex_lock(&ftrace_filter_lock);
1105 if (reset)
1106 ftrace_filter_reset();
1107 if (buf)
1108 ftrace_match(buf, len);
1109 mutex_unlock(&ftrace_filter_lock);
1110}
1111
1112static int
1113ftrace_filter_release(struct inode *inode, struct file *file)
1114{
1115 struct seq_file *m = (struct seq_file *)file->private_data;
1116 struct ftrace_iterator *iter;
1117
1118 mutex_lock(&ftrace_filter_lock);
1119 if (file->f_mode & FMODE_READ) {
1120 iter = m->private;
1121
1122 seq_release(inode, file);
1123 } else
1124 iter = file->private_data;
1125
1126 if (iter->buffer_idx) {
1127 iter->filtered++;
1128 iter->buffer[iter->buffer_idx] = 0;
1129 ftrace_match(iter->buffer, iter->buffer_idx);
1130 }
1131
1132 mutex_lock(&ftrace_sysctl_lock);
1133 mutex_lock(&ftraced_lock);
1134 if (iter->filtered && ftraced_suspend && ftrace_enabled)
1135 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1136 mutex_unlock(&ftraced_lock);
1137 mutex_unlock(&ftrace_sysctl_lock);
1138
1139 kfree(iter);
1140 mutex_unlock(&ftrace_filter_lock);
1141 return 0;
1142}
1143
1144static struct file_operations ftrace_avail_fops = {
1145 .open = ftrace_avail_open,
1146 .read = seq_read,
1147 .llseek = seq_lseek,
1148 .release = ftrace_avail_release,
1149};
1150
1151static struct file_operations ftrace_filter_fops = {
1152 .open = ftrace_filter_open,
1153 .read = ftrace_filter_read,
1154 .write = ftrace_filter_write,
1155 .llseek = ftrace_filter_lseek,
1156 .release = ftrace_filter_release,
1157};
1158
1159/**
1160 * ftrace_force_update - force an update to all recording ftrace functions
1161 *
1162 * The ftrace dynamic update daemon only wakes up once a second.
1163 * There may be cases where an update needs to be done immediately
1164 * for tests or internal kernel tracing to begin. This function
1165 * wakes the daemon to do an update and will not return until the
1166 * update is complete.
1167 */
1168int ftrace_force_update(void)
1169{
1170 unsigned long last_counter;
1171 DECLARE_WAITQUEUE(wait, current);
1172 int ret = 0;
1173
1174 if (unlikely(ftrace_disabled))
1175 return -ENODEV;
1176
1177 mutex_lock(&ftraced_lock);
1178 last_counter = ftraced_iteration_counter;
1179
1180 set_current_state(TASK_INTERRUPTIBLE);
1181 add_wait_queue(&ftraced_waiters, &wait);
1182
1183 if (unlikely(!ftraced_task)) {
1184 ret = -ENODEV;
1185 goto out;
1186 }
1187
1188 do {
1189 mutex_unlock(&ftraced_lock);
1190 wake_up_process(ftraced_task);
1191 schedule();
1192 mutex_lock(&ftraced_lock);
1193 if (signal_pending(current)) {
1194 ret = -EINTR;
1195 break;
1196 }
1197 set_current_state(TASK_INTERRUPTIBLE);
1198 } while (last_counter == ftraced_iteration_counter);
1199
1200 out:
1201 mutex_unlock(&ftraced_lock);
1202 remove_wait_queue(&ftraced_waiters, &wait);
1203 set_current_state(TASK_RUNNING);
1204
1205 return ret;
1206}
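
ftrace_force_update() kicks the daemon and then sleeps until ftraced_iteration_counter has advanced past the value sampled before waiting, which guarantees at least one complete daemon pass. A user-space sketch of the same wait-for-counter-advance idea using a pthread condition variable (purely illustrative; the kernel uses a waitqueue and wake_up_process):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static unsigned long iteration;
static int wake_requested;

/* Daemon: wait to be kicked, do one pass, bump the iteration counter. */
static void *daemon_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!wake_requested)
		pthread_cond_wait(&cond, &lock);
	iteration++;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Kick the daemon, then block until its counter has advanced. */
static void force_update(void)
{
	unsigned long last;

	pthread_mutex_lock(&lock);
	last = iteration;
	wake_requested = 1;
	pthread_cond_broadcast(&cond);
	while (iteration == last)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, daemon_thread, NULL);
	force_update();
	pthread_join(t, NULL);
	printf("daemon pass observed (iteration=%lu)\n", iteration);
	return 0;
}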
1207
1208static void ftrace_force_shutdown(void)
1209{
1210 struct task_struct *task;
1211 int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
1212
1213 mutex_lock(&ftraced_lock);
1214 task = ftraced_task;
1215 ftraced_task = NULL;
1216 ftraced_suspend = -1;
1217 ftrace_run_update_code(command);
1218 mutex_unlock(&ftraced_lock);
1219
1220 if (task)
1221 kthread_stop(task);
1222}
1223
1224static __init int ftrace_init_debugfs(void)
1225{
1226 struct dentry *d_tracer;
1227 struct dentry *entry;
1228
1229 d_tracer = tracing_init_dentry();
1230
1231 entry = debugfs_create_file("available_filter_functions", 0444,
1232 d_tracer, NULL, &ftrace_avail_fops);
1233 if (!entry)
1234 pr_warning("Could not create debugfs "
1235 "'available_filter_functions' entry\n");
1236
1237 entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
1238 NULL, &ftrace_filter_fops);
1239 if (!entry)
1240 pr_warning("Could not create debugfs "
1241 "'set_ftrace_filter' entry\n");
1242 return 0;
1243}
1244
1245fs_initcall(ftrace_init_debugfs);
1246
1247static int __init ftrace_dynamic_init(void)
1248{
1249 struct task_struct *p;
1250 unsigned long addr;
1251 int ret;
1252
1253 addr = (unsigned long)ftrace_record_ip;
1254
1255 stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
1256
1257 /* ftrace_dyn_arch_init places the return code in addr */
1258 if (addr) {
1259 ret = (int)addr;
1260 goto failed;
1261 }
1262
1263 ret = ftrace_dyn_table_alloc();
1264 if (ret)
1265 goto failed;
1266
1267 p = kthread_run(ftraced, NULL, "ftraced");
1268 if (IS_ERR(p)) {
1269 ret = -1;
1270 goto failed;
1271 }
1272
1273 last_ftrace_enabled = ftrace_enabled = 1;
1274 ftraced_task = p;
1275
1276 return 0;
1277
1278 failed:
1279 ftrace_disabled = 1;
1280 return ret;
1281}
1282
1283core_initcall(ftrace_dynamic_init);
1284#else
1285# define ftrace_startup() do { } while (0)
1286# define ftrace_shutdown() do { } while (0)
1287# define ftrace_startup_sysctl() do { } while (0)
1288# define ftrace_shutdown_sysctl() do { } while (0)
1289# define ftrace_force_shutdown() do { } while (0)
1290#endif /* CONFIG_DYNAMIC_FTRACE */
1291
1292/**
1293 * ftrace_kill - totally shutdown ftrace
1294 *
 1295 * This is a safety measure. If something is detected that seems
 1296 * wrong, calling this function keeps ftrace from making any
 1297 * further code modifications or updates. It is used when
 1298 * something has gone wrong.
1299 */
1300void ftrace_kill(void)
1301{
1302 mutex_lock(&ftrace_sysctl_lock);
1303 ftrace_disabled = 1;
1304 ftrace_enabled = 0;
1305
1306 clear_ftrace_function();
1307 mutex_unlock(&ftrace_sysctl_lock);
1308
1309 /* Try to totally disable ftrace */
1310 ftrace_force_shutdown();
1311}
1312
1313/**
1314 * register_ftrace_function - register a function for profiling
1315 * @ops - ops structure that holds the function for profiling.
1316 *
1317 * Register a function to be called by all functions in the
1318 * kernel.
1319 *
1320 * Note: @ops->func and all the functions it calls must be labeled
1321 * with "notrace", otherwise it will go into a
1322 * recursive loop.
1323 */
1324int register_ftrace_function(struct ftrace_ops *ops)
1325{
1326 int ret;
1327
1328 if (unlikely(ftrace_disabled))
1329 return -1;
1330
1331 mutex_lock(&ftrace_sysctl_lock);
1332 ret = __register_ftrace_function(ops);
1333 ftrace_startup();
1334 mutex_unlock(&ftrace_sysctl_lock);
1335
1336 return ret;
1337}
1338
1339/**
 1340 * unregister_ftrace_function - unregister a function for profiling.
1341 * @ops - ops structure that holds the function to unregister
1342 *
1343 * Unregister a function that was added to be called by ftrace profiling.
1344 */
1345int unregister_ftrace_function(struct ftrace_ops *ops)
1346{
1347 int ret;
1348
1349 mutex_lock(&ftrace_sysctl_lock);
1350 ret = __unregister_ftrace_function(ops);
1351 ftrace_shutdown();
1352 mutex_unlock(&ftrace_sysctl_lock);
1353
1354 return ret;
1355}
1356
1357int
1358ftrace_enable_sysctl(struct ctl_table *table, int write,
1359 struct file *file, void __user *buffer, size_t *lenp,
1360 loff_t *ppos)
1361{
1362 int ret;
1363
1364 if (unlikely(ftrace_disabled))
1365 return -ENODEV;
1366
1367 mutex_lock(&ftrace_sysctl_lock);
1368
1369 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
1370
1371 if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
1372 goto out;
1373
1374 last_ftrace_enabled = ftrace_enabled;
1375
1376 if (ftrace_enabled) {
1377
1378 ftrace_startup_sysctl();
1379
1380 /* we are starting ftrace again */
1381 if (ftrace_list != &ftrace_list_end) {
1382 if (ftrace_list->next == &ftrace_list_end)
1383 ftrace_trace_function = ftrace_list->func;
1384 else
1385 ftrace_trace_function = ftrace_list_func;
1386 }
1387
1388 } else {
1389 /* stopping ftrace calls (just send to ftrace_stub) */
1390 ftrace_trace_function = ftrace_stub;
1391
1392 ftrace_shutdown_sysctl();
1393 }
1394
1395 out:
1396 mutex_unlock(&ftrace_sysctl_lock);
1397 return ret;
1398}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 000000000000..4dcc4e85c5d6
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,3073 @@
1/*
2 * ring buffer based function tracer
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Originally taken from the RT patch by:
8 * Arnaldo Carvalho de Melo <acme@redhat.com>
9 *
10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III
13 */
14#include <linux/utsrelease.h>
15#include <linux/kallsyms.h>
16#include <linux/seq_file.h>
17#include <linux/debugfs.h>
18#include <linux/pagemap.h>
19#include <linux/hardirq.h>
20#include <linux/linkage.h>
21#include <linux/uaccess.h>
22#include <linux/ftrace.h>
23#include <linux/module.h>
24#include <linux/percpu.h>
25#include <linux/ctype.h>
26#include <linux/init.h>
27#include <linux/poll.h>
28#include <linux/gfp.h>
29#include <linux/fs.h>
30#include <linux/writeback.h>
31
32#include <linux/stacktrace.h>
33
34#include "trace.h"
35
36unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
37unsigned long __read_mostly tracing_thresh;
38
39static unsigned long __read_mostly tracing_nr_buffers;
40static cpumask_t __read_mostly tracing_buffer_mask;
41
42#define for_each_tracing_cpu(cpu) \
43 for_each_cpu_mask(cpu, tracing_buffer_mask)
44
45/* dummy trace to disable tracing */
46static struct tracer no_tracer __read_mostly = {
47 .name = "none",
48};
49
50static int trace_alloc_page(void);
51static int trace_free_page(void);
52
53static int tracing_disabled = 1;
54
55static unsigned long tracing_pages_allocated;
56
57long
58ns2usecs(cycle_t nsec)
59{
60 nsec += 500;
61 do_div(nsec, 1000);
62 return nsec;
63}
64
65cycle_t ftrace_now(int cpu)
66{
67 return cpu_clock(cpu);
68}
69
70/*
71 * The global_trace is the descriptor that holds the tracing
72 * buffers for the live tracing. For each CPU, it contains
  73 * a linked list of pages that will store trace entries. The
  74 * page descriptors of those pages are used to hold the
  75 * linked list, by linking the lru item in each page descriptor
  76 * to the other pages of that CPU's buffer.
77 *
78 * For each active CPU there is a data field that holds the
79 * pages for the buffer for that CPU. Each CPU has the same number
80 * of pages allocated for its buffer.
81 */
82static struct trace_array global_trace;
83
84static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
85
86/*
87 * The max_tr is used to snapshot the global_trace when a maximum
88 * latency is reached. Some tracers will use this to store a maximum
89 * trace while it continues examining live traces.
90 *
91 * The buffers for the max_tr are set up the same as the global_trace.
  92 * When a snapshot is taken, the linked list of the max_tr is swapped
  93 * with the linked list of the global_trace and the buffers are reset for
  94 * the global_trace so the tracing can continue.
95 */
96static struct trace_array max_tr;
97
98static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
99
100/* tracer_enabled is used to toggle activation of a tracer */
101static int tracer_enabled = 1;
102
103/*
104 * trace_nr_entries is the number of entries that is allocated
105 * for a buffer. Note, the number of entries is always rounded
106 * to ENTRIES_PER_PAGE.
107 */
108static unsigned long trace_nr_entries = 65536UL;
109
110/* trace_types holds a link list of available tracers. */
111static struct tracer *trace_types __read_mostly;
112
113/* current_trace points to the tracer that is currently active */
114static struct tracer *current_trace __read_mostly;
115
116/*
117 * max_tracer_type_len is used to simplify the allocating of
118 * buffers to read userspace tracer names. We keep track of
119 * the longest tracer name registered.
120 */
121static int max_tracer_type_len;
122
123/*
124 * trace_types_lock is used to protect the trace_types list.
125 * This lock is also used to keep user access serialized.
126 * Accesses from userspace will grab this lock while userspace
127 * activities happen inside the kernel.
128 */
129static DEFINE_MUTEX(trace_types_lock);
130
131/* trace_wait is a waitqueue for tasks blocked on trace_poll */
132static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
133
134/* trace_flags holds iter_ctrl options */
135unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
136
137/**
138 * trace_wake_up - wake up tasks waiting for trace input
139 *
140 * Simply wakes up any task that is blocked on the trace_wait
 141 * queue. This is used with trace_poll for tasks polling the trace.
142 */
143void trace_wake_up(void)
144{
145 /*
146 * The runqueue_is_locked() can fail, but this is the best we
147 * have for now:
148 */
149 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
150 wake_up(&trace_wait);
151}
152
153#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
154
155static int __init set_nr_entries(char *str)
156{
157 unsigned long nr_entries;
158 int ret;
159
160 if (!str)
161 return 0;
162 ret = strict_strtoul(str, 0, &nr_entries);
163 /* nr_entries can not be zero */
164 if (ret < 0 || nr_entries == 0)
165 return 0;
166 trace_nr_entries = nr_entries;
167 return 1;
168}
169__setup("trace_entries=", set_nr_entries);
170
171unsigned long nsecs_to_usecs(unsigned long nsecs)
172{
173 return nsecs / 1000;
174}
175
176/*
177 * trace_flag_type is an enumeration that holds different
178 * states when a trace occurs. These are:
179 * IRQS_OFF - interrupts were disabled
 180 *  NEED_RESCHED	- reschedule is requested
181 * HARDIRQ - inside an interrupt handler
182 * SOFTIRQ - inside a softirq handler
183 */
184enum trace_flag_type {
185 TRACE_FLAG_IRQS_OFF = 0x01,
186 TRACE_FLAG_NEED_RESCHED = 0x02,
187 TRACE_FLAG_HARDIRQ = 0x04,
188 TRACE_FLAG_SOFTIRQ = 0x08,
189};
190
191/*
192 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
193 * control the output of kernel symbols.
194 */
195#define TRACE_ITER_SYM_MASK \
196 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
197
 198/* These must match the bit positions in trace_iterator_flags */
199static const char *trace_options[] = {
200 "print-parent",
201 "sym-offset",
202 "sym-addr",
203 "verbose",
204 "raw",
205 "hex",
206 "bin",
207 "block",
208 "stacktrace",
209 "sched-tree",
210 NULL
211};
212
213/*
214 * ftrace_max_lock is used to protect the swapping of buffers
215 * when taking a max snapshot. The buffers themselves are
216 * protected by per_cpu spinlocks. But the action of the swap
217 * needs its own lock.
218 *
219 * This is defined as a raw_spinlock_t in order to help
220 * with performance when lockdep debugging is enabled.
221 */
222static raw_spinlock_t ftrace_max_lock =
223 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
224
225/*
226 * Copy the new maximum trace into the separate maximum-trace
227 * structure. (this way the maximum trace is permanently saved,
228 * for later retrieval via /debugfs/tracing/latency_trace)
229 */
230static void
231__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
232{
233 struct trace_array_cpu *data = tr->data[cpu];
234
235 max_tr.cpu = cpu;
236 max_tr.time_start = data->preempt_timestamp;
237
238 data = max_tr.data[cpu];
239 data->saved_latency = tracing_max_latency;
240
241 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
242 data->pid = tsk->pid;
243 data->uid = tsk->uid;
244 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
245 data->policy = tsk->policy;
246 data->rt_priority = tsk->rt_priority;
247
248 /* record this tasks comm */
249 tracing_record_cmdline(current);
250}
251
252/**
253 * check_pages - integrity check of trace buffers
254 *
 255 * As a safety measure we check to make sure the data pages have not
256 * been corrupted. TODO: configure to disable this because it adds
257 * a bit of overhead.
258 */
259void check_pages(struct trace_array_cpu *data)
260{
261 struct page *page, *tmp;
262
263 BUG_ON(data->trace_pages.next->prev != &data->trace_pages);
264 BUG_ON(data->trace_pages.prev->next != &data->trace_pages);
265
266 list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
267 BUG_ON(page->lru.next->prev != &page->lru);
268 BUG_ON(page->lru.prev->next != &page->lru);
269 }
270}
271
272/**
273 * head_page - page address of the first page in per_cpu buffer.
274 *
275 * head_page returns the page address of the first page in
 276 * a per_cpu buffer. This also performs various consistency
277 * checks to make sure the buffer has not been corrupted.
278 */
279void *head_page(struct trace_array_cpu *data)
280{
281 struct page *page;
282
283 check_pages(data);
284 if (list_empty(&data->trace_pages))
285 return NULL;
286
287 page = list_entry(data->trace_pages.next, struct page, lru);
288 BUG_ON(&page->lru == &data->trace_pages);
289
290 return page_address(page);
291}
292
293/**
294 * trace_seq_printf - sequence printing of trace information
295 * @s: trace sequence descriptor
296 * @fmt: printf format string
297 *
298 * The tracer may use either sequence operations or its own
 299 * copy to user routines. To simplify formatting of a trace,
300 * trace_seq_printf is used to store strings into a special
301 * buffer (@s). Then the output may be either used by
302 * the sequencer or pulled into another buffer.
303 */
304int
305trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
306{
307 int len = (PAGE_SIZE - 1) - s->len;
308 va_list ap;
309 int ret;
310
311 if (!len)
312 return 0;
313
314 va_start(ap, fmt);
315 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
316 va_end(ap);
317
318 /* If we can't write it all, don't bother writing anything */
319 if (ret >= len)
320 return 0;
321
322 s->len += ret;
323
324 return len;
325}
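
trace_seq_printf() appends into a page-sized buffer and refuses partial writes: if vsnprintf() reports the text would not fit, the length is not advanced and nothing is committed. A minimal user-space version of that all-or-nothing append (the buffer size and struct name are illustrative):

#include <stdarg.h>
#include <stdio.h>

#define SEQ_SIZE 32                     /* small for demonstration */

struct seq {
	char buffer[SEQ_SIZE];
	int len;
};

static int seq_printf(struct seq *s, const char *fmt, ...)
{
	int room = (SEQ_SIZE - 1) - s->len;
	va_list ap;
	int ret;

	if (!room)
		return 0;

	va_start(ap, fmt);
	ret = vsnprintf(s->buffer + s->len, room, fmt, ap);
	va_end(ap);

	if (ret >= room)                /* would truncate: commit nothing */
		return 0;

	s->len += ret;
	return room;
}

int main(void)
{
	struct seq s = { .len = 0 };

	printf("fit: %d\n", seq_printf(&s, "pid=%d", 42) != 0);
	printf("too big: %d\n", seq_printf(&s, "%64s", "x") != 0);
	printf("committed: '%.*s'\n", s.len, s.buffer);
	return 0;
}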
326
327/**
328 * trace_seq_puts - trace sequence printing of simple string
329 * @s: trace sequence descriptor
330 * @str: simple string to record
331 *
332 * The tracer may use either the sequence operations or its own
333 * copy to user routines. This function records a simple string
334 * into a special buffer (@s) for later retrieval by a sequencer
335 * or other mechanism.
336 */
337static int
338trace_seq_puts(struct trace_seq *s, const char *str)
339{
340 int len = strlen(str);
341
342 if (len > ((PAGE_SIZE - 1) - s->len))
343 return 0;
344
345 memcpy(s->buffer + s->len, str, len);
346 s->len += len;
347
348 return len;
349}
350
351static int
352trace_seq_putc(struct trace_seq *s, unsigned char c)
353{
354 if (s->len >= (PAGE_SIZE - 1))
355 return 0;
356
357 s->buffer[s->len++] = c;
358
359 return 1;
360}
361
362static int
363trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
364{
365 if (len > ((PAGE_SIZE - 1) - s->len))
366 return 0;
367
368 memcpy(s->buffer + s->len, mem, len);
369 s->len += len;
370
371 return len;
372}
373
374#define HEX_CHARS 17
375static const char hex2asc[] = "0123456789abcdef";
376
377static int
378trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
379{
380 unsigned char hex[HEX_CHARS];
381 unsigned char *data = mem;
382 unsigned char byte;
383 int i, j;
384
385 BUG_ON(len >= HEX_CHARS);
386
387#ifdef __BIG_ENDIAN
388 for (i = 0, j = 0; i < len; i++) {
389#else
390 for (i = len-1, j = 0; i >= 0; i--) {
391#endif
392 byte = data[i];
393
394 hex[j++] = hex2asc[byte & 0x0f];
395 hex[j++] = hex2asc[byte >> 4];
396 }
397 hex[j++] = ' ';
398
399 return trace_seq_putmem(s, hex, j);
400}
401
402static void
403trace_seq_reset(struct trace_seq *s)
404{
405 s->len = 0;
406 s->readpos = 0;
407}
408
409ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
410{
411 int len;
412 int ret;
413
414 if (s->len <= s->readpos)
415 return -EBUSY;
416
417 len = s->len - s->readpos;
418 if (cnt > len)
419 cnt = len;
420 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
421 if (ret)
422 return -EFAULT;
423
424 s->readpos += len;
425 return cnt;
426}
427
428static void
429trace_print_seq(struct seq_file *m, struct trace_seq *s)
430{
431 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
432
433 s->buffer[len] = 0;
434 seq_puts(m, s->buffer);
435
436 trace_seq_reset(s);
437}
438
439/*
440 * flip the trace buffers between two trace descriptors.
 441 * This is usually done between the global_trace and the max_tr
 442 * buffers, to record a snapshot of the current trace.
443 *
444 * The ftrace_max_lock must be held.
445 */
446static void
447flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
448{
449 struct list_head flip_pages;
450
451 INIT_LIST_HEAD(&flip_pages);
452
453 memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
454 sizeof(struct trace_array_cpu) -
455 offsetof(struct trace_array_cpu, trace_head_idx));
456
457 check_pages(tr1);
458 check_pages(tr2);
459 list_splice_init(&tr1->trace_pages, &flip_pages);
460 list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
461 list_splice_init(&flip_pages, &tr2->trace_pages);
462 BUG_ON(!list_empty(&flip_pages));
463 check_pages(tr1);
464 check_pages(tr2);
465}
466
467/**
468 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
469 * @tr: tracer
470 * @tsk: the task with the latency
471 * @cpu: The cpu that initiated the trace.
472 *
473 * Flip the buffers between the @tr and the max_tr and record information
474 * about which task was the cause of this latency.
475 */
476void
477update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
478{
479 struct trace_array_cpu *data;
480 int i;
481
482 WARN_ON_ONCE(!irqs_disabled());
483 __raw_spin_lock(&ftrace_max_lock);
484 /* clear out all the previous traces */
485 for_each_tracing_cpu(i) {
486 data = tr->data[i];
487 flip_trace(max_tr.data[i], data);
488 tracing_reset(data);
489 }
490
491 __update_max_tr(tr, tsk, cpu);
492 __raw_spin_unlock(&ftrace_max_lock);
493}
494
495/**
496 * update_max_tr_single - only copy one trace over, and reset the rest
497 * @tr - tracer
498 * @tsk - task with the latency
499 * @cpu - the cpu of the buffer to copy.
500 *
501 * Flip the trace of a single CPU buffer between the @tr and the max_tr.
502 */
503void
504update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
505{
506 struct trace_array_cpu *data = tr->data[cpu];
507 int i;
508
509 WARN_ON_ONCE(!irqs_disabled());
510 __raw_spin_lock(&ftrace_max_lock);
511 for_each_tracing_cpu(i)
512 tracing_reset(max_tr.data[i]);
513
514 flip_trace(max_tr.data[cpu], data);
515 tracing_reset(data);
516
517 __update_max_tr(tr, tsk, cpu);
518 __raw_spin_unlock(&ftrace_max_lock);
519}
520
521/**
522 * register_tracer - register a tracer with the ftrace system.
523 * @type - the plugin for the tracer
524 *
525 * Register a new plugin tracer.
526 */
527int register_tracer(struct tracer *type)
528{
529 struct tracer *t;
530 int len;
531 int ret = 0;
532
533 if (!type->name) {
534 pr_info("Tracer must have a name\n");
535 return -1;
536 }
537
538 mutex_lock(&trace_types_lock);
539 for (t = trace_types; t; t = t->next) {
540 if (strcmp(type->name, t->name) == 0) {
541 /* already found */
542 pr_info("Trace %s already registered\n",
543 type->name);
544 ret = -1;
545 goto out;
546 }
547 }
548
549#ifdef CONFIG_FTRACE_STARTUP_TEST
550 if (type->selftest) {
551 struct tracer *saved_tracer = current_trace;
552 struct trace_array_cpu *data;
553 struct trace_array *tr = &global_trace;
554 int saved_ctrl = tr->ctrl;
555 int i;
556 /*
557 * Run a selftest on this tracer.
558 * Here we reset the trace buffer, and set the current
559 * tracer to be this tracer. The tracer can then run some
560 * internal tracing to verify that everything is in order.
561 * If we fail, we do not register this tracer.
562 */
563 for_each_tracing_cpu(i) {
564 data = tr->data[i];
565 if (!head_page(data))
566 continue;
567 tracing_reset(data);
568 }
569 current_trace = type;
570 tr->ctrl = 0;
571 /* the test is responsible for initializing and enabling */
572 pr_info("Testing tracer %s: ", type->name);
573 ret = type->selftest(type, tr);
574 /* the test is responsible for resetting too */
575 current_trace = saved_tracer;
576 tr->ctrl = saved_ctrl;
577 if (ret) {
578 printk(KERN_CONT "FAILED!\n");
579 goto out;
580 }
581 /* Only reset on passing, to avoid touching corrupted buffers */
582 for_each_tracing_cpu(i) {
583 data = tr->data[i];
584 if (!head_page(data))
585 continue;
586 tracing_reset(data);
587 }
588 printk(KERN_CONT "PASSED\n");
589 }
590#endif
591
592 type->next = trace_types;
593 trace_types = type;
594 len = strlen(type->name);
595 if (len > max_tracer_type_len)
596 max_tracer_type_len = len;
597
598 out:
599 mutex_unlock(&trace_types_lock);
600
601 return ret;
602}
603
604void unregister_tracer(struct tracer *type)
605{
606 struct tracer **t;
607 int len;
608
609 mutex_lock(&trace_types_lock);
610 for (t = &trace_types; *t; t = &(*t)->next) {
611 if (*t == type)
612 goto found;
613 }
614 pr_info("Trace %s not registered\n", type->name);
615 goto out;
616
617 found:
618 *t = (*t)->next;
619 if (strlen(type->name) != max_tracer_type_len)
620 goto out;
621
622 max_tracer_type_len = 0;
623 for (t = &trace_types; *t; t = &(*t)->next) {
624 len = strlen((*t)->name);
625 if (len > max_tracer_type_len)
626 max_tracer_type_len = len;
627 }
628 out:
629 mutex_unlock(&trace_types_lock);
630}
631
632void tracing_reset(struct trace_array_cpu *data)
633{
634 data->trace_idx = 0;
635 data->overrun = 0;
636 data->trace_head = data->trace_tail = head_page(data);
637 data->trace_head_idx = 0;
638 data->trace_tail_idx = 0;
639}
640
641#define SAVED_CMDLINES 128
642static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
643static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
644static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
645static int cmdline_idx;
646static DEFINE_SPINLOCK(trace_cmdline_lock);
647
648/* trace in all context switches */
649atomic_t trace_record_cmdline_enabled __read_mostly;
650
 651/* temporarily disable recording */
652atomic_t trace_record_cmdline_disabled __read_mostly;
653
654static void trace_init_cmdlines(void)
655{
656 memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
657 memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
658 cmdline_idx = 0;
659}
660
661void trace_stop_cmdline_recording(void);
662
663static void trace_save_cmdline(struct task_struct *tsk)
664{
665 unsigned map;
666 unsigned idx;
667
668 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
669 return;
670
671 /*
672 * It's not the end of the world if we don't get
673 * the lock, but we also don't want to spin
674 * nor do we want to disable interrupts,
675 * so if we miss here, then better luck next time.
676 */
677 if (!spin_trylock(&trace_cmdline_lock))
678 return;
679
680 idx = map_pid_to_cmdline[tsk->pid];
681 if (idx >= SAVED_CMDLINES) {
682 idx = (cmdline_idx + 1) % SAVED_CMDLINES;
683
684 map = map_cmdline_to_pid[idx];
685 if (map <= PID_MAX_DEFAULT)
686 map_pid_to_cmdline[map] = (unsigned)-1;
687
688 map_pid_to_cmdline[tsk->pid] = idx;
689
690 cmdline_idx = idx;
691 }
692
693 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
694
695 spin_unlock(&trace_cmdline_lock);
696}
697
698static char *trace_find_cmdline(int pid)
699{
700 char *cmdline = "<...>";
701 unsigned map;
702
703 if (!pid)
704 return "<idle>";
705
706 if (pid > PID_MAX_DEFAULT)
707 goto out;
708
709 map = map_pid_to_cmdline[pid];
710 if (map >= SAVED_CMDLINES)
711 goto out;
712
713 cmdline = saved_cmdlines[map];
714
715 out:
716 return cmdline;
717}
718
719void tracing_record_cmdline(struct task_struct *tsk)
720{
721 if (atomic_read(&trace_record_cmdline_disabled))
722 return;
723
724 trace_save_cmdline(tsk);
725}
726
727static inline struct list_head *
728trace_next_list(struct trace_array_cpu *data, struct list_head *next)
729{
730 /*
 731	 * Round-robin - but skip the head (which is not a real page):
732 */
733 next = next->next;
734 if (unlikely(next == &data->trace_pages))
735 next = next->next;
736 BUG_ON(next == &data->trace_pages);
737
738 return next;
739}
740
741static inline void *
742trace_next_page(struct trace_array_cpu *data, void *addr)
743{
744 struct list_head *next;
745 struct page *page;
746
747 page = virt_to_page(addr);
748
749 next = trace_next_list(data, &page->lru);
750 page = list_entry(next, struct page, lru);
751
752 return page_address(page);
753}
754
755static inline struct trace_entry *
756tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
757{
758 unsigned long idx, idx_next;
759 struct trace_entry *entry;
760
761 data->trace_idx++;
762 idx = data->trace_head_idx;
763 idx_next = idx + 1;
764
765 BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
766
767 entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
768
769 if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
770 data->trace_head = trace_next_page(data, data->trace_head);
771 idx_next = 0;
772 }
773
774 if (data->trace_head == data->trace_tail &&
775 idx_next == data->trace_tail_idx) {
776 /* overrun */
777 data->overrun++;
778 data->trace_tail_idx++;
779 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
780 data->trace_tail =
781 trace_next_page(data, data->trace_tail);
782 data->trace_tail_idx = 0;
783 }
784 }
785
786 data->trace_head_idx = idx_next;
787
788 return entry;
789}
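
tracing_get_trace_entry() hands out the slot at the head index and, when the head catches up with the tail, advances the tail so the oldest entry is overwritten and counted in data->overrun. The same overwrite-oldest policy on a flat array, as a sketch (the real buffer is a linked list of pages, not an array):

#include <stdio.h>

#define NENTRIES 4

struct ring {
	int data[NENTRIES];
	unsigned head, tail, count, overrun;
};

static void ring_write(struct ring *r, int val)
{
	if (r->count == NENTRIES) {               /* full: drop the oldest */
		r->tail = (r->tail + 1) % NENTRIES;
		r->overrun++;
	} else {
		r->count++;
	}
	r->data[r->head] = val;
	r->head = (r->head + 1) % NENTRIES;
}

int main(void)
{
	struct ring r = { { 0 }, 0, 0, 0, 0 };
	int i;

	for (i = 1; i <= 6; i++)
		ring_write(&r, i);
	printf("overruns: %u\n", r.overrun);      /* oldest entries lost */
	return 0;
}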
790
791static inline void
792tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
793{
794 struct task_struct *tsk = current;
795 unsigned long pc;
796
797 pc = preempt_count();
798
799 entry->preempt_count = pc & 0xff;
800 entry->pid = (tsk) ? tsk->pid : 0;
801 entry->t = ftrace_now(raw_smp_processor_id());
802 entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
803 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
804 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
805 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
806}
807
808void
809trace_function(struct trace_array *tr, struct trace_array_cpu *data,
810 unsigned long ip, unsigned long parent_ip, unsigned long flags)
811{
812 struct trace_entry *entry;
813 unsigned long irq_flags;
814
815 raw_local_irq_save(irq_flags);
816 __raw_spin_lock(&data->lock);
817 entry = tracing_get_trace_entry(tr, data);
818 tracing_generic_entry_update(entry, flags);
819 entry->type = TRACE_FN;
820 entry->fn.ip = ip;
821 entry->fn.parent_ip = parent_ip;
822 __raw_spin_unlock(&data->lock);
823 raw_local_irq_restore(irq_flags);
824}
825
826void
827ftrace(struct trace_array *tr, struct trace_array_cpu *data,
828 unsigned long ip, unsigned long parent_ip, unsigned long flags)
829{
830 if (likely(!atomic_read(&data->disabled)))
831 trace_function(tr, data, ip, parent_ip, flags);
832}
833
834#ifdef CONFIG_MMIOTRACE
835void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
836 struct mmiotrace_rw *rw)
837{
838 struct trace_entry *entry;
839 unsigned long irq_flags;
840
841 raw_local_irq_save(irq_flags);
842 __raw_spin_lock(&data->lock);
843
844 entry = tracing_get_trace_entry(tr, data);
845 tracing_generic_entry_update(entry, 0);
846 entry->type = TRACE_MMIO_RW;
847 entry->mmiorw = *rw;
848
849 __raw_spin_unlock(&data->lock);
850 raw_local_irq_restore(irq_flags);
851
852 trace_wake_up();
853}
854
855void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
856 struct mmiotrace_map *map)
857{
858 struct trace_entry *entry;
859 unsigned long irq_flags;
860
861 raw_local_irq_save(irq_flags);
862 __raw_spin_lock(&data->lock);
863
864 entry = tracing_get_trace_entry(tr, data);
865 tracing_generic_entry_update(entry, 0);
866 entry->type = TRACE_MMIO_MAP;
867 entry->mmiomap = *map;
868
869 __raw_spin_unlock(&data->lock);
870 raw_local_irq_restore(irq_flags);
871
872 trace_wake_up();
873}
874#endif
875
876void __trace_stack(struct trace_array *tr,
877 struct trace_array_cpu *data,
878 unsigned long flags,
879 int skip)
880{
881 struct trace_entry *entry;
882 struct stack_trace trace;
883
884 if (!(trace_flags & TRACE_ITER_STACKTRACE))
885 return;
886
887 entry = tracing_get_trace_entry(tr, data);
888 tracing_generic_entry_update(entry, flags);
889 entry->type = TRACE_STACK;
890
891 memset(&entry->stack, 0, sizeof(entry->stack));
892
893 trace.nr_entries = 0;
894 trace.max_entries = FTRACE_STACK_ENTRIES;
895 trace.skip = skip;
896 trace.entries = entry->stack.caller;
897
898 save_stack_trace(&trace);
899}
900
901void
902__trace_special(void *__tr, void *__data,
903 unsigned long arg1, unsigned long arg2, unsigned long arg3)
904{
905 struct trace_array_cpu *data = __data;
906 struct trace_array *tr = __tr;
907 struct trace_entry *entry;
908 unsigned long irq_flags;
909
910 raw_local_irq_save(irq_flags);
911 __raw_spin_lock(&data->lock);
912 entry = tracing_get_trace_entry(tr, data);
913 tracing_generic_entry_update(entry, 0);
914 entry->type = TRACE_SPECIAL;
915 entry->special.arg1 = arg1;
916 entry->special.arg2 = arg2;
917 entry->special.arg3 = arg3;
918 __trace_stack(tr, data, irq_flags, 4);
919 __raw_spin_unlock(&data->lock);
920 raw_local_irq_restore(irq_flags);
921
922 trace_wake_up();
923}
924
925void
926tracing_sched_switch_trace(struct trace_array *tr,
927 struct trace_array_cpu *data,
928 struct task_struct *prev,
929 struct task_struct *next,
930 unsigned long flags)
931{
932 struct trace_entry *entry;
933 unsigned long irq_flags;
934
935 raw_local_irq_save(irq_flags);
936 __raw_spin_lock(&data->lock);
937 entry = tracing_get_trace_entry(tr, data);
938 tracing_generic_entry_update(entry, flags);
939 entry->type = TRACE_CTX;
940 entry->ctx.prev_pid = prev->pid;
941 entry->ctx.prev_prio = prev->prio;
942 entry->ctx.prev_state = prev->state;
943 entry->ctx.next_pid = next->pid;
944 entry->ctx.next_prio = next->prio;
945 entry->ctx.next_state = next->state;
946 __trace_stack(tr, data, flags, 5);
947 __raw_spin_unlock(&data->lock);
948 raw_local_irq_restore(irq_flags);
949}
950
951void
952tracing_sched_wakeup_trace(struct trace_array *tr,
953 struct trace_array_cpu *data,
954 struct task_struct *wakee,
955 struct task_struct *curr,
956 unsigned long flags)
957{
958 struct trace_entry *entry;
959 unsigned long irq_flags;
960
961 raw_local_irq_save(irq_flags);
962 __raw_spin_lock(&data->lock);
963 entry = tracing_get_trace_entry(tr, data);
964 tracing_generic_entry_update(entry, flags);
965 entry->type = TRACE_WAKE;
966 entry->ctx.prev_pid = curr->pid;
967 entry->ctx.prev_prio = curr->prio;
968 entry->ctx.prev_state = curr->state;
969 entry->ctx.next_pid = wakee->pid;
970 entry->ctx.next_prio = wakee->prio;
971 entry->ctx.next_state = wakee->state;
972 __trace_stack(tr, data, flags, 6);
973 __raw_spin_unlock(&data->lock);
974 raw_local_irq_restore(irq_flags);
975
976 trace_wake_up();
977}
978
979#ifdef CONFIG_FTRACE
980static void
981function_trace_call(unsigned long ip, unsigned long parent_ip)
982{
983 struct trace_array *tr = &global_trace;
984 struct trace_array_cpu *data;
985 unsigned long flags;
986 long disabled;
987 int cpu;
988
989 if (unlikely(!tracer_enabled))
990 return;
991
992 local_irq_save(flags);
993 cpu = raw_smp_processor_id();
994 data = tr->data[cpu];
995 disabled = atomic_inc_return(&data->disabled);
996
997 if (likely(disabled == 1))
998 trace_function(tr, data, ip, parent_ip, flags);
999
1000 atomic_dec(&data->disabled);
1001 local_irq_restore(flags);
1002}
1003
1004static struct ftrace_ops trace_ops __read_mostly =
1005{
1006 .func = function_trace_call,
1007};
1008
1009void tracing_start_function_trace(void)
1010{
1011 register_ftrace_function(&trace_ops);
1012}
1013
1014void tracing_stop_function_trace(void)
1015{
1016 unregister_ftrace_function(&trace_ops);
1017}
1018#endif
1019
1020enum trace_file_type {
1021 TRACE_FILE_LAT_FMT = 1,
1022};
1023
1024static struct trace_entry *
1025trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
1026 struct trace_iterator *iter, int cpu)
1027{
1028 struct page *page;
1029 struct trace_entry *array;
1030
1031 if (iter->next_idx[cpu] >= tr->entries ||
1032 iter->next_idx[cpu] >= data->trace_idx ||
1033 (data->trace_head == data->trace_tail &&
1034 data->trace_head_idx == data->trace_tail_idx))
1035 return NULL;
1036
1037 if (!iter->next_page[cpu]) {
1038 /* Initialize the iterator for this cpu trace buffer */
1039 WARN_ON(!data->trace_tail);
1040 page = virt_to_page(data->trace_tail);
1041 iter->next_page[cpu] = &page->lru;
1042 iter->next_page_idx[cpu] = data->trace_tail_idx;
1043 }
1044
1045 page = list_entry(iter->next_page[cpu], struct page, lru);
1046 BUG_ON(&data->trace_pages == &page->lru);
1047
1048 array = page_address(page);
1049
1050 WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
1051 return &array[iter->next_page_idx[cpu]];
1052}
1053
1054static struct trace_entry *
1055find_next_entry(struct trace_iterator *iter, int *ent_cpu)
1056{
1057 struct trace_array *tr = iter->tr;
1058 struct trace_entry *ent, *next = NULL;
1059 int next_cpu = -1;
1060 int cpu;
1061
1062 for_each_tracing_cpu(cpu) {
1063 if (!head_page(tr->data[cpu]))
1064 continue;
1065 ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
1066 /*
1067 * Pick the entry with the smallest timestamp:
1068 */
1069 if (ent && (!next || ent->t < next->t)) {
1070 next = ent;
1071 next_cpu = cpu;
1072 }
1073 }
1074
1075 if (ent_cpu)
1076 *ent_cpu = next_cpu;
1077
1078 return next;
1079}
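
find_next_entry() merges the per-CPU buffers by always picking the pending entry with the smallest timestamp, which is the usual k-way merge step. A small sketch over plain arrays standing in for two per-CPU buffers (the timestamps are made up):

#include <stdio.h>

#define NCPUS 2

/* pending timestamps per "cpu"; 0 terminates each list */
static unsigned long bufs[NCPUS][4] = {
	{ 10, 40, 70, 0 },
	{ 20, 30, 80, 0 },
};
static unsigned idx[NCPUS];

/* Return the cpu holding the smallest pending timestamp, or -1. */
static int next_cpu(void)
{
	unsigned long best = 0;
	int cpu, pick = -1;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		unsigned long t = bufs[cpu][idx[cpu]];

		if (t && (pick < 0 || t < best)) {
			best = t;
			pick = cpu;
		}
	}
	return pick;
}

int main(void)
{
	int cpu;

	while ((cpu = next_cpu()) >= 0)
		printf("cpu%d: %lu\n", cpu, bufs[cpu][idx[cpu]++]);
	return 0;
}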
1080
1081static void trace_iterator_increment(struct trace_iterator *iter)
1082{
1083 iter->idx++;
1084 iter->next_idx[iter->cpu]++;
1085 iter->next_page_idx[iter->cpu]++;
1086
1087 if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
1088 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1089
1090 iter->next_page_idx[iter->cpu] = 0;
1091 iter->next_page[iter->cpu] =
1092 trace_next_list(data, iter->next_page[iter->cpu]);
1093 }
1094}
1095
1096static void trace_consume(struct trace_iterator *iter)
1097{
1098 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1099
1100 data->trace_tail_idx++;
1101 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
1102 data->trace_tail = trace_next_page(data, data->trace_tail);
1103 data->trace_tail_idx = 0;
1104 }
1105
 1106	 /* If we emptied the buffer, reset the index */
1107 if (data->trace_head == data->trace_tail &&
1108 data->trace_head_idx == data->trace_tail_idx)
1109 data->trace_idx = 0;
1110}
1111
1112static void *find_next_entry_inc(struct trace_iterator *iter)
1113{
1114 struct trace_entry *next;
1115 int next_cpu = -1;
1116
1117 next = find_next_entry(iter, &next_cpu);
1118
1119 iter->prev_ent = iter->ent;
1120 iter->prev_cpu = iter->cpu;
1121
1122 iter->ent = next;
1123 iter->cpu = next_cpu;
1124
1125 if (next)
1126 trace_iterator_increment(iter);
1127
1128 return next ? iter : NULL;
1129}
1130
1131static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1132{
1133 struct trace_iterator *iter = m->private;
1134 void *last_ent = iter->ent;
1135 int i = (int)*pos;
1136 void *ent;
1137
1138 (*pos)++;
1139
1140 /* can't go backwards */
1141 if (iter->idx > i)
1142 return NULL;
1143
1144 if (iter->idx < 0)
1145 ent = find_next_entry_inc(iter);
1146 else
1147 ent = iter;
1148
1149 while (ent && iter->idx < i)
1150 ent = find_next_entry_inc(iter);
1151
1152 iter->pos = *pos;
1153
1154 if (last_ent && !ent)
1155 seq_puts(m, "\n\nvim:ft=help\n");
1156
1157 return ent;
1158}
1159
1160static void *s_start(struct seq_file *m, loff_t *pos)
1161{
1162 struct trace_iterator *iter = m->private;
1163 void *p = NULL;
1164 loff_t l = 0;
1165 int i;
1166
1167 mutex_lock(&trace_types_lock);
1168
1169 if (!current_trace || current_trace != iter->trace) {
1170 mutex_unlock(&trace_types_lock);
1171 return NULL;
1172 }
1173
1174 atomic_inc(&trace_record_cmdline_disabled);
1175
1176 /* let the tracer grab locks here if needed */
1177 if (current_trace->start)
1178 current_trace->start(iter);
1179
1180 if (*pos != iter->pos) {
1181 iter->ent = NULL;
1182 iter->cpu = 0;
1183 iter->idx = -1;
1184 iter->prev_ent = NULL;
1185 iter->prev_cpu = -1;
1186
1187 for_each_tracing_cpu(i) {
1188 iter->next_idx[i] = 0;
1189 iter->next_page[i] = NULL;
1190 }
1191
1192 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1193 ;
1194
1195 } else {
1196 l = *pos - 1;
1197 p = s_next(m, p, &l);
1198 }
1199
1200 return p;
1201}
1202
1203static void s_stop(struct seq_file *m, void *p)
1204{
1205 struct trace_iterator *iter = m->private;
1206
1207 atomic_dec(&trace_record_cmdline_disabled);
1208
1209 /* let the tracer release locks here if needed */
1210 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1211 iter->trace->stop(iter);
1212
1213 mutex_unlock(&trace_types_lock);
1214}
1215
1216static int
1217seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1218{
1219#ifdef CONFIG_KALLSYMS
1220 char str[KSYM_SYMBOL_LEN];
1221
1222 kallsyms_lookup(address, NULL, NULL, NULL, str);
1223
1224 return trace_seq_printf(s, fmt, str);
1225#endif
1226 return 1;
1227}
1228
1229static int
1230seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1231 unsigned long address)
1232{
1233#ifdef CONFIG_KALLSYMS
1234 char str[KSYM_SYMBOL_LEN];
1235
1236 sprint_symbol(str, address);
1237 return trace_seq_printf(s, fmt, str);
1238#endif
1239 return 1;
1240}
1241
1242#ifndef CONFIG_64BIT
1243# define IP_FMT "%08lx"
1244#else
1245# define IP_FMT "%016lx"
1246#endif
1247
1248static int
1249seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1250{
1251 int ret;
1252
1253 if (!ip)
1254 return trace_seq_printf(s, "0");
1255
1256 if (sym_flags & TRACE_ITER_SYM_OFFSET)
1257 ret = seq_print_sym_offset(s, "%s", ip);
1258 else
1259 ret = seq_print_sym_short(s, "%s", ip);
1260
1261 if (!ret)
1262 return 0;
1263
1264 if (sym_flags & TRACE_ITER_SYM_ADDR)
1265 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1266 return ret;
1267}
1268
1269static void print_lat_help_header(struct seq_file *m)
1270{
1271 seq_puts(m, "# _------=> CPU# \n");
1272 seq_puts(m, "# / _-----=> irqs-off \n");
1273 seq_puts(m, "# | / _----=> need-resched \n");
1274 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1275 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1276 seq_puts(m, "# |||| / \n");
1277 seq_puts(m, "# ||||| delay \n");
1278 seq_puts(m, "# cmd pid ||||| time | caller \n");
1279 seq_puts(m, "# \\ / ||||| \\ | / \n");
1280}
1281
1282static void print_func_help_header(struct seq_file *m)
1283{
1284 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1285 seq_puts(m, "# | | | | |\n");
1286}
1287
1288
1289static void
1290print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1291{
1292 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1293 struct trace_array *tr = iter->tr;
1294 struct trace_array_cpu *data = tr->data[tr->cpu];
1295 struct tracer *type = current_trace;
1296 unsigned long total = 0;
1297 unsigned long entries = 0;
1298 int cpu;
1299 const char *name = "preemption";
1300
1301 if (type)
1302 name = type->name;
1303
1304 for_each_tracing_cpu(cpu) {
1305 if (head_page(tr->data[cpu])) {
1306 total += tr->data[cpu]->trace_idx;
1307 if (tr->data[cpu]->trace_idx > tr->entries)
1308 entries += tr->entries;
1309 else
1310 entries += tr->data[cpu]->trace_idx;
1311 }
1312 }
1313
1314 seq_printf(m, "%s latency trace v1.1.5 on %s\n",
1315 name, UTS_RELEASE);
1316 seq_puts(m, "-----------------------------------"
1317 "---------------------------------\n");
1318 seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
1319 " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
1320 nsecs_to_usecs(data->saved_latency),
1321 entries,
1322 total,
1323 tr->cpu,
1324#if defined(CONFIG_PREEMPT_NONE)
1325 "server",
1326#elif defined(CONFIG_PREEMPT_VOLUNTARY)
1327 "desktop",
1328#elif defined(CONFIG_PREEMPT_DESKTOP)
1329 "preempt",
1330#else
1331 "unknown",
1332#endif
1333 /* These are reserved for later use */
1334 0, 0, 0, 0);
1335#ifdef CONFIG_SMP
1336 seq_printf(m, " #P:%d)\n", num_online_cpus());
1337#else
1338 seq_puts(m, ")\n");
1339#endif
1340 seq_puts(m, " -----------------\n");
1341 seq_printf(m, " | task: %.16s-%d "
1342 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
1343 data->comm, data->pid, data->uid, data->nice,
1344 data->policy, data->rt_priority);
1345 seq_puts(m, " -----------------\n");
1346
1347 if (data->critical_start) {
1348 seq_puts(m, " => started at: ");
1349 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
1350 trace_print_seq(m, &iter->seq);
1351 seq_puts(m, "\n => ended at: ");
1352 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1353 trace_print_seq(m, &iter->seq);
1354 seq_puts(m, "\n");
1355 }
1356
1357 seq_puts(m, "\n");
1358}
1359
1360static void
1361lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1362{
1363 int hardirq, softirq;
1364 char *comm;
1365
1366 comm = trace_find_cmdline(entry->pid);
1367
1368 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1369 trace_seq_printf(s, "%d", cpu);
1370 trace_seq_printf(s, "%c%c",
1371 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
1372 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1373
1374 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
1375 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
1376 if (hardirq && softirq) {
1377 trace_seq_putc(s, 'H');
1378 } else {
1379 if (hardirq) {
1380 trace_seq_putc(s, 'h');
1381 } else {
1382 if (softirq)
1383 trace_seq_putc(s, 's');
1384 else
1385 trace_seq_putc(s, '.');
1386 }
1387 }
1388
1389 if (entry->preempt_count)
1390 trace_seq_printf(s, "%x", entry->preempt_count);
1391 else
1392 trace_seq_puts(s, ".");
1393}
1394
1395unsigned long preempt_mark_thresh = 100;
1396
1397static void
1398lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
1399 unsigned long rel_usecs)
1400{
1401 trace_seq_printf(s, " %4lldus", abs_usecs);
1402 if (rel_usecs > preempt_mark_thresh)
1403 trace_seq_puts(s, "!: ");
1404 else if (rel_usecs > 1)
1405 trace_seq_puts(s, "+: ");
1406 else
1407 trace_seq_puts(s, " : ");
1408}
1409
1410static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1411
1412static int
1413print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1414{
1415 struct trace_seq *s = &iter->seq;
1416 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1417 struct trace_entry *next_entry = find_next_entry(iter, NULL);
1418 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1419 struct trace_entry *entry = iter->ent;
1420 unsigned long abs_usecs;
1421 unsigned long rel_usecs;
1422 char *comm;
1423 int S, T;
1424 int i;
1425 unsigned state;
1426
1427 if (!next_entry)
1428 next_entry = entry;
1429 rel_usecs = ns2usecs(next_entry->t - entry->t);
1430 abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
1431
1432 if (verbose) {
1433 comm = trace_find_cmdline(entry->pid);
1434 trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
1435 " %ld.%03ldms (+%ld.%03ldms): ",
1436 comm,
1437 entry->pid, cpu, entry->flags,
1438 entry->preempt_count, trace_idx,
1439 ns2usecs(entry->t),
1440 abs_usecs/1000,
1441 abs_usecs % 1000, rel_usecs/1000,
1442 rel_usecs % 1000);
1443 } else {
1444 lat_print_generic(s, entry, cpu);
1445 lat_print_timestamp(s, abs_usecs, rel_usecs);
1446 }
1447 switch (entry->type) {
1448 case TRACE_FN:
1449 seq_print_ip_sym(s, entry->fn.ip, sym_flags);
1450 trace_seq_puts(s, " (");
1451 seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
1452 trace_seq_puts(s, ")\n");
1453 break;
1454 case TRACE_CTX:
1455 case TRACE_WAKE:
1456 T = entry->ctx.next_state < sizeof(state_to_char) ?
1457 state_to_char[entry->ctx.next_state] : 'X';
1458
1459 state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
1460 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1461 comm = trace_find_cmdline(entry->ctx.next_pid);
1462 trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
1463 entry->ctx.prev_pid,
1464 entry->ctx.prev_prio,
1465 S, entry->type == TRACE_CTX ? "==>" : " +",
1466 entry->ctx.next_pid,
1467 entry->ctx.next_prio,
1468 T, comm);
1469 break;
1470 case TRACE_SPECIAL:
1471 trace_seq_printf(s, "# %ld %ld %ld\n",
1472 entry->special.arg1,
1473 entry->special.arg2,
1474 entry->special.arg3);
1475 break;
1476 case TRACE_STACK:
1477 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1478 if (i)
1479 trace_seq_puts(s, " <= ");
1480 seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
1481 }
1482 trace_seq_puts(s, "\n");
1483 break;
1484 default:
1485 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1486 }
1487 return 1;
1488}
1489
1490static int print_trace_fmt(struct trace_iterator *iter)
1491{
1492 struct trace_seq *s = &iter->seq;
1493 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1494 struct trace_entry *entry;
1495 unsigned long usec_rem;
1496 unsigned long long t;
1497 unsigned long secs;
1498 char *comm;
1499 int ret;
1500 int S, T;
1501 int i;
1502
1503 entry = iter->ent;
1504
1505 comm = trace_find_cmdline(iter->ent->pid);
1506
1507 t = ns2usecs(entry->t);
1508 usec_rem = do_div(t, 1000000ULL);
1509 secs = (unsigned long)t;
1510
1511 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1512 if (!ret)
1513 return 0;
1514 ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
1515 if (!ret)
1516 return 0;
1517 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1518 if (!ret)
1519 return 0;
1520
1521 switch (entry->type) {
1522 case TRACE_FN:
1523 ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
1524 if (!ret)
1525 return 0;
1526 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
1527 entry->fn.parent_ip) {
1528 ret = trace_seq_printf(s, " <-");
1529 if (!ret)
1530 return 0;
1531 ret = seq_print_ip_sym(s, entry->fn.parent_ip,
1532 sym_flags);
1533 if (!ret)
1534 return 0;
1535 }
1536 ret = trace_seq_printf(s, "\n");
1537 if (!ret)
1538 return 0;
1539 break;
1540 case TRACE_CTX:
1541 case TRACE_WAKE:
1542 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1543 state_to_char[entry->ctx.prev_state] : 'X';
1544 T = entry->ctx.next_state < sizeof(state_to_char) ?
1545 state_to_char[entry->ctx.next_state] : 'X';
1546 ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
1547 entry->ctx.prev_pid,
1548 entry->ctx.prev_prio,
1549 S,
1550 entry->type == TRACE_CTX ? "==>" : " +",
1551 entry->ctx.next_pid,
1552 entry->ctx.next_prio,
1553 T);
1554 if (!ret)
1555 return 0;
1556 break;
1557 case TRACE_SPECIAL:
1558 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1559 entry->special.arg1,
1560 entry->special.arg2,
1561 entry->special.arg3);
1562 if (!ret)
1563 return 0;
1564 break;
1565 case TRACE_STACK:
1566 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1567 if (i) {
1568 ret = trace_seq_puts(s, " <= ");
1569 if (!ret)
1570 return 0;
1571 }
1572 ret = seq_print_ip_sym(s, entry->stack.caller[i],
1573 sym_flags);
1574 if (!ret)
1575 return 0;
1576 }
1577 ret = trace_seq_puts(s, "\n");
1578 if (!ret)
1579 return 0;
1580 break;
1581 }
1582 return 1;
1583}
1584
1585static int print_raw_fmt(struct trace_iterator *iter)
1586{
1587 struct trace_seq *s = &iter->seq;
1588 struct trace_entry *entry;
1589 int ret;
1590 int S, T;
1591
1592 entry = iter->ent;
1593
1594 ret = trace_seq_printf(s, "%d %d %llu ",
1595 entry->pid, iter->cpu, entry->t);
1596 if (!ret)
1597 return 0;
1598
1599 switch (entry->type) {
1600 case TRACE_FN:
 1601	ret = trace_seq_printf(s, "%lx %lx\n",
1602 entry->fn.ip, entry->fn.parent_ip);
1603 if (!ret)
1604 return 0;
1605 break;
1606 case TRACE_CTX:
1607 case TRACE_WAKE:
1608 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1609 state_to_char[entry->ctx.prev_state] : 'X';
1610 T = entry->ctx.next_state < sizeof(state_to_char) ?
1611 state_to_char[entry->ctx.next_state] : 'X';
1612 if (entry->type == TRACE_WAKE)
1613 S = '+';
1614 ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
1615 entry->ctx.prev_pid,
1616 entry->ctx.prev_prio,
1617 S,
1618 entry->ctx.next_pid,
1619 entry->ctx.next_prio,
1620 T);
1621 if (!ret)
1622 return 0;
1623 break;
1624 case TRACE_SPECIAL:
1625 case TRACE_STACK:
1626 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1627 entry->special.arg1,
1628 entry->special.arg2,
1629 entry->special.arg3);
1630 if (!ret)
1631 return 0;
1632 break;
1633 }
1634 return 1;
1635}
1636
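/* Emit a raw or hex-encoded field; bail out of the calling printer with 0 if the trace_seq buffer is full. */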
1637#define SEQ_PUT_FIELD_RET(s, x) \
1638do { \
1639 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
1640 return 0; \
1641} while (0)
1642
1643#define SEQ_PUT_HEX_FIELD_RET(s, x) \
1644do { \
1645 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
1646 return 0; \
1647} while (0)
1648
1649static int print_hex_fmt(struct trace_iterator *iter)
1650{
1651 struct trace_seq *s = &iter->seq;
1652 unsigned char newline = '\n';
1653 struct trace_entry *entry;
1654 int S, T;
1655
1656 entry = iter->ent;
1657
1658 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
1659 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
1660 SEQ_PUT_HEX_FIELD_RET(s, entry->t);
1661
1662 switch (entry->type) {
1663 case TRACE_FN:
1664 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
1665 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
1666 break;
1667 case TRACE_CTX:
1668 case TRACE_WAKE:
1669 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1670 state_to_char[entry->ctx.prev_state] : 'X';
1671 T = entry->ctx.next_state < sizeof(state_to_char) ?
1672 state_to_char[entry->ctx.next_state] : 'X';
1673 if (entry->type == TRACE_WAKE)
1674 S = '+';
1675 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
1676 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
1677 SEQ_PUT_HEX_FIELD_RET(s, S);
1678 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
1679 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
1680 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
1681 SEQ_PUT_HEX_FIELD_RET(s, T);
1682 break;
1683 case TRACE_SPECIAL:
1684 case TRACE_STACK:
1685 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
1686 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
1687 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
1688 break;
1689 }
1690 SEQ_PUT_FIELD_RET(s, newline);
1691
1692 return 1;
1693}
1694
1695static int print_bin_fmt(struct trace_iterator *iter)
1696{
1697 struct trace_seq *s = &iter->seq;
1698 struct trace_entry *entry;
1699
1700 entry = iter->ent;
1701
1702 SEQ_PUT_FIELD_RET(s, entry->pid);
1703 SEQ_PUT_FIELD_RET(s, entry->cpu);
1704 SEQ_PUT_FIELD_RET(s, entry->t);
1705
1706 switch (entry->type) {
1707 case TRACE_FN:
1708 SEQ_PUT_FIELD_RET(s, entry->fn.ip);
1709 SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
1710 break;
1711 case TRACE_CTX:
1712 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
1713 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
1714 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
1715 SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
1716 SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
1717 SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
1718 break;
1719 case TRACE_SPECIAL:
1720 case TRACE_STACK:
1721 SEQ_PUT_FIELD_RET(s, entry->special.arg1);
1722 SEQ_PUT_FIELD_RET(s, entry->special.arg2);
1723 SEQ_PUT_FIELD_RET(s, entry->special.arg3);
1724 break;
1725 }
1726 return 1;
1727}
1728
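/* Return 1 if every per-CPU buffer has been fully consumed, 0 otherwise. */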
1729static int trace_empty(struct trace_iterator *iter)
1730{
1731 struct trace_array_cpu *data;
1732 int cpu;
1733
1734 for_each_tracing_cpu(cpu) {
1735 data = iter->tr->data[cpu];
1736
1737 if (head_page(data) && data->trace_idx &&
1738 (data->trace_tail != data->trace_head ||
1739 data->trace_tail_idx != data->trace_head_idx))
1740 return 0;
1741 }
1742 return 1;
1743}
1744
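/* Format a single entry, preferring the tracer's own print_line and falling back to the bin/hex/raw/latency/default formats. */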
1745static int print_trace_line(struct trace_iterator *iter)
1746{
1747 if (iter->trace && iter->trace->print_line)
1748 return iter->trace->print_line(iter);
1749
1750 if (trace_flags & TRACE_ITER_BIN)
1751 return print_bin_fmt(iter);
1752
1753 if (trace_flags & TRACE_ITER_HEX)
1754 return print_hex_fmt(iter);
1755
1756 if (trace_flags & TRACE_ITER_RAW)
1757 return print_raw_fmt(iter);
1758
1759 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
1760 return print_lat_fmt(iter, iter->idx, iter->cpu);
1761
1762 return print_trace_fmt(iter);
1763}
1764
1765static int s_show(struct seq_file *m, void *v)
1766{
1767 struct trace_iterator *iter = v;
1768
1769 if (iter->ent == NULL) {
1770 if (iter->tr) {
1771 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1772 seq_puts(m, "#\n");
1773 }
1774 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1775 /* print nothing if the buffers are empty */
1776 if (trace_empty(iter))
1777 return 0;
1778 print_trace_header(m, iter);
1779 if (!(trace_flags & TRACE_ITER_VERBOSE))
1780 print_lat_help_header(m);
1781 } else {
1782 if (!(trace_flags & TRACE_ITER_VERBOSE))
1783 print_func_help_header(m);
1784 }
1785 } else {
1786 print_trace_line(iter);
1787 trace_print_seq(m, &iter->seq);
1788 }
1789
1790 return 0;
1791}
1792
1793static struct seq_operations tracer_seq_ops = {
1794 .start = s_start,
1795 .next = s_next,
1796 .stop = s_stop,
1797 .show = s_show,
1798};
1799
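/* Common open path for the trace and latency_trace files: allocate an iterator and pause tracing while the buffer is read. */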
1800static struct trace_iterator *
1801__tracing_open(struct inode *inode, struct file *file, int *ret)
1802{
1803 struct trace_iterator *iter;
1804
1805 if (tracing_disabled) {
1806 *ret = -ENODEV;
1807 return NULL;
1808 }
1809
1810 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
1811 if (!iter) {
1812 *ret = -ENOMEM;
1813 goto out;
1814 }
1815
1816 mutex_lock(&trace_types_lock);
1817 if (current_trace && current_trace->print_max)
1818 iter->tr = &max_tr;
1819 else
1820 iter->tr = inode->i_private;
1821 iter->trace = current_trace;
1822 iter->pos = -1;
1823
1824 /* TODO stop tracer */
1825 *ret = seq_open(file, &tracer_seq_ops);
1826 if (!*ret) {
1827 struct seq_file *m = file->private_data;
1828 m->private = iter;
1829
1830 /* stop the trace while dumping */
1831 if (iter->tr->ctrl)
1832 tracer_enabled = 0;
1833
1834 if (iter->trace && iter->trace->open)
1835 iter->trace->open(iter);
1836 } else {
1837 kfree(iter);
1838 iter = NULL;
1839 }
1840 mutex_unlock(&trace_types_lock);
1841
1842 out:
1843 return iter;
1844}
1845
1846int tracing_open_generic(struct inode *inode, struct file *filp)
1847{
1848 if (tracing_disabled)
1849 return -ENODEV;
1850
1851 filp->private_data = inode->i_private;
1852 return 0;
1853}
1854
1855int tracing_release(struct inode *inode, struct file *file)
1856{
1857 struct seq_file *m = (struct seq_file *)file->private_data;
1858 struct trace_iterator *iter = m->private;
1859
1860 mutex_lock(&trace_types_lock);
1861 if (iter->trace && iter->trace->close)
1862 iter->trace->close(iter);
1863
1864 /* reenable tracing if it was previously enabled */
1865 if (iter->tr->ctrl)
1866 tracer_enabled = 1;
1867 mutex_unlock(&trace_types_lock);
1868
1869 seq_release(inode, file);
1870 kfree(iter);
1871 return 0;
1872}
1873
1874static int tracing_open(struct inode *inode, struct file *file)
1875{
1876 int ret;
1877
1878 __tracing_open(inode, file, &ret);
1879
1880 return ret;
1881}
1882
1883static int tracing_lt_open(struct inode *inode, struct file *file)
1884{
1885 struct trace_iterator *iter;
1886 int ret;
1887
1888 iter = __tracing_open(inode, file, &ret);
1889
1890 if (!ret)
1891 iter->iter_flags |= TRACE_FILE_LAT_FMT;
1892
1893 return ret;
1894}
1895
1896
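/* seq_file iteration over the list of registered tracers, backing the available_tracers file. */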
1897static void *
1898t_next(struct seq_file *m, void *v, loff_t *pos)
1899{
1900 struct tracer *t = m->private;
1901
1902 (*pos)++;
1903
1904 if (t)
1905 t = t->next;
1906
1907 m->private = t;
1908
1909 return t;
1910}
1911
1912static void *t_start(struct seq_file *m, loff_t *pos)
1913{
1914 struct tracer *t = m->private;
1915 loff_t l = 0;
1916
1917 mutex_lock(&trace_types_lock);
1918 for (; t && l < *pos; t = t_next(m, t, &l))
1919 ;
1920
1921 return t;
1922}
1923
1924static void t_stop(struct seq_file *m, void *p)
1925{
1926 mutex_unlock(&trace_types_lock);
1927}
1928
1929static int t_show(struct seq_file *m, void *v)
1930{
1931 struct tracer *t = v;
1932
1933 if (!t)
1934 return 0;
1935
1936 seq_printf(m, "%s", t->name);
1937 if (t->next)
1938 seq_putc(m, ' ');
1939 else
1940 seq_putc(m, '\n');
1941
1942 return 0;
1943}
1944
1945static struct seq_operations show_traces_seq_ops = {
1946 .start = t_start,
1947 .next = t_next,
1948 .stop = t_stop,
1949 .show = t_show,
1950};
1951
1952static int show_traces_open(struct inode *inode, struct file *file)
1953{
1954 int ret;
1955
1956 if (tracing_disabled)
1957 return -ENODEV;
1958
1959 ret = seq_open(file, &show_traces_seq_ops);
1960 if (!ret) {
1961 struct seq_file *m = file->private_data;
1962 m->private = trace_types;
1963 }
1964
1965 return ret;
1966}
1967
1968static struct file_operations tracing_fops = {
1969 .open = tracing_open,
1970 .read = seq_read,
1971 .llseek = seq_lseek,
1972 .release = tracing_release,
1973};
1974
1975static struct file_operations tracing_lt_fops = {
1976 .open = tracing_lt_open,
1977 .read = seq_read,
1978 .llseek = seq_lseek,
1979 .release = tracing_release,
1980};
1981
1982static struct file_operations show_traces_fops = {
1983 .open = show_traces_open,
1984 .read = seq_read,
1985 .release = seq_release,
1986};
1987
1988/*
1989 * Only trace on a CPU if the bitmask is set:
1990 */
1991static cpumask_t tracing_cpumask = CPU_MASK_ALL;
1992
1993/*
1994 * When tracing/tracing_cpu_mask is modified then this holds
1995 * the new bitmask we are about to install:
1996 */
1997static cpumask_t tracing_cpumask_new;
1998
1999/*
 2000 * The tracer itself will not take this lock, but we still want
2001 * to provide a consistent cpumask to user-space:
2002 */
2003static DEFINE_MUTEX(tracing_cpumask_update_lock);
2004
2005/*
2006 * Temporary storage for the character representation of the
2007 * CPU bitmask (and one more byte for the newline):
2008 */
2009static char mask_str[NR_CPUS + 1];
2010
2011static ssize_t
2012tracing_cpumask_read(struct file *filp, char __user *ubuf,
2013 size_t count, loff_t *ppos)
2014{
2015 int len;
2016
2017 mutex_lock(&tracing_cpumask_update_lock);
2018
2019 len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
2020 if (count - len < 2) {
2021 count = -EINVAL;
2022 goto out_err;
2023 }
2024 len += sprintf(mask_str + len, "\n");
2025 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
2026
2027out_err:
2028 mutex_unlock(&tracing_cpumask_update_lock);
2029
2030 return count;
2031}
2032
2033static ssize_t
2034tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2035 size_t count, loff_t *ppos)
2036{
2037 int err, cpu;
2038
2039 mutex_lock(&tracing_cpumask_update_lock);
2040 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2041 if (err)
2042 goto err_unlock;
2043
2044 raw_local_irq_disable();
2045 __raw_spin_lock(&ftrace_max_lock);
2046 for_each_tracing_cpu(cpu) {
2047 /*
2048 * Increase/decrease the disabled counter if we are
2049 * about to flip a bit in the cpumask:
2050 */
2051 if (cpu_isset(cpu, tracing_cpumask) &&
2052 !cpu_isset(cpu, tracing_cpumask_new)) {
2053 atomic_inc(&global_trace.data[cpu]->disabled);
2054 }
2055 if (!cpu_isset(cpu, tracing_cpumask) &&
2056 cpu_isset(cpu, tracing_cpumask_new)) {
2057 atomic_dec(&global_trace.data[cpu]->disabled);
2058 }
2059 }
2060 __raw_spin_unlock(&ftrace_max_lock);
2061 raw_local_irq_enable();
2062
2063 tracing_cpumask = tracing_cpumask_new;
2064
2065 mutex_unlock(&tracing_cpumask_update_lock);
2066
2067 return count;
2068
2069err_unlock:
2070 mutex_unlock(&tracing_cpumask_update_lock);
2071
2072 return err;
2073}
2074
2075static struct file_operations tracing_cpumask_fops = {
2076 .open = tracing_open_generic,
2077 .read = tracing_cpumask_read,
2078 .write = tracing_cpumask_write,
2079};
2080
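/* Show the current trace output options; options that are off are listed with a "no" prefix. */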
2081static ssize_t
2082tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2083 size_t cnt, loff_t *ppos)
2084{
2085 char *buf;
2086 int r = 0;
2087 int len = 0;
2088 int i;
2089
 2090	/* calculate max size */
2091 for (i = 0; trace_options[i]; i++) {
2092 len += strlen(trace_options[i]);
2093 len += 3; /* "no" and space */
2094 }
2095
2096 /* +2 for \n and \0 */
2097 buf = kmalloc(len + 2, GFP_KERNEL);
2098 if (!buf)
2099 return -ENOMEM;
2100
2101 for (i = 0; trace_options[i]; i++) {
2102 if (trace_flags & (1 << i))
2103 r += sprintf(buf + r, "%s ", trace_options[i]);
2104 else
2105 r += sprintf(buf + r, "no%s ", trace_options[i]);
2106 }
2107
2108 r += sprintf(buf + r, "\n");
2109 WARN_ON(r >= len + 2);
2110
2111 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2112
2113 kfree(buf);
2114
2115 return r;
2116}
2117
2118static ssize_t
2119tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2120 size_t cnt, loff_t *ppos)
2121{
2122 char buf[64];
2123 char *cmp = buf;
2124 int neg = 0;
2125 int i;
2126
2127 if (cnt >= sizeof(buf))
2128 return -EINVAL;
2129
2130 if (copy_from_user(&buf, ubuf, cnt))
2131 return -EFAULT;
2132
2133 buf[cnt] = 0;
2134
2135 if (strncmp(buf, "no", 2) == 0) {
2136 neg = 1;
2137 cmp += 2;
2138 }
2139
2140 for (i = 0; trace_options[i]; i++) {
2141 int len = strlen(trace_options[i]);
2142
2143 if (strncmp(cmp, trace_options[i], len) == 0) {
2144 if (neg)
2145 trace_flags &= ~(1 << i);
2146 else
2147 trace_flags |= (1 << i);
2148 break;
2149 }
2150 }
2151 /*
2152 * If no option could be set, return an error:
2153 */
2154 if (!trace_options[i])
2155 return -EINVAL;
2156
2157 filp->f_pos += cnt;
2158
2159 return cnt;
2160}
2161
2162static struct file_operations tracing_iter_fops = {
2163 .open = tracing_open_generic,
2164 .read = tracing_iter_ctrl_read,
2165 .write = tracing_iter_ctrl_write,
2166};
2167
2168static const char readme_msg[] =
2169 "tracing mini-HOWTO:\n\n"
2170 "# mkdir /debug\n"
2171 "# mount -t debugfs nodev /debug\n\n"
2172 "# cat /debug/tracing/available_tracers\n"
2173 "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
2174 "# cat /debug/tracing/current_tracer\n"
2175 "none\n"
2176 "# echo sched_switch > /debug/tracing/current_tracer\n"
2177 "# cat /debug/tracing/current_tracer\n"
2178 "sched_switch\n"
2179 "# cat /debug/tracing/iter_ctrl\n"
2180 "noprint-parent nosym-offset nosym-addr noverbose\n"
2181 "# echo print-parent > /debug/tracing/iter_ctrl\n"
2182 "# echo 1 > /debug/tracing/tracing_enabled\n"
2183 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
 2184	"# echo 0 > /debug/tracing/tracing_enabled\n"
2185;
2186
2187static ssize_t
2188tracing_readme_read(struct file *filp, char __user *ubuf,
2189 size_t cnt, loff_t *ppos)
2190{
2191 return simple_read_from_buffer(ubuf, cnt, ppos,
2192 readme_msg, strlen(readme_msg));
2193}
2194
2195static struct file_operations tracing_readme_fops = {
2196 .open = tracing_open_generic,
2197 .read = tracing_readme_read,
2198};
2199
2200static ssize_t
2201tracing_ctrl_read(struct file *filp, char __user *ubuf,
2202 size_t cnt, loff_t *ppos)
2203{
2204 struct trace_array *tr = filp->private_data;
2205 char buf[64];
2206 int r;
2207
2208 r = sprintf(buf, "%ld\n", tr->ctrl);
2209 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2210}
2211
2212static ssize_t
2213tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2214 size_t cnt, loff_t *ppos)
2215{
2216 struct trace_array *tr = filp->private_data;
2217 char buf[64];
2218 long val;
2219 int ret;
2220
2221 if (cnt >= sizeof(buf))
2222 return -EINVAL;
2223
2224 if (copy_from_user(&buf, ubuf, cnt))
2225 return -EFAULT;
2226
2227 buf[cnt] = 0;
2228
2229 ret = strict_strtoul(buf, 10, &val);
2230 if (ret < 0)
2231 return ret;
2232
2233 val = !!val;
2234
2235 mutex_lock(&trace_types_lock);
2236 if (tr->ctrl ^ val) {
2237 if (val)
2238 tracer_enabled = 1;
2239 else
2240 tracer_enabled = 0;
2241
2242 tr->ctrl = val;
2243
2244 if (current_trace && current_trace->ctrl_update)
2245 current_trace->ctrl_update(tr);
2246 }
2247 mutex_unlock(&trace_types_lock);
2248
2249 filp->f_pos += cnt;
2250
2251 return cnt;
2252}
2253
2254static ssize_t
2255tracing_set_trace_read(struct file *filp, char __user *ubuf,
2256 size_t cnt, loff_t *ppos)
2257{
2258 char buf[max_tracer_type_len+2];
2259 int r;
2260
2261 mutex_lock(&trace_types_lock);
2262 if (current_trace)
2263 r = sprintf(buf, "%s\n", current_trace->name);
2264 else
2265 r = sprintf(buf, "\n");
2266 mutex_unlock(&trace_types_lock);
2267
2268 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2269}
2270
2271static ssize_t
2272tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2273 size_t cnt, loff_t *ppos)
2274{
2275 struct trace_array *tr = &global_trace;
2276 struct tracer *t;
2277 char buf[max_tracer_type_len+1];
2278 int i;
2279
2280 if (cnt > max_tracer_type_len)
2281 cnt = max_tracer_type_len;
2282
2283 if (copy_from_user(&buf, ubuf, cnt))
2284 return -EFAULT;
2285
2286 buf[cnt] = 0;
2287
2288 /* strip ending whitespace. */
2289 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2290 buf[i] = 0;
2291
2292 mutex_lock(&trace_types_lock);
2293 for (t = trace_types; t; t = t->next) {
2294 if (strcmp(t->name, buf) == 0)
2295 break;
2296 }
2297 if (!t || t == current_trace)
2298 goto out;
2299
2300 if (current_trace && current_trace->reset)
2301 current_trace->reset(tr);
2302
2303 current_trace = t;
2304 if (t->init)
2305 t->init(tr);
2306
2307 out:
2308 mutex_unlock(&trace_types_lock);
2309
2310 filp->f_pos += cnt;
2311
2312 return cnt;
2313}
2314
2315static ssize_t
2316tracing_max_lat_read(struct file *filp, char __user *ubuf,
2317 size_t cnt, loff_t *ppos)
2318{
2319 unsigned long *ptr = filp->private_data;
2320 char buf[64];
2321 int r;
2322
2323 r = snprintf(buf, sizeof(buf), "%ld\n",
2324 *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
2325 if (r > sizeof(buf))
2326 r = sizeof(buf);
2327 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2328}
2329
2330static ssize_t
2331tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2332 size_t cnt, loff_t *ppos)
2333{
2334 long *ptr = filp->private_data;
2335 char buf[64];
2336 long val;
2337 int ret;
2338
2339 if (cnt >= sizeof(buf))
2340 return -EINVAL;
2341
2342 if (copy_from_user(&buf, ubuf, cnt))
2343 return -EFAULT;
2344
2345 buf[cnt] = 0;
2346
2347 ret = strict_strtoul(buf, 10, &val);
2348 if (ret < 0)
2349 return ret;
2350
2351 *ptr = val * 1000;
2352
2353 return cnt;
2354}
2355
2356static atomic_t tracing_reader;
2357
2358static int tracing_open_pipe(struct inode *inode, struct file *filp)
2359{
2360 struct trace_iterator *iter;
2361
2362 if (tracing_disabled)
2363 return -ENODEV;
2364
 2365	/* We allow only one reader of the pipe */
2366 if (atomic_inc_return(&tracing_reader) != 1) {
2367 atomic_dec(&tracing_reader);
2368 return -EBUSY;
2369 }
2370
2371 /* create a buffer to store the information to pass to userspace */
2372 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2373 if (!iter)
2374 return -ENOMEM;
2375
2376 mutex_lock(&trace_types_lock);
2377 iter->tr = &global_trace;
2378 iter->trace = current_trace;
2379 filp->private_data = iter;
2380
2381 if (iter->trace->pipe_open)
2382 iter->trace->pipe_open(iter);
2383 mutex_unlock(&trace_types_lock);
2384
2385 return 0;
2386}
2387
2388static int tracing_release_pipe(struct inode *inode, struct file *file)
2389{
2390 struct trace_iterator *iter = file->private_data;
2391
2392 kfree(iter);
2393 atomic_dec(&tracing_reader);
2394
2395 return 0;
2396}
2397
2398static unsigned int
2399tracing_poll_pipe(struct file *filp, poll_table *poll_table)
2400{
2401 struct trace_iterator *iter = filp->private_data;
2402
2403 if (trace_flags & TRACE_ITER_BLOCK) {
2404 /*
2405 * Always select as readable when in blocking mode
2406 */
2407 return POLLIN | POLLRDNORM;
2408 } else {
2409 if (!trace_empty(iter))
2410 return POLLIN | POLLRDNORM;
2411 poll_wait(filp, &trace_wait, poll_table);
2412 if (!trace_empty(iter))
2413 return POLLIN | POLLRDNORM;
2414
2415 return 0;
2416 }
2417}
2418
2419/*
2420 * Consumer reader.
2421 */
2422static ssize_t
2423tracing_read_pipe(struct file *filp, char __user *ubuf,
2424 size_t cnt, loff_t *ppos)
2425{
2426 struct trace_iterator *iter = filp->private_data;
2427 struct trace_array_cpu *data;
2428 static cpumask_t mask;
2429 unsigned long flags;
2430#ifdef CONFIG_FTRACE
2431 int ftrace_save;
2432#endif
2433 int cpu;
2434 ssize_t sret;
2435
2436 /* return any leftover data */
2437 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2438 if (sret != -EBUSY)
2439 return sret;
2440 sret = 0;
2441
2442 trace_seq_reset(&iter->seq);
2443
2444 mutex_lock(&trace_types_lock);
2445 if (iter->trace->read) {
2446 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
2447 if (sret)
2448 goto out;
2449 }
2450
2451 while (trace_empty(iter)) {
2452
2453 if ((filp->f_flags & O_NONBLOCK)) {
2454 sret = -EAGAIN;
2455 goto out;
2456 }
2457
2458 /*
2459 * This is a make-shift waitqueue. The reason we don't use
2460 * an actual wait queue is because:
2461 * 1) we only ever have one waiter
 2462		 * 2) the tracer traces all functions, and we don't want
2463 * the overhead of calling wake_up and friends
2464 * (and tracing them too)
 2465		 * Anyway, this is a very primitive wakeup.
2466 */
2467 set_current_state(TASK_INTERRUPTIBLE);
2468 iter->tr->waiter = current;
2469
2470 mutex_unlock(&trace_types_lock);
2471
2472 /* sleep for 100 msecs, and try again. */
2473 schedule_timeout(HZ/10);
2474
2475 mutex_lock(&trace_types_lock);
2476
2477 iter->tr->waiter = NULL;
2478
2479 if (signal_pending(current)) {
2480 sret = -EINTR;
2481 goto out;
2482 }
2483
2484 if (iter->trace != current_trace)
2485 goto out;
2486
2487 /*
 2488		 * We block until we have read something and tracing is disabled.
 2489		 * We still block if tracing is disabled but we have never
 2490		 * read anything: this allows a user to cat this file and
 2491		 * then enable tracing. But after we have read something,
 2492		 * we give an EOF when tracing is disabled again.
2493 *
2494 * iter->pos will be 0 if we haven't read anything.
2495 */
2496 if (!tracer_enabled && iter->pos)
2497 break;
2498
2499 continue;
2500 }
2501
2502 /* stop when tracing is finished */
2503 if (trace_empty(iter))
2504 goto out;
2505
2506 if (cnt >= PAGE_SIZE)
2507 cnt = PAGE_SIZE - 1;
2508
2509 /* reset all but tr, trace, and overruns */
2510 memset(&iter->seq, 0,
2511 sizeof(struct trace_iterator) -
2512 offsetof(struct trace_iterator, seq));
2513 iter->pos = -1;
2514
2515 /*
 2516	 * We need to stop all tracing on all CPUs to read the
 2517	 * next buffer. This is a bit expensive, but is
 2518	 * not done often. We fill in all that we can read,
 2519	 * and then release the locks again.
2520 */
2521
2522 cpus_clear(mask);
2523 local_irq_save(flags);
2524#ifdef CONFIG_FTRACE
2525 ftrace_save = ftrace_enabled;
2526 ftrace_enabled = 0;
2527#endif
2528 smp_wmb();
2529 for_each_tracing_cpu(cpu) {
2530 data = iter->tr->data[cpu];
2531
2532 if (!head_page(data) || !data->trace_idx)
2533 continue;
2534
2535 atomic_inc(&data->disabled);
2536 cpu_set(cpu, mask);
2537 }
2538
2539 for_each_cpu_mask(cpu, mask) {
2540 data = iter->tr->data[cpu];
2541 __raw_spin_lock(&data->lock);
2542
2543 if (data->overrun > iter->last_overrun[cpu])
2544 iter->overrun[cpu] +=
2545 data->overrun - iter->last_overrun[cpu];
2546 iter->last_overrun[cpu] = data->overrun;
2547 }
2548
2549 while (find_next_entry_inc(iter) != NULL) {
2550 int ret;
2551 int len = iter->seq.len;
2552
2553 ret = print_trace_line(iter);
2554 if (!ret) {
2555 /* don't print partial lines */
2556 iter->seq.len = len;
2557 break;
2558 }
2559
2560 trace_consume(iter);
2561
2562 if (iter->seq.len >= cnt)
2563 break;
2564 }
2565
2566 for_each_cpu_mask(cpu, mask) {
2567 data = iter->tr->data[cpu];
2568 __raw_spin_unlock(&data->lock);
2569 }
2570
2571 for_each_cpu_mask(cpu, mask) {
2572 data = iter->tr->data[cpu];
2573 atomic_dec(&data->disabled);
2574 }
2575#ifdef CONFIG_FTRACE
2576 ftrace_enabled = ftrace_save;
2577#endif
2578 local_irq_restore(flags);
2579
2580 /* Now copy what we have to the user */
2581 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2582 if (iter->seq.readpos >= iter->seq.len)
2583 trace_seq_reset(&iter->seq);
2584 if (sret == -EBUSY)
2585 sret = 0;
2586
2587out:
2588 mutex_unlock(&trace_types_lock);
2589
2590 return sret;
2591}
2592
2593static ssize_t
2594tracing_entries_read(struct file *filp, char __user *ubuf,
2595 size_t cnt, loff_t *ppos)
2596{
2597 struct trace_array *tr = filp->private_data;
2598 char buf[64];
2599 int r;
2600
2601 r = sprintf(buf, "%lu\n", tr->entries);
2602 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2603}
2604
2605static ssize_t
2606tracing_entries_write(struct file *filp, const char __user *ubuf,
2607 size_t cnt, loff_t *ppos)
2608{
2609 unsigned long val;
2610 char buf[64];
2611 int ret;
2612
2613 if (cnt >= sizeof(buf))
2614 return -EINVAL;
2615
2616 if (copy_from_user(&buf, ubuf, cnt))
2617 return -EFAULT;
2618
2619 buf[cnt] = 0;
2620
2621 ret = strict_strtoul(buf, 10, &val);
2622 if (ret < 0)
2623 return ret;
2624
2625 /* must have at least 1 entry */
2626 if (!val)
2627 return -EINVAL;
2628
2629 mutex_lock(&trace_types_lock);
2630
2631 if (current_trace != &no_tracer) {
2632 cnt = -EBUSY;
2633 pr_info("ftrace: set current_tracer to none"
2634 " before modifying buffer size\n");
2635 goto out;
2636 }
2637
2638 if (val > global_trace.entries) {
2639 long pages_requested;
2640 unsigned long freeable_pages;
2641
2642 /* make sure we have enough memory before mapping */
2643 pages_requested =
2644 (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
2645
2646 /* account for each buffer (and max_tr) */
2647 pages_requested *= tracing_nr_buffers * 2;
2648
2649 /* Check for overflow */
2650 if (pages_requested < 0) {
2651 cnt = -ENOMEM;
2652 goto out;
2653 }
2654
2655 freeable_pages = determine_dirtyable_memory();
2656
 2657		/* we only allow a request of up to 1/4 of usable memory */
2658 if (pages_requested >
2659 ((freeable_pages + tracing_pages_allocated) / 4)) {
2660 cnt = -ENOMEM;
2661 goto out;
2662 }
2663
2664 while (global_trace.entries < val) {
2665 if (trace_alloc_page()) {
2666 cnt = -ENOMEM;
2667 goto out;
2668 }
2669 /* double check that we don't go over the known pages */
2670 if (tracing_pages_allocated > pages_requested)
2671 break;
2672 }
2673
2674 } else {
 2675		/* free pages until we are within one page's worth of entries of val */
2676 while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
2677 trace_free_page();
2678 }
2679
2680 filp->f_pos += cnt;
2681
2682 out:
2683 max_tr.entries = global_trace.entries;
2684 mutex_unlock(&trace_types_lock);
2685
2686 return cnt;
2687}
2688
2689static struct file_operations tracing_max_lat_fops = {
2690 .open = tracing_open_generic,
2691 .read = tracing_max_lat_read,
2692 .write = tracing_max_lat_write,
2693};
2694
2695static struct file_operations tracing_ctrl_fops = {
2696 .open = tracing_open_generic,
2697 .read = tracing_ctrl_read,
2698 .write = tracing_ctrl_write,
2699};
2700
2701static struct file_operations set_tracer_fops = {
2702 .open = tracing_open_generic,
2703 .read = tracing_set_trace_read,
2704 .write = tracing_set_trace_write,
2705};
2706
2707static struct file_operations tracing_pipe_fops = {
2708 .open = tracing_open_pipe,
2709 .poll = tracing_poll_pipe,
2710 .read = tracing_read_pipe,
2711 .release = tracing_release_pipe,
2712};
2713
2714static struct file_operations tracing_entries_fops = {
2715 .open = tracing_open_generic,
2716 .read = tracing_entries_read,
2717 .write = tracing_entries_write,
2718};
2719
2720#ifdef CONFIG_DYNAMIC_FTRACE
2721
2722static ssize_t
2723tracing_read_long(struct file *filp, char __user *ubuf,
2724 size_t cnt, loff_t *ppos)
2725{
2726 unsigned long *p = filp->private_data;
2727 char buf[64];
2728 int r;
2729
2730 r = sprintf(buf, "%ld\n", *p);
2731
2732 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2733}
2734
2735static struct file_operations tracing_read_long_fops = {
2736 .open = tracing_open_generic,
2737 .read = tracing_read_long,
2738};
2739#endif
2740
2741static struct dentry *d_tracer;
2742
2743struct dentry *tracing_init_dentry(void)
2744{
2745 static int once;
2746
2747 if (d_tracer)
2748 return d_tracer;
2749
2750 d_tracer = debugfs_create_dir("tracing", NULL);
2751
2752 if (!d_tracer && !once) {
2753 once = 1;
2754 pr_warning("Could not create debugfs directory 'tracing'\n");
2755 return NULL;
2756 }
2757
2758 return d_tracer;
2759}
2760
2761#ifdef CONFIG_FTRACE_SELFTEST
2762/* Let selftest have access to static functions in this file */
2763#include "trace_selftest.c"
2764#endif
2765
2766static __init void tracer_init_debugfs(void)
2767{
2768 struct dentry *d_tracer;
2769 struct dentry *entry;
2770
2771 d_tracer = tracing_init_dentry();
2772
2773 entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
2774 &global_trace, &tracing_ctrl_fops);
2775 if (!entry)
2776 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2777
2778 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
2779 NULL, &tracing_iter_fops);
2780 if (!entry)
2781 pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
2782
2783 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2784 NULL, &tracing_cpumask_fops);
2785 if (!entry)
2786 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
2787
2788 entry = debugfs_create_file("latency_trace", 0444, d_tracer,
2789 &global_trace, &tracing_lt_fops);
2790 if (!entry)
2791 pr_warning("Could not create debugfs 'latency_trace' entry\n");
2792
2793 entry = debugfs_create_file("trace", 0444, d_tracer,
2794 &global_trace, &tracing_fops);
2795 if (!entry)
2796 pr_warning("Could not create debugfs 'trace' entry\n");
2797
2798 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
2799 &global_trace, &show_traces_fops);
2800 if (!entry)
 2801		pr_warning("Could not create debugfs 'available_tracers' entry\n");
2802
2803 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
2804 &global_trace, &set_tracer_fops);
2805 if (!entry)
 2806		pr_warning("Could not create debugfs 'current_tracer' entry\n");
2807
2808 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
2809 &tracing_max_latency,
2810 &tracing_max_lat_fops);
2811 if (!entry)
2812 pr_warning("Could not create debugfs "
2813 "'tracing_max_latency' entry\n");
2814
2815 entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
2816 &tracing_thresh, &tracing_max_lat_fops);
2817 if (!entry)
2818 pr_warning("Could not create debugfs "
 2819			   "'tracing_thresh' entry\n");
2820 entry = debugfs_create_file("README", 0644, d_tracer,
2821 NULL, &tracing_readme_fops);
2822 if (!entry)
2823 pr_warning("Could not create debugfs 'README' entry\n");
2824
2825 entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
2826 NULL, &tracing_pipe_fops);
2827 if (!entry)
2828 pr_warning("Could not create debugfs "
 2829			   "'trace_pipe' entry\n");
2830
2831 entry = debugfs_create_file("trace_entries", 0644, d_tracer,
2832 &global_trace, &tracing_entries_fops);
2833 if (!entry)
2834 pr_warning("Could not create debugfs "
 2835			   "'trace_entries' entry\n");
2836
2837#ifdef CONFIG_DYNAMIC_FTRACE
2838 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2839 &ftrace_update_tot_cnt,
2840 &tracing_read_long_fops);
2841 if (!entry)
2842 pr_warning("Could not create debugfs "
2843 "'dyn_ftrace_total_info' entry\n");
2844#endif
2845}
2846
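/* Grow each per-CPU buffer (and max_tr when configured) by one page. */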
2847static int trace_alloc_page(void)
2848{
2849 struct trace_array_cpu *data;
2850 struct page *page, *tmp;
2851 LIST_HEAD(pages);
2852 void *array;
2853 unsigned pages_allocated = 0;
2854 int i;
2855
2856 /* first allocate a page for each CPU */
2857 for_each_tracing_cpu(i) {
2858 array = (void *)__get_free_page(GFP_KERNEL);
2859 if (array == NULL) {
 2860			printk(KERN_ERR "tracer: failed to allocate page "
2861 "for trace buffer!\n");
2862 goto free_pages;
2863 }
2864
2865 pages_allocated++;
2866 page = virt_to_page(array);
2867 list_add(&page->lru, &pages);
2868
2869/* Only allocate if we are actually using the max trace */
2870#ifdef CONFIG_TRACER_MAX_TRACE
2871 array = (void *)__get_free_page(GFP_KERNEL);
2872 if (array == NULL) {
 2873			printk(KERN_ERR "tracer: failed to allocate page "
2874 "for trace buffer!\n");
2875 goto free_pages;
2876 }
2877 pages_allocated++;
2878 page = virt_to_page(array);
2879 list_add(&page->lru, &pages);
2880#endif
2881 }
2882
 2883	/* Now that we successfully allocated a page per CPU, add them */
2884 for_each_tracing_cpu(i) {
2885 data = global_trace.data[i];
2886 page = list_entry(pages.next, struct page, lru);
2887 list_del_init(&page->lru);
2888 list_add_tail(&page->lru, &data->trace_pages);
2889 ClearPageLRU(page);
2890
2891#ifdef CONFIG_TRACER_MAX_TRACE
2892 data = max_tr.data[i];
2893 page = list_entry(pages.next, struct page, lru);
2894 list_del_init(&page->lru);
2895 list_add_tail(&page->lru, &data->trace_pages);
2896 SetPageLRU(page);
2897#endif
2898 }
2899 tracing_pages_allocated += pages_allocated;
2900 global_trace.entries += ENTRIES_PER_PAGE;
2901
2902 return 0;
2903
2904 free_pages:
2905 list_for_each_entry_safe(page, tmp, &pages, lru) {
2906 list_del_init(&page->lru);
2907 __free_page(page);
2908 }
2909 return -ENOMEM;
2910}
2911
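/* Shrink each per-CPU buffer (and max_tr when configured) by one page. */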
2912static int trace_free_page(void)
2913{
2914 struct trace_array_cpu *data;
2915 struct page *page;
2916 struct list_head *p;
2917 int i;
2918 int ret = 0;
2919
2920 /* free one page from each buffer */
2921 for_each_tracing_cpu(i) {
2922 data = global_trace.data[i];
2923 p = data->trace_pages.next;
2924 if (p == &data->trace_pages) {
2925 /* should never happen */
2926 WARN_ON(1);
2927 tracing_disabled = 1;
2928 ret = -1;
2929 break;
2930 }
2931 page = list_entry(p, struct page, lru);
2932 ClearPageLRU(page);
2933 list_del(&page->lru);
2934 tracing_pages_allocated--;
2935 tracing_pages_allocated--;
2936 __free_page(page);
2937
2938 tracing_reset(data);
2939
2940#ifdef CONFIG_TRACER_MAX_TRACE
2941 data = max_tr.data[i];
2942 p = data->trace_pages.next;
2943 if (p == &data->trace_pages) {
2944 /* should never happen */
2945 WARN_ON(1);
2946 tracing_disabled = 1;
2947 ret = -1;
2948 break;
2949 }
2950 page = list_entry(p, struct page, lru);
2951 ClearPageLRU(page);
2952 list_del(&page->lru);
2953 __free_page(page);
2954
2955 tracing_reset(data);
2956#endif
2957 }
2958 global_trace.entries -= ENTRIES_PER_PAGE;
2959
2960 return ret;
2961}
2962
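/* Boot-time setup: allocate the per-CPU buffers, create the debugfs files and register the default 'none' tracer. */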
2963__init static int tracer_alloc_buffers(void)
2964{
2965 struct trace_array_cpu *data;
2966 void *array;
2967 struct page *page;
2968 int pages = 0;
2969 int ret = -ENOMEM;
2970 int i;
2971
2972 global_trace.ctrl = tracer_enabled;
2973
2974 /* TODO: make the number of buffers hot pluggable with CPUS */
2975 tracing_nr_buffers = num_possible_cpus();
2976 tracing_buffer_mask = cpu_possible_map;
2977
2978 /* Allocate the first page for all buffers */
2979 for_each_tracing_cpu(i) {
2980 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
2981 max_tr.data[i] = &per_cpu(max_data, i);
2982
2983 array = (void *)__get_free_page(GFP_KERNEL);
2984 if (array == NULL) {
 2985			printk(KERN_ERR "tracer: failed to allocate page "
2986 "for trace buffer!\n");
2987 goto free_buffers;
2988 }
2989
2990 /* set the array to the list */
2991 INIT_LIST_HEAD(&data->trace_pages);
2992 page = virt_to_page(array);
2993 list_add(&page->lru, &data->trace_pages);
2994 /* use the LRU flag to differentiate the two buffers */
2995 ClearPageLRU(page);
2996
2997 data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
2998 max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
2999
3000/* Only allocate if we are actually using the max trace */
3001#ifdef CONFIG_TRACER_MAX_TRACE
3002 array = (void *)__get_free_page(GFP_KERNEL);
3003 if (array == NULL) {
 3004			printk(KERN_ERR "tracer: failed to allocate page "
3005 "for trace buffer!\n");
3006 goto free_buffers;
3007 }
3008
3009 INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
3010 page = virt_to_page(array);
3011 list_add(&page->lru, &max_tr.data[i]->trace_pages);
3012 SetPageLRU(page);
3013#endif
3014 }
3015
3016 /*
3017 * Since we allocate by orders of pages, we may be able to
3018 * round up a bit.
3019 */
3020 global_trace.entries = ENTRIES_PER_PAGE;
3021 pages++;
3022
3023 while (global_trace.entries < trace_nr_entries) {
3024 if (trace_alloc_page())
3025 break;
3026 pages++;
3027 }
3028 max_tr.entries = global_trace.entries;
3029
3030 pr_info("tracer: %d pages allocated for %ld",
3031 pages, trace_nr_entries);
3032 pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE);
3033 pr_info(" actual entries %ld\n", global_trace.entries);
3034
3035 tracer_init_debugfs();
3036
3037 trace_init_cmdlines();
3038
3039 register_tracer(&no_tracer);
3040 current_trace = &no_tracer;
3041
3042 /* All seems OK, enable tracing */
3043 tracing_disabled = 0;
3044
3045 return 0;
3046
3047 free_buffers:
3048 for (i-- ; i >= 0; i--) {
3049 struct page *page, *tmp;
3050 struct trace_array_cpu *data = global_trace.data[i];
3051
3052 if (data) {
3053 list_for_each_entry_safe(page, tmp,
3054 &data->trace_pages, lru) {
3055 list_del_init(&page->lru);
3056 __free_page(page);
3057 }
3058 }
3059
3060#ifdef CONFIG_TRACER_MAX_TRACE
3061 data = max_tr.data[i];
3062 if (data) {
3063 list_for_each_entry_safe(page, tmp,
3064 &data->trace_pages, lru) {
3065 list_del_init(&page->lru);
3066 __free_page(page);
3067 }
3068 }
3069#endif
3070 }
3071 return ret;
3072}
3073fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 000000000000..0ef9ef74c806
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,327 @@
1#ifndef _LINUX_KERNEL_TRACE_H
2#define _LINUX_KERNEL_TRACE_H
3
4#include <linux/fs.h>
5#include <asm/atomic.h>
6#include <linux/sched.h>
7#include <linux/clocksource.h>
8#include <linux/mmiotrace.h>
9
10enum trace_type {
11 __TRACE_FIRST_TYPE = 0,
12
13 TRACE_FN,
14 TRACE_CTX,
15 TRACE_WAKE,
16 TRACE_STACK,
17 TRACE_SPECIAL,
18 TRACE_MMIO_RW,
19 TRACE_MMIO_MAP,
20
21 __TRACE_LAST_TYPE
22};
23
24/*
 25 * Function trace entry - function address and parent function address:
26 */
27struct ftrace_entry {
28 unsigned long ip;
29 unsigned long parent_ip;
30};
31
32/*
33 * Context switch trace entry - which task (and prio) we switched from/to:
34 */
35struct ctx_switch_entry {
36 unsigned int prev_pid;
37 unsigned char prev_prio;
38 unsigned char prev_state;
39 unsigned int next_pid;
40 unsigned char next_prio;
41 unsigned char next_state;
42};
43
44/*
45 * Special (free-form) trace entry:
46 */
47struct special_entry {
48 unsigned long arg1;
49 unsigned long arg2;
50 unsigned long arg3;
51};
52
53/*
54 * Stack-trace entry:
55 */
56
57#define FTRACE_STACK_ENTRIES 8
58
59struct stack_entry {
60 unsigned long caller[FTRACE_STACK_ENTRIES];
61};
62
63/*
64 * The trace entry - the most basic unit of tracing. This is what
65 * is printed in the end as a single line in the trace output, such as:
66 *
67 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
68 */
69struct trace_entry {
70 char type;
71 char cpu;
72 char flags;
73 char preempt_count;
74 int pid;
75 cycle_t t;
76 union {
77 struct ftrace_entry fn;
78 struct ctx_switch_entry ctx;
79 struct special_entry special;
80 struct stack_entry stack;
81 struct mmiotrace_rw mmiorw;
82 struct mmiotrace_map mmiomap;
83 };
84};
85
86#define TRACE_ENTRY_SIZE sizeof(struct trace_entry)
87
88/*
89 * The CPU trace array - it consists of thousands of trace entries
 90 * plus some other descriptor data (for example, which task started
 91 * the trace, etc.):
92 */
93struct trace_array_cpu {
94 struct list_head trace_pages;
95 atomic_t disabled;
96 raw_spinlock_t lock;
97 struct lock_class_key lock_key;
98
99 /* these fields get copied into max-trace: */
100 unsigned trace_head_idx;
101 unsigned trace_tail_idx;
102 void *trace_head; /* producer */
103 void *trace_tail; /* consumer */
104 unsigned long trace_idx;
105 unsigned long overrun;
106 unsigned long saved_latency;
107 unsigned long critical_start;
108 unsigned long critical_end;
109 unsigned long critical_sequence;
110 unsigned long nice;
111 unsigned long policy;
112 unsigned long rt_priority;
113 cycle_t preempt_timestamp;
114 pid_t pid;
115 uid_t uid;
116 char comm[TASK_COMM_LEN];
117};
118
119struct trace_iterator;
120
121/*
122 * The trace array - an array of per-CPU trace arrays. This is the
123 * highest level data structure that individual tracers deal with.
124 * They have on/off state as well:
125 */
126struct trace_array {
127 unsigned long entries;
128 long ctrl;
129 int cpu;
130 cycle_t time_start;
131 struct task_struct *waiter;
132 struct trace_array_cpu *data[NR_CPUS];
133};
134
135/*
136 * A specific tracer, represented by methods that operate on a trace array:
137 */
138struct tracer {
139 const char *name;
140 void (*init)(struct trace_array *tr);
141 void (*reset)(struct trace_array *tr);
142 void (*open)(struct trace_iterator *iter);
143 void (*pipe_open)(struct trace_iterator *iter);
144 void (*close)(struct trace_iterator *iter);
145 void (*start)(struct trace_iterator *iter);
146 void (*stop)(struct trace_iterator *iter);
147 ssize_t (*read)(struct trace_iterator *iter,
148 struct file *filp, char __user *ubuf,
149 size_t cnt, loff_t *ppos);
150 void (*ctrl_update)(struct trace_array *tr);
151#ifdef CONFIG_FTRACE_STARTUP_TEST
152 int (*selftest)(struct tracer *trace,
153 struct trace_array *tr);
154#endif
155 int (*print_line)(struct trace_iterator *iter);
156 struct tracer *next;
157 int print_max;
158};
159
160struct trace_seq {
161 unsigned char buffer[PAGE_SIZE];
162 unsigned int len;
163 unsigned int readpos;
164};
165
166/*
 167 * Trace iterator - used by the printout routines that present trace
 168 * results to users; these routines might sleep, etc.:
169 */
170struct trace_iterator {
171 struct trace_array *tr;
172 struct tracer *trace;
173 void *private;
174 long last_overrun[NR_CPUS];
175 long overrun[NR_CPUS];
176
177 /* The below is zeroed out in pipe_read */
178 struct trace_seq seq;
179 struct trace_entry *ent;
180 int cpu;
181
182 struct trace_entry *prev_ent;
183 int prev_cpu;
184
185 unsigned long iter_flags;
186 loff_t pos;
187 unsigned long next_idx[NR_CPUS];
188 struct list_head *next_page[NR_CPUS];
189 unsigned next_page_idx[NR_CPUS];
190 long idx;
191};
192
193void tracing_reset(struct trace_array_cpu *data);
194int tracing_open_generic(struct inode *inode, struct file *filp);
195struct dentry *tracing_init_dentry(void);
196void ftrace(struct trace_array *tr,
197 struct trace_array_cpu *data,
198 unsigned long ip,
199 unsigned long parent_ip,
200 unsigned long flags);
201void tracing_sched_switch_trace(struct trace_array *tr,
202 struct trace_array_cpu *data,
203 struct task_struct *prev,
204 struct task_struct *next,
205 unsigned long flags);
206void tracing_record_cmdline(struct task_struct *tsk);
207
208void tracing_sched_wakeup_trace(struct trace_array *tr,
209 struct trace_array_cpu *data,
210 struct task_struct *wakee,
211 struct task_struct *cur,
212 unsigned long flags);
213void trace_special(struct trace_array *tr,
214 struct trace_array_cpu *data,
215 unsigned long arg1,
216 unsigned long arg2,
217 unsigned long arg3);
218void trace_function(struct trace_array *tr,
219 struct trace_array_cpu *data,
220 unsigned long ip,
221 unsigned long parent_ip,
222 unsigned long flags);
223
224void tracing_start_function_trace(void);
225void tracing_stop_function_trace(void);
226int register_tracer(struct tracer *type);
227void unregister_tracer(struct tracer *type);
228
229extern unsigned long nsecs_to_usecs(unsigned long nsecs);
230
231extern unsigned long tracing_max_latency;
232extern unsigned long tracing_thresh;
233
234extern atomic_t trace_record_cmdline_enabled;
235
236void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
237void update_max_tr_single(struct trace_array *tr,
238 struct task_struct *tsk, int cpu);
239
240extern cycle_t ftrace_now(int cpu);
241
242#ifdef CONFIG_CONTEXT_SWITCH_TRACER
243typedef void
244(*tracer_switch_func_t)(void *private,
245 void *__rq,
246 struct task_struct *prev,
247 struct task_struct *next);
248
249struct tracer_switch_ops {
250 tracer_switch_func_t func;
251 void *private;
252 struct tracer_switch_ops *next;
253};
254
255#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
256
257#ifdef CONFIG_DYNAMIC_FTRACE
258extern unsigned long ftrace_update_tot_cnt;
259#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
260extern int DYN_FTRACE_TEST_NAME(void);
261#endif
262
263#ifdef CONFIG_MMIOTRACE
264extern void __trace_mmiotrace_rw(struct trace_array *tr,
265 struct trace_array_cpu *data,
266 struct mmiotrace_rw *rw);
267extern void __trace_mmiotrace_map(struct trace_array *tr,
268 struct trace_array_cpu *data,
269 struct mmiotrace_map *map);
270#endif
271
272#ifdef CONFIG_FTRACE_STARTUP_TEST
273#ifdef CONFIG_FTRACE
274extern int trace_selftest_startup_function(struct tracer *trace,
275 struct trace_array *tr);
276#endif
277#ifdef CONFIG_IRQSOFF_TRACER
278extern int trace_selftest_startup_irqsoff(struct tracer *trace,
279 struct trace_array *tr);
280#endif
281#ifdef CONFIG_PREEMPT_TRACER
282extern int trace_selftest_startup_preemptoff(struct tracer *trace,
283 struct trace_array *tr);
284#endif
285#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
286extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
287 struct trace_array *tr);
288#endif
289#ifdef CONFIG_SCHED_TRACER
290extern int trace_selftest_startup_wakeup(struct tracer *trace,
291 struct trace_array *tr);
292#endif
293#ifdef CONFIG_CONTEXT_SWITCH_TRACER
294extern int trace_selftest_startup_sched_switch(struct tracer *trace,
295 struct trace_array *tr);
296#endif
297#endif /* CONFIG_FTRACE_STARTUP_TEST */
298
299extern void *head_page(struct trace_array_cpu *data);
300extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
301extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
302 size_t cnt);
303extern long ns2usecs(cycle_t nsec);
304
305extern unsigned long trace_flags;
306
307/*
308 * trace_iterator_flags is an enumeration that defines bit
309 * positions into trace_flags that control the output.
310 *
311 * NOTE: These bits must match the trace_options array in
312 * trace.c.
313 */
314enum trace_iterator_flags {
315 TRACE_ITER_PRINT_PARENT = 0x01,
316 TRACE_ITER_SYM_OFFSET = 0x02,
317 TRACE_ITER_SYM_ADDR = 0x04,
318 TRACE_ITER_VERBOSE = 0x08,
319 TRACE_ITER_RAW = 0x10,
320 TRACE_ITER_HEX = 0x20,
321 TRACE_ITER_BIN = 0x40,
322 TRACE_ITER_BLOCK = 0x80,
323 TRACE_ITER_STACKTRACE = 0x100,
324 TRACE_ITER_SCHED_TREE = 0x200,
325};
326
327#endif /* _LINUX_KERNEL_TRACE_H */
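trace_flags is a plain bitmask indexed by the enum above, so output options are checked with simple bit tests against it. A minimal sketch of such a check (the function below is hypothetical, not part of this header):

#include "trace.h"

/* sketch: pick an address-printing style based on the option bits */
static int example_print_addr(struct trace_seq *s, unsigned long addr)
{
	if (trace_flags & TRACE_ITER_SYM_ADDR)
		return trace_seq_printf(s, " <%08lx>", addr);
	if (trace_flags & TRACE_ITER_VERBOSE)
		return trace_seq_printf(s, " addr=%08lx", addr);
	return trace_seq_printf(s, " %08lx", addr);
}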
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 000000000000..0a084656d7cf
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,78 @@
1/*
2 * ring buffer based function tracer
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Based on code from the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/debugfs.h>
13#include <linux/uaccess.h>
14#include <linux/ftrace.h>
15#include <linux/fs.h>
16
17#include "trace.h"
18
19static void function_reset(struct trace_array *tr)
20{
21 int cpu;
22
23 tr->time_start = ftrace_now(tr->cpu);
24
25 for_each_online_cpu(cpu)
26 tracing_reset(tr->data[cpu]);
27}
28
29static void start_function_trace(struct trace_array *tr)
30{
31 function_reset(tr);
32 atomic_inc(&trace_record_cmdline_enabled);
33 tracing_start_function_trace();
34}
35
36static void stop_function_trace(struct trace_array *tr)
37{
38 tracing_stop_function_trace();
39 atomic_dec(&trace_record_cmdline_enabled);
40}
41
42static void function_trace_init(struct trace_array *tr)
43{
44 if (tr->ctrl)
45 start_function_trace(tr);
46}
47
48static void function_trace_reset(struct trace_array *tr)
49{
50 if (tr->ctrl)
51 stop_function_trace(tr);
52}
53
54static void function_trace_ctrl_update(struct trace_array *tr)
55{
56 if (tr->ctrl)
57 start_function_trace(tr);
58 else
59 stop_function_trace(tr);
60}
61
62static struct tracer function_trace __read_mostly =
63{
64 .name = "ftrace",
65 .init = function_trace_init,
66 .reset = function_trace_reset,
67 .ctrl_update = function_trace_ctrl_update,
68#ifdef CONFIG_FTRACE_SELFTEST
69 .selftest = trace_selftest_startup_function,
70#endif
71};
72
73static __init int init_function_trace(void)
74{
75 return register_tracer(&function_trace);
76}
77
78device_initcall(init_function_trace);
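The function tracer above is the smallest instance of the plugin interface used throughout this series: a static struct tracer registered from an initcall, with init/reset/ctrl_update driven by tr->ctrl. A stripped-down sketch of that pattern (the "noop" tracer below is hypothetical, shown only to make the shape of the API explicit):

#include <linux/init.h>

#include "trace.h"

static void noop_trace_start(struct trace_array *tr)
{
	/* a real tracer would reset its buffers and hook itself up here */
}

static void noop_trace_stop(struct trace_array *tr)
{
	/* ... and unhook itself here */
}

static void noop_trace_init(struct trace_array *tr)
{
	if (tr->ctrl)			/* tracing already switched on */
		noop_trace_start(tr);
}

static void noop_trace_reset(struct trace_array *tr)
{
	if (tr->ctrl)
		noop_trace_stop(tr);
}

static void noop_trace_ctrl_update(struct trace_array *tr)
{
	/* called when tracing is switched on or off at run time */
	if (tr->ctrl)
		noop_trace_start(tr);
	else
		noop_trace_stop(tr);
}

static struct tracer noop_tracer __read_mostly = {
	.name		= "noop",	/* hypothetical name */
	.init		= noop_trace_init,
	.reset		= noop_trace_reset,
	.ctrl_update	= noop_trace_ctrl_update,
};

static __init int init_noop_tracer(void)
{
	return register_tracer(&noop_tracer);
}
device_initcall(init_noop_tracer);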
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000000..761f3ec66c50
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,502 @@
1/*
2 * trace irqs off critical timings
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * From code in the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/kallsyms.h>
13#include <linux/debugfs.h>
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/ftrace.h>
17#include <linux/fs.h>
18
19#include "trace.h"
20
21static struct trace_array *irqsoff_trace __read_mostly;
22static int tracer_enabled __read_mostly;
23
24static DEFINE_PER_CPU(int, tracing_cpu);
25
26static DEFINE_SPINLOCK(max_trace_lock);
27
28enum {
29 TRACER_IRQS_OFF = (1 << 1),
30 TRACER_PREEMPT_OFF = (1 << 2),
31};
32
33static int trace_type __read_mostly;
34
35#ifdef CONFIG_PREEMPT_TRACER
36static inline int
37preempt_trace(void)
38{
39 return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
40}
41#else
42# define preempt_trace() (0)
43#endif
44
45#ifdef CONFIG_IRQSOFF_TRACER
46static inline int
47irq_trace(void)
48{
49 return ((trace_type & TRACER_IRQS_OFF) &&
50 irqs_disabled());
51}
52#else
53# define irq_trace() (0)
54#endif
55
56/*
57 * Sequence count - we record it when starting a measurement and
58 * skip the latency if the sequence has changed - some other section
59 * did a maximum and could disturb our measurement with serial console
60 * printouts, etc. Truly coinciding maximum latencies should be rare
61 * and what happens together happens separately as well, so this doesn't
62 * decrease the validity of the maximum found:
63 */
64static __cacheline_aligned_in_smp unsigned long max_sequence;
65
66#ifdef CONFIG_FTRACE
67/*
68 * irqsoff uses its own tracer function to keep the overhead down:
69 */
70static void
71irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
72{
73 struct trace_array *tr = irqsoff_trace;
74 struct trace_array_cpu *data;
75 unsigned long flags;
76 long disabled;
77 int cpu;
78
79 /*
80 * Does not matter if we preempt. We test the flags
81 * afterward, to see if irqs are disabled or not.
82 * If we preempt and get a false positive, the flags
83 * test will fail.
84 */
85 cpu = raw_smp_processor_id();
86 if (likely(!per_cpu(tracing_cpu, cpu)))
87 return;
88
89 local_save_flags(flags);
90 /* slight chance to get a false positive on tracing_cpu */
91 if (!irqs_disabled_flags(flags))
92 return;
93
94 data = tr->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96
97 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags);
99
100 atomic_dec(&data->disabled);
101}
102
103static struct ftrace_ops trace_ops __read_mostly =
104{
105 .func = irqsoff_tracer_call,
106};
107#endif /* CONFIG_FTRACE */
108
109/*
110 * Should this new latency be reported/recorded?
111 */
112static int report_latency(cycle_t delta)
113{
114 if (tracing_thresh) {
115 if (delta < tracing_thresh)
116 return 0;
117 } else {
118 if (delta <= tracing_max_latency)
119 return 0;
120 }
121 return 1;
122}
123
124static void
125check_critical_timing(struct trace_array *tr,
126 struct trace_array_cpu *data,
127 unsigned long parent_ip,
128 int cpu)
129{
130 unsigned long latency, t0, t1;
131 cycle_t T0, T1, delta;
132 unsigned long flags;
133
134 /*
135 * usecs conversion is slow so we try to delay the conversion
136 * as long as possible:
137 */
138 T0 = data->preempt_timestamp;
139 T1 = ftrace_now(cpu);
140 delta = T1-T0;
141
142 local_save_flags(flags);
143
144 if (!report_latency(delta))
145 goto out;
146
147 spin_lock_irqsave(&max_trace_lock, flags);
148
149 /* check if we are still the max latency */
150 if (!report_latency(delta))
151 goto out_unlock;
152
153 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
154
155 latency = nsecs_to_usecs(delta);
156
157 if (data->critical_sequence != max_sequence)
158 goto out_unlock;
159
160 tracing_max_latency = delta;
161 t0 = nsecs_to_usecs(T0);
162 t1 = nsecs_to_usecs(T1);
163
164 data->critical_end = parent_ip;
165
166 update_max_tr_single(tr, current, cpu);
167
168 if (!runqueue_is_locked()) {
169 if (tracing_thresh) {
170 printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical"
171 " section violates %lu us threshold.\n",
172 current->comm, current->pid,
173 raw_smp_processor_id(),
174 latency, nsecs_to_usecs(tracing_thresh));
175 } else {
176 printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us"
177 " maximum-latency critical section.\n",
178 current->comm, current->pid,
179 raw_smp_processor_id(),
180 latency);
181 }
182 }
183
184 max_sequence++;
185
186out_unlock:
187 spin_unlock_irqrestore(&max_trace_lock, flags);
188
189out:
190 data->critical_sequence = max_sequence;
191 data->preempt_timestamp = ftrace_now(cpu);
192 tracing_reset(data);
193 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
194}
195
196static inline void
197start_critical_timing(unsigned long ip, unsigned long parent_ip)
198{
199 int cpu;
200 struct trace_array *tr = irqsoff_trace;
201 struct trace_array_cpu *data;
202 unsigned long flags;
203
204 if (likely(!tracer_enabled))
205 return;
206
207 cpu = raw_smp_processor_id();
208
209 if (per_cpu(tracing_cpu, cpu))
210 return;
211
212 data = tr->data[cpu];
213
214 if (unlikely(!data) || atomic_read(&data->disabled))
215 return;
216
217 atomic_inc(&data->disabled);
218
219 data->critical_sequence = max_sequence;
220 data->preempt_timestamp = ftrace_now(cpu);
221 data->critical_start = parent_ip ? : ip;
222 tracing_reset(data);
223
224 local_save_flags(flags);
225
226 trace_function(tr, data, ip, parent_ip, flags);
227
228 per_cpu(tracing_cpu, cpu) = 1;
229
230 atomic_dec(&data->disabled);
231}
232
233static inline void
234stop_critical_timing(unsigned long ip, unsigned long parent_ip)
235{
236 int cpu;
237 struct trace_array *tr = irqsoff_trace;
238 struct trace_array_cpu *data;
239 unsigned long flags;
240
241 cpu = raw_smp_processor_id();
242 /* Always clear the tracing cpu on stopping the trace */
243 if (unlikely(per_cpu(tracing_cpu, cpu)))
244 per_cpu(tracing_cpu, cpu) = 0;
245 else
246 return;
247
248 if (!tracer_enabled)
249 return;
250
251 data = tr->data[cpu];
252
253 if (unlikely(!data) || unlikely(!head_page(data)) ||
254 !data->critical_start || atomic_read(&data->disabled))
255 return;
256
257 atomic_inc(&data->disabled);
258
259 local_save_flags(flags);
260 trace_function(tr, data, ip, parent_ip, flags);
261 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
262 data->critical_start = 0;
263 atomic_dec(&data->disabled);
264}
265
266/* start and stop critical timings, used to suspend measurement (in idle) */
267void start_critical_timings(void)
268{
269 if (preempt_trace() || irq_trace())
270 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
271}
272
273void stop_critical_timings(void)
274{
275 if (preempt_trace() || irq_trace())
276 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
277}
278
279#ifdef CONFIG_IRQSOFF_TRACER
280#ifdef CONFIG_PROVE_LOCKING
281void time_hardirqs_on(unsigned long a0, unsigned long a1)
282{
283 if (!preempt_trace() && irq_trace())
284 stop_critical_timing(a0, a1);
285}
286
287void time_hardirqs_off(unsigned long a0, unsigned long a1)
288{
289 if (!preempt_trace() && irq_trace())
290 start_critical_timing(a0, a1);
291}
292
293#else /* !CONFIG_PROVE_LOCKING */
294
295/*
296 * Stubs:
297 */
298
299void early_boot_irqs_off(void)
300{
301}
302
303void early_boot_irqs_on(void)
304{
305}
306
307void trace_softirqs_on(unsigned long ip)
308{
309}
310
311void trace_softirqs_off(unsigned long ip)
312{
313}
314
315inline void print_irqtrace_events(struct task_struct *curr)
316{
317}
318
319/*
320 * We are only interested in hardirq on/off events:
321 */
322void trace_hardirqs_on(void)
323{
324 if (!preempt_trace() && irq_trace())
325 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
326}
327EXPORT_SYMBOL(trace_hardirqs_on);
328
329void trace_hardirqs_off(void)
330{
331 if (!preempt_trace() && irq_trace())
332 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
333}
334EXPORT_SYMBOL(trace_hardirqs_off);
335
336void trace_hardirqs_on_caller(unsigned long caller_addr)
337{
338 if (!preempt_trace() && irq_trace())
339 stop_critical_timing(CALLER_ADDR0, caller_addr);
340}
341EXPORT_SYMBOL(trace_hardirqs_on_caller);
342
343void trace_hardirqs_off_caller(unsigned long caller_addr)
344{
345 if (!preempt_trace() && irq_trace())
346 start_critical_timing(CALLER_ADDR0, caller_addr);
347}
348EXPORT_SYMBOL(trace_hardirqs_off_caller);
349
350#endif /* CONFIG_PROVE_LOCKING */
351#endif /* CONFIG_IRQSOFF_TRACER */
352
353#ifdef CONFIG_PREEMPT_TRACER
354void trace_preempt_on(unsigned long a0, unsigned long a1)
355{
356 stop_critical_timing(a0, a1);
357}
358
359void trace_preempt_off(unsigned long a0, unsigned long a1)
360{
361 start_critical_timing(a0, a1);
362}
363#endif /* CONFIG_PREEMPT_TRACER */
364
365static void start_irqsoff_tracer(struct trace_array *tr)
366{
367 register_ftrace_function(&trace_ops);
368 tracer_enabled = 1;
369}
370
371static void stop_irqsoff_tracer(struct trace_array *tr)
372{
373 tracer_enabled = 0;
374 unregister_ftrace_function(&trace_ops);
375}
376
377static void __irqsoff_tracer_init(struct trace_array *tr)
378{
379 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */
381 smp_wmb();
382
383 if (tr->ctrl)
384 start_irqsoff_tracer(tr);
385}
386
387static void irqsoff_tracer_reset(struct trace_array *tr)
388{
389 if (tr->ctrl)
390 stop_irqsoff_tracer(tr);
391}
392
393static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
394{
395 if (tr->ctrl)
396 start_irqsoff_tracer(tr);
397 else
398 stop_irqsoff_tracer(tr);
399}
400
401static void irqsoff_tracer_open(struct trace_iterator *iter)
402{
403 /* stop the trace while dumping */
404 if (iter->tr->ctrl)
405 stop_irqsoff_tracer(iter->tr);
406}
407
408static void irqsoff_tracer_close(struct trace_iterator *iter)
409{
410 if (iter->tr->ctrl)
411 start_irqsoff_tracer(iter->tr);
412}
413
414#ifdef CONFIG_IRQSOFF_TRACER
415static void irqsoff_tracer_init(struct trace_array *tr)
416{
417 trace_type = TRACER_IRQS_OFF;
418
419 __irqsoff_tracer_init(tr);
420}
421static struct tracer irqsoff_tracer __read_mostly =
422{
423 .name = "irqsoff",
424 .init = irqsoff_tracer_init,
425 .reset = irqsoff_tracer_reset,
426 .open = irqsoff_tracer_open,
427 .close = irqsoff_tracer_close,
428 .ctrl_update = irqsoff_tracer_ctrl_update,
429 .print_max = 1,
430#ifdef CONFIG_FTRACE_SELFTEST
431 .selftest = trace_selftest_startup_irqsoff,
432#endif
433};
434# define register_irqsoff(trace) register_tracer(&trace)
435#else
436# define register_irqsoff(trace) do { } while (0)
437#endif
438
439#ifdef CONFIG_PREEMPT_TRACER
440static void preemptoff_tracer_init(struct trace_array *tr)
441{
442 trace_type = TRACER_PREEMPT_OFF;
443
444 __irqsoff_tracer_init(tr);
445}
446
447static struct tracer preemptoff_tracer __read_mostly =
448{
449 .name = "preemptoff",
450 .init = preemptoff_tracer_init,
451 .reset = irqsoff_tracer_reset,
452 .open = irqsoff_tracer_open,
453 .close = irqsoff_tracer_close,
454 .ctrl_update = irqsoff_tracer_ctrl_update,
455 .print_max = 1,
456#ifdef CONFIG_FTRACE_SELFTEST
457 .selftest = trace_selftest_startup_preemptoff,
458#endif
459};
460# define register_preemptoff(trace) register_tracer(&trace)
461#else
462# define register_preemptoff(trace) do { } while (0)
463#endif
464
465#if defined(CONFIG_IRQSOFF_TRACER) && \
466 defined(CONFIG_PREEMPT_TRACER)
467
468static void preemptirqsoff_tracer_init(struct trace_array *tr)
469{
470 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
471
472 __irqsoff_tracer_init(tr);
473}
474
475static struct tracer preemptirqsoff_tracer __read_mostly =
476{
477 .name = "preemptirqsoff",
478 .init = preemptirqsoff_tracer_init,
479 .reset = irqsoff_tracer_reset,
480 .open = irqsoff_tracer_open,
481 .close = irqsoff_tracer_close,
482 .ctrl_update = irqsoff_tracer_ctrl_update,
483 .print_max = 1,
484#ifdef CONFIG_FTRACE_SELFTEST
485 .selftest = trace_selftest_startup_preemptirqsoff,
486#endif
487};
488
489# define register_preemptirqsoff(trace) register_tracer(&trace)
490#else
491# define register_preemptirqsoff(trace) do { } while (0)
492#endif
493
494__init static int init_irqsoff_tracer(void)
495{
496 register_irqsoff(irqsoff_tracer);
497 register_preemptoff(preemptoff_tracer);
498 register_preemptirqsoff(preemptirqsoff_tracer);
499
500 return 0;
501}
502device_initcall(init_irqsoff_tracer);
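To see how the pieces above connect: every irqs-off region ends up bracketed by trace_hardirqs_off()/trace_hardirqs_on() through the irq-flags tracing hooks, which this file maps to start_critical_timing() and stop_critical_timing(). A sketch of one instrumented region, with the tracer's side shown as comments (example_work() is a hypothetical callee):

#include <linux/irqflags.h>

extern void example_work(void);		/* hypothetical */

static void example_critical_section(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* -> trace_hardirqs_off()
					 *    -> start_critical_timing():
					 *       stamps preempt_timestamp and
					 *       sets per-cpu tracing_cpu */

	example_work();			/* function calls in here are logged
					 *    by irqsoff_tracer_call() */

	local_irq_restore(flags);	/* -> trace_hardirqs_on()
					 *    -> stop_critical_timing():
					 *       computes the delta and records
					 *       it if report_latency() says so */
}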
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000000..b13dc19dcbb4
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,295 @@
1/*
2 * Memory mapped I/O tracing
3 *
4 * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
5 */
6
7#define DEBUG 1
8
9#include <linux/kernel.h>
10#include <linux/mmiotrace.h>
11#include <linux/pci.h>
12
13#include "trace.h"
14
15struct header_iter {
16 struct pci_dev *dev;
17};
18
19static struct trace_array *mmio_trace_array;
20static bool overrun_detected;
21
22static void mmio_reset_data(struct trace_array *tr)
23{
24 int cpu;
25
26 overrun_detected = false;
27 tr->time_start = ftrace_now(tr->cpu);
28
29 for_each_online_cpu(cpu)
30 tracing_reset(tr->data[cpu]);
31}
32
33static void mmio_trace_init(struct trace_array *tr)
34{
35 pr_debug("in %s\n", __func__);
36 mmio_trace_array = tr;
37 if (tr->ctrl) {
38 mmio_reset_data(tr);
39 enable_mmiotrace();
40 }
41}
42
43static void mmio_trace_reset(struct trace_array *tr)
44{
45 pr_debug("in %s\n", __func__);
46 if (tr->ctrl)
47 disable_mmiotrace();
48 mmio_reset_data(tr);
49 mmio_trace_array = NULL;
50}
51
52static void mmio_trace_ctrl_update(struct trace_array *tr)
53{
54 pr_debug("in %s\n", __func__);
55 if (tr->ctrl) {
56 mmio_reset_data(tr);
57 enable_mmiotrace();
58 } else {
59 disable_mmiotrace();
60 }
61}
62
63static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
64{
65 int ret = 0;
66 int i;
67 resource_size_t start, end;
68 const struct pci_driver *drv = pci_dev_driver(dev);
69
70 /* XXX: incomplete checks for trace_seq_printf() return value */
71 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
72 dev->bus->number, dev->devfn,
73 dev->vendor, dev->device, dev->irq);
74 /*
75 * XXX: is pci_resource_to_user() appropriate, since we are
76 * supposed to interpret the __ioremap() phys_addr argument based on
77 * these printed values?
78 */
79 for (i = 0; i < 7; i++) {
80 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
81 ret += trace_seq_printf(s, " %llx",
82 (unsigned long long)(start |
83 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
84 }
85 for (i = 0; i < 7; i++) {
86 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
87 ret += trace_seq_printf(s, " %llx",
88 dev->resource[i].start < dev->resource[i].end ?
89 (unsigned long long)(end - start) + 1 : 0);
90 }
91 if (drv)
92 ret += trace_seq_printf(s, " %s\n", drv->name);
93 else
94 ret += trace_seq_printf(s, " \n");
95 return ret;
96}
97
98static void destroy_header_iter(struct header_iter *hiter)
99{
100 if (!hiter)
101 return;
102 pci_dev_put(hiter->dev);
103 kfree(hiter);
104}
105
106static void mmio_pipe_open(struct trace_iterator *iter)
107{
108 struct header_iter *hiter;
109 struct trace_seq *s = &iter->seq;
110
111 trace_seq_printf(s, "VERSION 20070824\n");
112
113 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
114 if (!hiter)
115 return;
116
117 hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
118 iter->private = hiter;
119}
120
121/* XXX: This is not called when the pipe is closed! */
122static void mmio_close(struct trace_iterator *iter)
123{
124 struct header_iter *hiter = iter->private;
125 destroy_header_iter(hiter);
126 iter->private = NULL;
127}
128
129static unsigned long count_overruns(struct trace_iterator *iter)
130{
131 int cpu;
132 unsigned long cnt = 0;
133 for_each_online_cpu(cpu) {
134 cnt += iter->overrun[cpu];
135 iter->overrun[cpu] = 0;
136 }
137 return cnt;
138}
139
140static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
141 char __user *ubuf, size_t cnt, loff_t *ppos)
142{
143 ssize_t ret;
144 struct header_iter *hiter = iter->private;
145 struct trace_seq *s = &iter->seq;
146 unsigned long n;
147
148 n = count_overruns(iter);
149 if (n) {
150 /* XXX: This is later than where events were lost. */
151 trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
152 if (!overrun_detected)
153 pr_warning("mmiotrace has lost events.\n");
154 overrun_detected = true;
155 goto print_out;
156 }
157
158 if (!hiter)
159 return 0;
160
161 mmio_print_pcidev(s, hiter->dev);
162 hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
163
164 if (!hiter->dev) {
165 destroy_header_iter(hiter);
166 iter->private = NULL;
167 }
168
169print_out:
170 ret = trace_seq_to_user(s, ubuf, cnt);
171 return (ret == -EBUSY) ? 0 : ret;
172}
173
174static int mmio_print_rw(struct trace_iterator *iter)
175{
176 struct trace_entry *entry = iter->ent;
177 struct mmiotrace_rw *rw = &entry->mmiorw;
178 struct trace_seq *s = &iter->seq;
179 unsigned long long t = ns2usecs(entry->t);
180 unsigned long usec_rem = do_div(t, 1000000ULL);
181 unsigned secs = (unsigned long)t;
182 int ret = 1;
183
184 switch (entry->mmiorw.opcode) {
185 case MMIO_READ:
186 ret = trace_seq_printf(s,
187 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
188 rw->width, secs, usec_rem, rw->map_id,
189 (unsigned long long)rw->phys,
190 rw->value, rw->pc, 0);
191 break;
192 case MMIO_WRITE:
193 ret = trace_seq_printf(s,
194 "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
195 rw->width, secs, usec_rem, rw->map_id,
196 (unsigned long long)rw->phys,
197 rw->value, rw->pc, 0);
198 break;
199 case MMIO_UNKNOWN_OP:
200 ret = trace_seq_printf(s,
201 "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
202 secs, usec_rem, rw->map_id,
203 (unsigned long long)rw->phys,
204 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
205 (rw->value >> 0) & 0xff, rw->pc, 0);
206 break;
207 default:
208 ret = trace_seq_printf(s, "rw what?\n");
209 break;
210 }
211 if (ret)
212 return 1;
213 return 0;
214}
215
216static int mmio_print_map(struct trace_iterator *iter)
217{
218 struct trace_entry *entry = iter->ent;
219 struct mmiotrace_map *m = &entry->mmiomap;
220 struct trace_seq *s = &iter->seq;
221 unsigned long long t = ns2usecs(entry->t);
222 unsigned long usec_rem = do_div(t, 1000000ULL);
223 unsigned secs = (unsigned long)t;
224 int ret = 1;
225
226 switch (entry->mmiorw.opcode) {
227 case MMIO_PROBE:
228 ret = trace_seq_printf(s,
229 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
230 secs, usec_rem, m->map_id,
231 (unsigned long long)m->phys, m->virt, m->len,
232 0UL, 0);
233 break;
234 case MMIO_UNPROBE:
235 ret = trace_seq_printf(s,
236 "UNMAP %lu.%06lu %d 0x%lx %d\n",
237 secs, usec_rem, m->map_id, 0UL, 0);
238 break;
239 default:
240 ret = trace_seq_printf(s, "map what?\n");
241 break;
242 }
243 if (ret)
244 return 1;
245 return 0;
246}
247
248/* return 0 to abort printing without consuming current entry in pipe mode */
249static int mmio_print_line(struct trace_iterator *iter)
250{
251 switch (iter->ent->type) {
252 case TRACE_MMIO_RW:
253 return mmio_print_rw(iter);
254 case TRACE_MMIO_MAP:
255 return mmio_print_map(iter);
256 default:
257 return 1; /* ignore unknown entries */
258 }
259}
260
261static struct tracer mmio_tracer __read_mostly =
262{
263 .name = "mmiotrace",
264 .init = mmio_trace_init,
265 .reset = mmio_trace_reset,
266 .pipe_open = mmio_pipe_open,
267 .close = mmio_close,
268 .read = mmio_read,
269 .ctrl_update = mmio_trace_ctrl_update,
270 .print_line = mmio_print_line,
271};
272
273__init static int init_mmio_trace(void)
274{
275 return register_tracer(&mmio_tracer);
276}
277device_initcall(init_mmio_trace);
278
279void mmio_trace_rw(struct mmiotrace_rw *rw)
280{
281 struct trace_array *tr = mmio_trace_array;
282 struct trace_array_cpu *data = tr->data[smp_processor_id()];
283 __trace_mmiotrace_rw(tr, data, rw);
284}
285
286void mmio_trace_mapping(struct mmiotrace_map *map)
287{
288 struct trace_array *tr = mmio_trace_array;
289 struct trace_array_cpu *data;
290
291 preempt_disable();
292 data = tr->data[smp_processor_id()];
293 __trace_mmiotrace_map(tr, data, map);
294 preempt_enable();
295}
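Taken together, the format strings above define the line-oriented text protocol that user space reads from the per-CPU trace pipe: a VERSION header, optional PCIDEV lines, then MAP/UNMAP and R/W/UNKNOWN records, with MARK lines flagging lost events. An illustrative excerpt with made-up values (PCIDEV header lines omitted for brevity) could look like:

VERSION 20070824
MAP 1.234567 1 0xfd000000 0xffffc20000080000 0x80000 0x0 0
W 4 1.234789 1 0xfd000100 0x12345678 0xffffffffa0012345 0
R 4 1.234790 1 0xfd000100 0x12345678 0xffffffffa0012360 0
UNMAP 1.240000 1 0x0 0
MARK 0.000000 Lost 42 events.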
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 000000000000..d25ffa5eaf2b
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,301 @@
1/*
2 * trace context switch
3 *
4 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
5 *
6 */
7#include <linux/module.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/kallsyms.h>
11#include <linux/uaccess.h>
12#include <linux/marker.h>
13#include <linux/ftrace.h>
14
15#include "trace.h"
16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref;
20
21static void
22sched_switch_func(void *private, void *__rq, struct task_struct *prev,
23 struct task_struct *next)
24{
25 struct trace_array **ptr = private;
26 struct trace_array *tr = *ptr;
27 struct trace_array_cpu *data;
28 unsigned long flags;
29 long disabled;
30 int cpu;
31
32 if (!tracer_enabled)
33 return;
34
35 local_irq_save(flags);
36 cpu = raw_smp_processor_id();
37 data = tr->data[cpu];
38 disabled = atomic_inc_return(&data->disabled);
39
40 if (likely(disabled == 1))
41 tracing_sched_switch_trace(tr, data, prev, next, flags);
42
43 atomic_dec(&data->disabled);
44 local_irq_restore(flags);
45}
46
47static notrace void
48sched_switch_callback(void *probe_data, void *call_data,
49 const char *format, va_list *args)
50{
51 struct task_struct *prev;
52 struct task_struct *next;
53 struct rq *__rq;
54
55 if (!atomic_read(&sched_ref))
56 return;
57
58 /* skip prev_pid %d next_pid %d prev_state %ld */
59 (void)va_arg(*args, int);
60 (void)va_arg(*args, int);
61 (void)va_arg(*args, long);
62 __rq = va_arg(*args, typeof(__rq));
63 prev = va_arg(*args, typeof(prev));
64 next = va_arg(*args, typeof(next));
65
66 tracing_record_cmdline(prev);
67
68 /*
69 * If tracer_switch_func only points to the local
70 * switch func, it still needs the ptr passed to it.
71 */
72 sched_switch_func(probe_data, __rq, prev, next);
73}
74
75static void
76wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
77 task_struct *curr)
78{
79 struct trace_array **ptr = private;
80 struct trace_array *tr = *ptr;
81 struct trace_array_cpu *data;
82 unsigned long flags;
83 long disabled;
84 int cpu;
85
86 if (!tracer_enabled)
87 return;
88
89 tracing_record_cmdline(curr);
90
91 local_irq_save(flags);
92 cpu = raw_smp_processor_id();
93 data = tr->data[cpu];
94 disabled = atomic_inc_return(&data->disabled);
95
96 if (likely(disabled == 1))
97 tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
98
99 atomic_dec(&data->disabled);
100 local_irq_restore(flags);
101}
102
103static notrace void
104wake_up_callback(void *probe_data, void *call_data,
105 const char *format, va_list *args)
106{
107 struct task_struct *curr;
108 struct task_struct *task;
109 struct rq *__rq;
110
111 if (likely(!tracer_enabled))
112 return;
113
114 /* Skip pid %d state %ld */
115 (void)va_arg(*args, int);
116 (void)va_arg(*args, long);
117 /* now get the meat: "rq %p task %p rq->curr %p" */
118 __rq = va_arg(*args, typeof(__rq));
119 task = va_arg(*args, typeof(task));
120 curr = va_arg(*args, typeof(curr));
121
122 tracing_record_cmdline(task);
123 tracing_record_cmdline(curr);
124
125 wakeup_func(probe_data, __rq, task, curr);
126}
127
128void
129ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
130{
131 struct trace_array *tr = ctx_trace;
132 struct trace_array_cpu *data;
133 unsigned long flags;
134 long disabled;
135 int cpu;
136
137 if (!tracer_enabled)
138 return;
139
140 local_irq_save(flags);
141 cpu = raw_smp_processor_id();
142 data = tr->data[cpu];
143 disabled = atomic_inc_return(&data->disabled);
144
145 if (likely(disabled == 1))
146 __trace_special(tr, data, arg1, arg2, arg3);
147
148 atomic_dec(&data->disabled);
149 local_irq_restore(flags);
150}
151
152static void sched_switch_reset(struct trace_array *tr)
153{
154 int cpu;
155
156 tr->time_start = ftrace_now(tr->cpu);
157
158 for_each_online_cpu(cpu)
159 tracing_reset(tr->data[cpu]);
160}
161
162static int tracing_sched_register(void)
163{
164 int ret;
165
166 ret = marker_probe_register("kernel_sched_wakeup",
167 "pid %d state %ld ## rq %p task %p rq->curr %p",
168 wake_up_callback,
169 &ctx_trace);
170 if (ret) {
171 pr_info("wakeup trace: Couldn't add marker"
172 " probe to kernel_sched_wakeup\n");
173 return ret;
174 }
175
176 ret = marker_probe_register("kernel_sched_wakeup_new",
177 "pid %d state %ld ## rq %p task %p rq->curr %p",
178 wake_up_callback,
179 &ctx_trace);
180 if (ret) {
181 pr_info("wakeup trace: Couldn't add marker"
182 " probe to kernel_sched_wakeup_new\n");
183 goto fail_deprobe;
184 }
185
186 ret = marker_probe_register("kernel_sched_schedule",
187 "prev_pid %d next_pid %d prev_state %ld "
188 "## rq %p prev %p next %p",
189 sched_switch_callback,
190 &ctx_trace);
191 if (ret) {
192 pr_info("sched trace: Couldn't add marker"
193 " probe to kernel_sched_schedule\n");
194 goto fail_deprobe_wake_new;
195 }
196
197 return ret;
198fail_deprobe_wake_new:
199 marker_probe_unregister("kernel_sched_wakeup_new",
200 wake_up_callback,
201 &ctx_trace);
202fail_deprobe:
203 marker_probe_unregister("kernel_sched_wakeup",
204 wake_up_callback,
205 &ctx_trace);
206 return ret;
207}
208
209static void tracing_sched_unregister(void)
210{
211 marker_probe_unregister("kernel_sched_schedule",
212 sched_switch_callback,
213 &ctx_trace);
214 marker_probe_unregister("kernel_sched_wakeup_new",
215 wake_up_callback,
216 &ctx_trace);
217 marker_probe_unregister("kernel_sched_wakeup",
218 wake_up_callback,
219 &ctx_trace);
220}
221
222void tracing_start_sched_switch(void)
223{
224 long ref;
225
226 ref = atomic_inc_return(&sched_ref);
227 if (ref == 1)
228 tracing_sched_register();
229}
230
231void tracing_stop_sched_switch(void)
232{
233 long ref;
234
235 ref = atomic_dec_and_test(&sched_ref);
236 if (ref)
237 tracing_sched_unregister();
238}
239
240static void start_sched_trace(struct trace_array *tr)
241{
242 sched_switch_reset(tr);
243 atomic_inc(&trace_record_cmdline_enabled);
244 tracer_enabled = 1;
245 tracing_start_sched_switch();
246}
247
248static void stop_sched_trace(struct trace_array *tr)
249{
250 tracing_stop_sched_switch();
251 atomic_dec(&trace_record_cmdline_enabled);
252 tracer_enabled = 0;
253}
254
255static void sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258
259 if (tr->ctrl)
260 start_sched_trace(tr);
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (tr->ctrl)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_ctrl_update(struct trace_array *tr)
270{
271 /* When starting a new trace, reset the buffers */
272 if (tr->ctrl)
273 start_sched_trace(tr);
274 else
275 stop_sched_trace(tr);
276}
277
278static struct tracer sched_switch_trace __read_mostly =
279{
280 .name = "sched_switch",
281 .init = sched_switch_trace_init,
282 .reset = sched_switch_trace_reset,
283 .ctrl_update = sched_switch_trace_ctrl_update,
284#ifdef CONFIG_FTRACE_SELFTEST
285 .selftest = trace_selftest_startup_sched_switch,
286#endif
287};
288
289__init static int init_sched_switch_trace(void)
290{
291 int ret = 0;
292
293 if (atomic_read(&sched_ref))
294 ret = tracing_sched_register();
295 if (ret) {
296 pr_info("error registering scheduler trace\n");
297 return ret;
298 }
299 return register_tracer(&sched_switch_trace);
300}
301device_initcall(init_sched_switch_trace);
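The probes above depend on the markers API contract: the format string given to marker_probe_register() must match the trace_mark() call in the scheduler, and the callback pulls the arguments back out of the va_list in exactly that order. A toy illustration of the pairing (the marker name and fields below are hypothetical):

#include <linux/marker.h>
#include <linux/ftrace.h>
#include <linux/sched.h>
#include <linux/init.h>

/* instrumented site would carry something like:
 *	trace_mark(example_event, "pid %d task %p", p->pid, p);
 */

static notrace void example_probe(void *probe_data, void *call_data,
				  const char *format, va_list *args)
{
	struct task_struct *task;
	int pid;

	/* consume the va_list in the same order as the format string */
	pid  = va_arg(*args, int);
	task = va_arg(*args, typeof(task));

	/* ... record pid/task into the trace buffer ... */
}

static int __init example_register(void)
{
	/* the format string is repeated verbatim at registration time */
	return marker_probe_register("example_event", "pid %d task %p",
				     example_probe, NULL);
}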
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 000000000000..5d2fb48e47f8
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,382 @@
1/*
2 * trace task wakeup timings
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Based on code from the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/debugfs.h>
15#include <linux/kallsyms.h>
16#include <linux/uaccess.h>
17#include <linux/ftrace.h>
18#include <linux/marker.h>
19
20#include "trace.h"
21
22static struct trace_array *wakeup_trace;
23static int __read_mostly tracer_enabled;
24
25static struct task_struct *wakeup_task;
26static int wakeup_cpu;
27static unsigned wakeup_prio = -1;
28
29static DEFINE_SPINLOCK(wakeup_lock);
30
31static void __wakeup_reset(struct trace_array *tr);
32
33/*
34 * Should this new latency be reported/recorded?
35 */
36static int report_latency(cycle_t delta)
37{
38 if (tracing_thresh) {
39 if (delta < tracing_thresh)
40 return 0;
41 } else {
42 if (delta <= tracing_max_latency)
43 return 0;
44 }
45 return 1;
46}
47
48static void notrace
49wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
50 struct task_struct *next)
51{
52 unsigned long latency = 0, t0 = 0, t1 = 0;
53 struct trace_array **ptr = private;
54 struct trace_array *tr = *ptr;
55 struct trace_array_cpu *data;
56 cycle_t T0, T1, delta;
57 unsigned long flags;
58 long disabled;
59 int cpu;
60
61 if (unlikely(!tracer_enabled))
62 return;
63
64 /*
65 * When we start a new trace, we set wakeup_task to NULL
66 * and then set tracer_enabled = 1. We want to make sure
67 * that another CPU does not see the tracer_enabled = 1
68	 * together with a stale wakeup_task, which might
69	 * actually be the same as next.
70 */
71 smp_rmb();
72
73 if (next != wakeup_task)
74 return;
75
76	/* The task we are waiting for is waking up */
77 data = tr->data[wakeup_cpu];
78
79 /* disable local data, not wakeup_cpu data */
80 cpu = raw_smp_processor_id();
81 disabled = atomic_inc_return(&tr->data[cpu]->disabled);
82 if (likely(disabled != 1))
83 goto out;
84
85 spin_lock_irqsave(&wakeup_lock, flags);
86
87 /* We could race with grabbing wakeup_lock */
88 if (unlikely(!tracer_enabled || next != wakeup_task))
89 goto out_unlock;
90
91 trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
92
93 /*
94 * usecs conversion is slow so we try to delay the conversion
95 * as long as possible:
96 */
97 T0 = data->preempt_timestamp;
98 T1 = ftrace_now(cpu);
99 delta = T1-T0;
100
101 if (!report_latency(delta))
102 goto out_unlock;
103
104 latency = nsecs_to_usecs(delta);
105
106 tracing_max_latency = delta;
107 t0 = nsecs_to_usecs(T0);
108 t1 = nsecs_to_usecs(T1);
109
110 update_max_tr(tr, wakeup_task, wakeup_cpu);
111
112out_unlock:
113 __wakeup_reset(tr);
114 spin_unlock_irqrestore(&wakeup_lock, flags);
115out:
116 atomic_dec(&tr->data[cpu]->disabled);
117}
118
119static notrace void
120sched_switch_callback(void *probe_data, void *call_data,
121 const char *format, va_list *args)
122{
123 struct task_struct *prev;
124 struct task_struct *next;
125 struct rq *__rq;
126
127 /* skip prev_pid %d next_pid %d prev_state %ld */
128 (void)va_arg(*args, int);
129 (void)va_arg(*args, int);
130 (void)va_arg(*args, long);
131 __rq = va_arg(*args, typeof(__rq));
132 prev = va_arg(*args, typeof(prev));
133 next = va_arg(*args, typeof(next));
134
135 tracing_record_cmdline(prev);
136
137 /*
138 * If tracer_switch_func only points to the local
139 * switch func, it still needs the ptr passed to it.
140 */
141 wakeup_sched_switch(probe_data, __rq, prev, next);
142}
143
144static void __wakeup_reset(struct trace_array *tr)
145{
146 struct trace_array_cpu *data;
147 int cpu;
148
149 assert_spin_locked(&wakeup_lock);
150
151 for_each_possible_cpu(cpu) {
152 data = tr->data[cpu];
153 tracing_reset(data);
154 }
155
156 wakeup_cpu = -1;
157 wakeup_prio = -1;
158
159 if (wakeup_task)
160 put_task_struct(wakeup_task);
161
162 wakeup_task = NULL;
163}
164
165static void wakeup_reset(struct trace_array *tr)
166{
167 unsigned long flags;
168
169 spin_lock_irqsave(&wakeup_lock, flags);
170 __wakeup_reset(tr);
171 spin_unlock_irqrestore(&wakeup_lock, flags);
172}
173
174static void
175wakeup_check_start(struct trace_array *tr, struct task_struct *p,
176 struct task_struct *curr)
177{
178 int cpu = smp_processor_id();
179 unsigned long flags;
180 long disabled;
181
182 if (likely(!rt_task(p)) ||
183 p->prio >= wakeup_prio ||
184 p->prio >= curr->prio)
185 return;
186
187 disabled = atomic_inc_return(&tr->data[cpu]->disabled);
188 if (unlikely(disabled != 1))
189 goto out;
190
191 /* interrupts should be off from try_to_wake_up */
192 spin_lock(&wakeup_lock);
193
194 /* check for races. */
195 if (!tracer_enabled || p->prio >= wakeup_prio)
196 goto out_locked;
197
198 /* reset the trace */
199 __wakeup_reset(tr);
200
201 wakeup_cpu = task_cpu(p);
202 wakeup_prio = p->prio;
203
204 wakeup_task = p;
205 get_task_struct(wakeup_task);
206
207 local_save_flags(flags);
208
209 tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
210 trace_function(tr, tr->data[wakeup_cpu],
211 CALLER_ADDR1, CALLER_ADDR2, flags);
212
213out_locked:
214 spin_unlock(&wakeup_lock);
215out:
216 atomic_dec(&tr->data[cpu]->disabled);
217}
218
219static notrace void
220wake_up_callback(void *probe_data, void *call_data,
221 const char *format, va_list *args)
222{
223 struct trace_array **ptr = probe_data;
224 struct trace_array *tr = *ptr;
225 struct task_struct *curr;
226 struct task_struct *task;
227 struct rq *__rq;
228
229 if (likely(!tracer_enabled))
230 return;
231
232 /* Skip pid %d state %ld */
233 (void)va_arg(*args, int);
234 (void)va_arg(*args, long);
235 /* now get the meat: "rq %p task %p rq->curr %p" */
236 __rq = va_arg(*args, typeof(__rq));
237 task = va_arg(*args, typeof(task));
238 curr = va_arg(*args, typeof(curr));
239
240 tracing_record_cmdline(task);
241 tracing_record_cmdline(curr);
242
243 wakeup_check_start(tr, task, curr);
244}
245
246static void start_wakeup_tracer(struct trace_array *tr)
247{
248 int ret;
249
250 ret = marker_probe_register("kernel_sched_wakeup",
251 "pid %d state %ld ## rq %p task %p rq->curr %p",
252 wake_up_callback,
253 &wakeup_trace);
254 if (ret) {
255 pr_info("wakeup trace: Couldn't add marker"
256 " probe to kernel_sched_wakeup\n");
257 return;
258 }
259
260 ret = marker_probe_register("kernel_sched_wakeup_new",
261 "pid %d state %ld ## rq %p task %p rq->curr %p",
262 wake_up_callback,
263 &wakeup_trace);
264 if (ret) {
265 pr_info("wakeup trace: Couldn't add marker"
266 " probe to kernel_sched_wakeup_new\n");
267 goto fail_deprobe;
268 }
269
270 ret = marker_probe_register("kernel_sched_schedule",
271 "prev_pid %d next_pid %d prev_state %ld "
272 "## rq %p prev %p next %p",
273 sched_switch_callback,
274 &wakeup_trace);
275 if (ret) {
276 pr_info("sched trace: Couldn't add marker"
277 " probe to kernel_sched_schedule\n");
278 goto fail_deprobe_wake_new;
279 }
280
281 wakeup_reset(tr);
282
283 /*
284 * Don't let the tracer_enabled = 1 show up before
285 * the wakeup_task is reset. This may be overkill since
286 * wakeup_reset does a spin_unlock after setting the
287 * wakeup_task to NULL, but I want to be safe.
288 * This is a slow path anyway.
289 */
290 smp_wmb();
291
292 tracer_enabled = 1;
293
294 return;
295fail_deprobe_wake_new:
296 marker_probe_unregister("kernel_sched_wakeup_new",
297 wake_up_callback,
298 &wakeup_trace);
299fail_deprobe:
300 marker_probe_unregister("kernel_sched_wakeup",
301 wake_up_callback,
302 &wakeup_trace);
303}
304
305static void stop_wakeup_tracer(struct trace_array *tr)
306{
307 tracer_enabled = 0;
308 marker_probe_unregister("kernel_sched_schedule",
309 sched_switch_callback,
310 &wakeup_trace);
311 marker_probe_unregister("kernel_sched_wakeup_new",
312 wake_up_callback,
313 &wakeup_trace);
314 marker_probe_unregister("kernel_sched_wakeup",
315 wake_up_callback,
316 &wakeup_trace);
317}
318
319static void wakeup_tracer_init(struct trace_array *tr)
320{
321 wakeup_trace = tr;
322
323 if (tr->ctrl)
324 start_wakeup_tracer(tr);
325}
326
327static void wakeup_tracer_reset(struct trace_array *tr)
328{
329 if (tr->ctrl) {
330 stop_wakeup_tracer(tr);
331 /* make sure we put back any tasks we are tracing */
332 wakeup_reset(tr);
333 }
334}
335
336static void wakeup_tracer_ctrl_update(struct trace_array *tr)
337{
338 if (tr->ctrl)
339 start_wakeup_tracer(tr);
340 else
341 stop_wakeup_tracer(tr);
342}
343
344static void wakeup_tracer_open(struct trace_iterator *iter)
345{
346 /* stop the trace while dumping */
347 if (iter->tr->ctrl)
348 stop_wakeup_tracer(iter->tr);
349}
350
351static void wakeup_tracer_close(struct trace_iterator *iter)
352{
353 /* forget about any processes we were recording */
354 if (iter->tr->ctrl)
355 start_wakeup_tracer(iter->tr);
356}
357
358static struct tracer wakeup_tracer __read_mostly =
359{
360 .name = "wakeup",
361 .init = wakeup_tracer_init,
362 .reset = wakeup_tracer_reset,
363 .open = wakeup_tracer_open,
364 .close = wakeup_tracer_close,
365 .ctrl_update = wakeup_tracer_ctrl_update,
366 .print_max = 1,
367#ifdef CONFIG_FTRACE_SELFTEST
368 .selftest = trace_selftest_startup_wakeup,
369#endif
370};
371
372__init static int init_wakeup_tracer(void)
373{
374 int ret;
375
376 ret = register_tracer(&wakeup_tracer);
377 if (ret)
378 return ret;
379
380 return 0;
381}
382device_initcall(init_wakeup_tracer);
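What this tracer actually measures is the delay between a high-priority RT task being woken and that task getting onto a CPU; roughly (timeline reconstructed from the hooks above, for illustration only):

/*
 *  try_to_wake_up(rt_task)
 *      -> wakeup marker -> wakeup_check_start():
 *             wakeup_task = rt_task;
 *             T0 = preempt_timestamp = ftrace_now(cpu);
 *
 *  ... lower-priority work keeps the CPU busy ...
 *
 *  schedule() switches to rt_task
 *      -> sched_switch marker -> wakeup_sched_switch():
 *             T1 = ftrace_now(cpu);
 *             latency = nsecs_to_usecs(T1 - T0);
 *             kept via update_max_tr() if report_latency(T1 - T0)
 */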
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 000000000000..3877dd9102f1
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,539 @@
1/* Include in trace.c */
2
3#include <linux/kthread.h>
4#include <linux/delay.h>
5
6static inline int trace_valid_entry(struct trace_entry *entry)
7{
8 switch (entry->type) {
9 case TRACE_FN:
10 case TRACE_CTX:
11 case TRACE_WAKE:
12 case TRACE_STACK:
13 case TRACE_SPECIAL:
14 return 1;
15 }
16 return 0;
17}
18
19static int
20trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
21{
22 struct trace_entry *entries;
23 struct page *page;
24 int idx = 0;
25 int i;
26
27 BUG_ON(list_empty(&data->trace_pages));
28 page = list_entry(data->trace_pages.next, struct page, lru);
29 entries = page_address(page);
30
31 if (head_page(data) != entries)
32 goto failed;
33
34 /*
35 * The starting trace buffer always has valid elements,
36 * if any element exists.
37 */
38 entries = head_page(data);
39
40 for (i = 0; i < tr->entries; i++) {
41
42 if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
43 printk(KERN_CONT ".. invalid entry %d ",
44 entries[idx].type);
45 goto failed;
46 }
47
48 idx++;
49 if (idx >= ENTRIES_PER_PAGE) {
50 page = virt_to_page(entries);
51 if (page->lru.next == &data->trace_pages) {
52 if (i != tr->entries - 1) {
53 printk(KERN_CONT ".. entries buffer mismatch");
54 goto failed;
55 }
56 } else {
57 page = list_entry(page->lru.next, struct page, lru);
58 entries = page_address(page);
59 }
60 idx = 0;
61 }
62 }
63
64 page = virt_to_page(entries);
65 if (page->lru.next != &data->trace_pages) {
66 printk(KERN_CONT ".. too many entries");
67 goto failed;
68 }
69
70 return 0;
71
72 failed:
73 /* disable tracing */
74 tracing_disabled = 1;
75 printk(KERN_CONT ".. corrupted trace buffer .. ");
76 return -1;
77}
78
79/*
80 * Test the trace buffer to see if all the elements
81 * are still sane.
82 */
83static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
84{
85 unsigned long flags, cnt = 0;
86 int cpu, ret = 0;
87
88 /* Don't allow flipping of max traces now */
89 raw_local_irq_save(flags);
90 __raw_spin_lock(&ftrace_max_lock);
91 for_each_possible_cpu(cpu) {
92 if (!head_page(tr->data[cpu]))
93 continue;
94
95 cnt += tr->data[cpu]->trace_idx;
96
97 ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
98 if (ret)
99 break;
100 }
101 __raw_spin_unlock(&ftrace_max_lock);
102 raw_local_irq_restore(flags);
103
104 if (count)
105 *count = cnt;
106
107 return ret;
108}
109
110#ifdef CONFIG_FTRACE
111
112#ifdef CONFIG_DYNAMIC_FTRACE
113
114#define __STR(x) #x
115#define STR(x) __STR(x)
116
117/* Test dynamic code modification and ftrace filters */
118int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
119 struct trace_array *tr,
120 int (*func)(void))
121{
122 unsigned long count;
123 int ret;
124 int save_ftrace_enabled = ftrace_enabled;
125 int save_tracer_enabled = tracer_enabled;
126 char *func_name;
127
128 /* The ftrace test PASSED */
129 printk(KERN_CONT "PASSED\n");
130 pr_info("Testing dynamic ftrace: ");
131
132 /* enable tracing, and record the filter function */
133 ftrace_enabled = 1;
134 tracer_enabled = 1;
135
136	/* passed in by parameter to keep gcc from optimizing it away */
137 func();
138
139 /* update the records */
140 ret = ftrace_force_update();
141 if (ret) {
142 printk(KERN_CONT ".. ftraced failed .. ");
143 return ret;
144 }
145
146 /*
147	 * Some archs *cough*PowerPC*cough* add characters to the
148	 * start of the function names. We simply put a '*' to
149	 * accommodate them.
150 */
151 func_name = "*" STR(DYN_FTRACE_TEST_NAME);
152
153 /* filter only on our function */
154 ftrace_set_filter(func_name, strlen(func_name), 1);
155
156 /* enable tracing */
157 tr->ctrl = 1;
158 trace->init(tr);
159	/* Sleep for 1/10 of a second */
160 msleep(100);
161
162 /* we should have nothing in the buffer */
163 ret = trace_test_buffer(tr, &count);
164 if (ret)
165 goto out;
166
167 if (count) {
168 ret = -1;
169 printk(KERN_CONT ".. filter did not filter .. ");
170 goto out;
171 }
172
173 /* call our function again */
174 func();
175
176 /* sleep again */
177 msleep(100);
178
179 /* stop the tracing. */
180 tr->ctrl = 0;
181 trace->ctrl_update(tr);
182 ftrace_enabled = 0;
183
184 /* check the trace buffer */
185 ret = trace_test_buffer(tr, &count);
186 trace->reset(tr);
187
188 /* we should only have one item */
189 if (!ret && count != 1) {
190 printk(KERN_CONT ".. filter failed count=%ld ..", count);
191 ret = -1;
192 goto out;
193 }
194 out:
195 ftrace_enabled = save_ftrace_enabled;
196 tracer_enabled = save_tracer_enabled;
197
198 /* Enable tracing on all functions again */
199 ftrace_set_filter(NULL, 0, 1);
200
201 return ret;
202}
203#else
204# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
205#endif /* CONFIG_DYNAMIC_FTRACE */
206/*
207 * Simple verification test of ftrace function tracer.
208 * Enable ftrace, sleep 1/10 second, and then read the trace
209 * buffer to see if all is in order.
210 */
211int
212trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
213{
214 unsigned long count;
215 int ret;
216 int save_ftrace_enabled = ftrace_enabled;
217 int save_tracer_enabled = tracer_enabled;
218
219 /* make sure msleep has been recorded */
220 msleep(1);
221
222 /* force the recorded functions to be traced */
223 ret = ftrace_force_update();
224 if (ret) {
225 printk(KERN_CONT ".. ftraced failed .. ");
226 return ret;
227 }
228
229 /* start the tracing */
230 ftrace_enabled = 1;
231 tracer_enabled = 1;
232
233 tr->ctrl = 1;
234 trace->init(tr);
235	/* Sleep for 1/10 of a second */
236 msleep(100);
237 /* stop the tracing. */
238 tr->ctrl = 0;
239 trace->ctrl_update(tr);
240 ftrace_enabled = 0;
241
242 /* check the trace buffer */
243 ret = trace_test_buffer(tr, &count);
244 trace->reset(tr);
245
246 if (!ret && !count) {
247 printk(KERN_CONT ".. no entries found ..");
248 ret = -1;
249 goto out;
250 }
251
252 ret = trace_selftest_startup_dynamic_tracing(trace, tr,
253 DYN_FTRACE_TEST_NAME);
254
255 out:
256 ftrace_enabled = save_ftrace_enabled;
257 tracer_enabled = save_tracer_enabled;
258
259 /* kill ftrace totally if we failed */
260 if (ret)
261 ftrace_kill();
262
263 return ret;
264}
265#endif /* CONFIG_FTRACE */
266
267#ifdef CONFIG_IRQSOFF_TRACER
268int
269trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
270{
271 unsigned long save_max = tracing_max_latency;
272 unsigned long count;
273 int ret;
274
275 /* start the tracing */
276 tr->ctrl = 1;
277 trace->init(tr);
278 /* reset the max latency */
279 tracing_max_latency = 0;
280 /* disable interrupts for a bit */
281 local_irq_disable();
282 udelay(100);
283 local_irq_enable();
284 /* stop the tracing. */
285 tr->ctrl = 0;
286 trace->ctrl_update(tr);
287 /* check both trace buffers */
288 ret = trace_test_buffer(tr, NULL);
289 if (!ret)
290 ret = trace_test_buffer(&max_tr, &count);
291 trace->reset(tr);
292
293 if (!ret && !count) {
294 printk(KERN_CONT ".. no entries found ..");
295 ret = -1;
296 }
297
298 tracing_max_latency = save_max;
299
300 return ret;
301}
302#endif /* CONFIG_IRQSOFF_TRACER */
303
304#ifdef CONFIG_PREEMPT_TRACER
305int
306trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
307{
308 unsigned long save_max = tracing_max_latency;
309 unsigned long count;
310 int ret;
311
312 /* start the tracing */
313 tr->ctrl = 1;
314 trace->init(tr);
315 /* reset the max latency */
316 tracing_max_latency = 0;
317 /* disable preemption for a bit */
318 preempt_disable();
319 udelay(100);
320 preempt_enable();
321 /* stop the tracing. */
322 tr->ctrl = 0;
323 trace->ctrl_update(tr);
324 /* check both trace buffers */
325 ret = trace_test_buffer(tr, NULL);
326 if (!ret)
327 ret = trace_test_buffer(&max_tr, &count);
328 trace->reset(tr);
329
330 if (!ret && !count) {
331 printk(KERN_CONT ".. no entries found ..");
332 ret = -1;
333 }
334
335 tracing_max_latency = save_max;
336
337 return ret;
338}
339#endif /* CONFIG_PREEMPT_TRACER */
340
341#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
342int
343trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
344{
345 unsigned long save_max = tracing_max_latency;
346 unsigned long count;
347 int ret;
348
349 /* start the tracing */
350 tr->ctrl = 1;
351 trace->init(tr);
352
353 /* reset the max latency */
354 tracing_max_latency = 0;
355
356 /* disable preemption and interrupts for a bit */
357 preempt_disable();
358 local_irq_disable();
359 udelay(100);
360 preempt_enable();
361 /* reverse the order of preempt vs irqs */
362 local_irq_enable();
363
364 /* stop the tracing. */
365 tr->ctrl = 0;
366 trace->ctrl_update(tr);
367 /* check both trace buffers */
368 ret = trace_test_buffer(tr, NULL);
369 if (ret)
370 goto out;
371
372 ret = trace_test_buffer(&max_tr, &count);
373 if (ret)
374 goto out;
375
376 if (!ret && !count) {
377 printk(KERN_CONT ".. no entries found ..");
378 ret = -1;
379 goto out;
380 }
381
382 /* do the test by disabling interrupts first this time */
383 tracing_max_latency = 0;
384 tr->ctrl = 1;
385 trace->ctrl_update(tr);
386 preempt_disable();
387 local_irq_disable();
388 udelay(100);
389 preempt_enable();
390 /* reverse the order of preempt vs irqs */
391 local_irq_enable();
392
393 /* stop the tracing. */
394 tr->ctrl = 0;
395 trace->ctrl_update(tr);
396 /* check both trace buffers */
397 ret = trace_test_buffer(tr, NULL);
398 if (ret)
399 goto out;
400
401 ret = trace_test_buffer(&max_tr, &count);
402
403 if (!ret && !count) {
404 printk(KERN_CONT ".. no entries found ..");
405 ret = -1;
406 goto out;
407 }
408
409 out:
410 trace->reset(tr);
411 tracing_max_latency = save_max;
412
413 return ret;
414}
415#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
416
417#ifdef CONFIG_SCHED_TRACER
418static int trace_wakeup_test_thread(void *data)
419{
420	/* Make this an RT thread; the priority doesn't need to be too high */
421 struct sched_param param = { .sched_priority = 5 };
422 struct completion *x = data;
423
424 sched_setscheduler(current, SCHED_FIFO, &param);
425
426	/* Let the test know we now run at our new prio */
427 complete(x);
428
429 /* now go to sleep and let the test wake us up */
430 set_current_state(TASK_INTERRUPTIBLE);
431 schedule();
432
433 /* we are awake, now wait to disappear */
434 while (!kthread_should_stop()) {
435 /*
436 * This is an RT task, do short sleeps to let
437 * others run.
438 */
439 msleep(100);
440 }
441
442 return 0;
443}
444
445int
446trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
447{
448 unsigned long save_max = tracing_max_latency;
449 struct task_struct *p;
450 struct completion isrt;
451 unsigned long count;
452 int ret;
453
454 init_completion(&isrt);
455
456 /* create a high prio thread */
457 p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
458 if (IS_ERR(p)) {
459 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
460 return -1;
461 }
462
463 /* make sure the thread is running at an RT prio */
464 wait_for_completion(&isrt);
465
466 /* start the tracing */
467 tr->ctrl = 1;
468 trace->init(tr);
469 /* reset the max latency */
470 tracing_max_latency = 0;
471
472 /* sleep to let the RT thread sleep too */
473 msleep(100);
474
475 /*
476	 * Yes, this is slightly racy. It is possible that, for some
477	 * strange reason, the RT thread we created did not call
478	 * schedule for 100ms after doing the completion, and we do
479	 * a wakeup on a task that is already awake. But that is
480	 * extremely unlikely, and the worst thing that happens in
481	 * such a case is that we disable tracing. Honestly, if this
482	 * race does happen, something is horribly wrong with the
483	 * system.
484 */
485
486 wake_up_process(p);
487
488 /* stop the tracing. */
489 tr->ctrl = 0;
490 trace->ctrl_update(tr);
491 /* check both trace buffers */
492 ret = trace_test_buffer(tr, NULL);
493 if (!ret)
494 ret = trace_test_buffer(&max_tr, &count);
495
496
497 trace->reset(tr);
498
499 tracing_max_latency = save_max;
500
501 /* kill the thread */
502 kthread_stop(p);
503
504 if (!ret && !count) {
505 printk(KERN_CONT ".. no entries found ..");
506 ret = -1;
507 }
508
509 return ret;
510}
511#endif /* CONFIG_SCHED_TRACER */
512
513#ifdef CONFIG_CONTEXT_SWITCH_TRACER
514int
515trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
516{
517 unsigned long count;
518 int ret;
519
520 /* start the tracing */
521 tr->ctrl = 1;
522 trace->init(tr);
523	/* Sleep for 1/10 of a second */
524 msleep(100);
525 /* stop the tracing. */
526 tr->ctrl = 0;
527 trace->ctrl_update(tr);
528 /* check the trace buffer */
529 ret = trace_test_buffer(tr, &count);
530 trace->reset(tr);
531
532 if (!ret && !count) {
533 printk(KERN_CONT ".. no entries found ..");
534 ret = -1;
535 }
536
537 return ret;
538}
539#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 000000000000..54dd77cce5bf
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,7 @@
1#include "trace.h"
2
3int DYN_FTRACE_TEST_NAME(void)
4{
5 /* used to call mcount */
6 return 0;
7}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d2099f41aa1e..d8b6279a9b42 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -634,6 +634,8 @@ config LATENCYTOP
634 Enable this option if you want to use the LatencyTOP tool 634 Enable this option if you want to use the LatencyTOP tool
635 to find out which userspace is blocking on what kernel operations. 635 to find out which userspace is blocking on what kernel operations.
636 636
637source kernel/trace/Kconfig
638
637config PROVIDE_OHCI1394_DMA_INIT 639config PROVIDE_OHCI1394_DMA_INIT
638 bool "Remote debugging over FireWire early on boot" 640 bool "Remote debugging over FireWire early on boot"
639 depends on PCI && X86 641 depends on PCI && X86
diff --git a/lib/Makefile b/lib/Makefile
index 74b0cfb1fcc3..4b836a53c08f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,6 +8,15 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
8 sha1.o irq_regs.o reciprocal_div.o argv_split.o \ 8 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
9 proportions.o prio_heap.o ratelimit.o 9 proportions.o prio_heap.o ratelimit.o
10 10
11ifdef CONFIG_FTRACE
12# Do not profile string.o, since it may be used in early boot or vdso
13CFLAGS_REMOVE_string.o = -pg
14# Also do not profile any debug utilities
15CFLAGS_REMOVE_spinlock_debug.o = -pg
16CFLAGS_REMOVE_list_debug.o = -pg
17CFLAGS_REMOVE_debugobjects.o = -pg
18endif
19
11lib-$(CONFIG_MMU) += ioremap.o 20lib-$(CONFIG_MMU) += ioremap.o
12lib-$(CONFIG_SMP) += cpumask.o 21lib-$(CONFIG_SMP) += cpumask.o
13 22
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 6c90fb90e19c..3b4dc098181e 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -7,7 +7,7 @@
7#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9 9
10unsigned int debug_smp_processor_id(void) 10notrace unsigned int debug_smp_processor_id(void)
11{ 11{
12 unsigned long preempt_count = preempt_count(); 12 unsigned long preempt_count = preempt_count();
13 int this_cpu = raw_smp_processor_id(); 13 int this_cpu = raw_smp_processor_id();
@@ -37,7 +37,7 @@ unsigned int debug_smp_processor_id(void)
37 /* 37 /*
38 * Avoid recursion: 38 * Avoid recursion:
39 */ 39 */
40 preempt_disable(); 40 preempt_disable_notrace();
41 41
42 if (!printk_ratelimit()) 42 if (!printk_ratelimit())
43 goto out_enable; 43 goto out_enable;
@@ -49,7 +49,7 @@ unsigned int debug_smp_processor_id(void)
49 dump_stack(); 49 dump_stack();
50 50
51out_enable: 51out_enable:
52 preempt_enable_no_resched(); 52 preempt_enable_no_resched_notrace();
53out: 53out:
54 return this_cpu; 54 return this_cpu;
55} 55}
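The notrace marking and the _notrace preemption helpers exist to break a recursion loop: the tracer's own callbacks use smp_processor_id(), and with CONFIG_DEBUG_PREEMPT that lands in this function. The problematic chain, reconstructed here purely for illustration:

/*
 *   mcount -> tracer callback
 *     -> smp_processor_id()              (debug variant)
 *       -> debug_smp_processor_id()      compiled with -pg, so ...
 *         -> mcount -> tracer callback -> ...        (recursion)
 *
 * notrace drops the mcount call from this function, and the
 * preempt_*_notrace() helpers adjust the preempt count without going
 * through the preempt-off tracing hooks.
 */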
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6adbef37..b38f700825fc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages);
126static struct prop_descriptor vm_completions; 126static struct prop_descriptor vm_completions;
127static struct prop_descriptor vm_dirties; 127static struct prop_descriptor vm_dirties;
128 128
129static unsigned long determine_dirtyable_memory(void);
130
131/* 129/*
132 * couple the period to the dirty_ratio: 130 * couple the period to the dirty_ratio:
133 * 131 *
@@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
347#endif 345#endif
348} 346}
349 347
350static unsigned long determine_dirtyable_memory(void) 348/**
349 * determine_dirtyable_memory - amount of memory that may be used
350 *
 351 * Returns the number of pages that can currently be freed and used
352 * by the kernel for direct mappings.
353 */
354unsigned long determine_dirtyable_memory(void)
351{ 355{
352 unsigned long x; 356 unsigned long x;
353 357
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 8e440233c27d..ea48b82a3707 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -96,7 +96,8 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
96modname_flags = $(if $(filter 1,$(words $(modname))),\ 96modname_flags = $(if $(filter 1,$(words $(modname))),\
97 -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") 97 -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
98 98
99_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) 99orig_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o)
100_c_flags = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags))
100_a_flags = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o) 101_a_flags = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o)
101_cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F)) 102_cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F))
102 103