aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/powerpc/include/asm/ftrace.h14
-rw-r--r--arch/powerpc/include/asm/module.h16
-rw-r--r--arch/powerpc/kernel/Makefile1
-rw-r--r--arch/powerpc/kernel/entry_32.S40
-rw-r--r--arch/powerpc/kernel/entry_64.S12
-rw-r--r--arch/powerpc/kernel/ftrace.c461
-rw-r--r--arch/powerpc/kernel/idle.c5
-rw-r--r--arch/powerpc/kernel/module_32.c10
-rw-r--r--arch/powerpc/kernel/module_64.c13
-rw-r--r--arch/powerpc/lib/Makefile3
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/Kconfig.cpu1
-rw-r--r--arch/x86/Kconfig.debug4
-rw-r--r--arch/x86/include/asm/ds.h305
-rw-r--r--arch/x86/include/asm/ftrace.h61
-rw-r--r--arch/x86/include/asm/processor.h13
-rw-r--r--arch/x86/include/asm/ptrace.h36
-rw-r--r--arch/x86/include/asm/thread_info.h7
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/apic.c3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c5
-rw-r--r--arch/x86/kernel/ds.c1138
-rw-r--r--arch/x86/kernel/dumpstack.c351
-rw-r--r--arch/x86/kernel/dumpstack.h39
-rw-r--r--arch/x86/kernel/dumpstack_32.c307
-rw-r--r--arch/x86/kernel/dumpstack_64.c289
-rw-r--r--arch/x86/kernel/entry_32.S51
-rw-r--r--arch/x86/kernel/entry_64.S98
-rw-r--r--arch/x86/kernel/ftrace.c390
-rw-r--r--arch/x86/kernel/irq_64.c3
-rw-r--r--arch/x86/kernel/process.c16
-rw-r--r--arch/x86/kernel/process_32.c63
-rw-r--r--arch/x86/kernel/process_64.c54
-rw-r--r--arch/x86/kernel/ptrace.c399
-rw-r--r--arch/x86/kernel/stacktrace.c64
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S1
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S1
-rw-r--r--arch/x86/kernel/vsyscall_64.c3
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/fault.c13
-rw-r--r--arch/x86/vdso/vclock_gettime.c3
42 files changed, 2515 insertions, 1791 deletions
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index b298f7a631e6..e5f2ae8362f7 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -7,7 +7,19 @@
7 7
8#ifndef __ASSEMBLY__ 8#ifndef __ASSEMBLY__
9extern void _mcount(void); 9extern void _mcount(void);
10#endif 10
11#ifdef CONFIG_DYNAMIC_FTRACE
12static inline unsigned long ftrace_call_adjust(unsigned long addr)
13{
14 /* reloction of mcount call site is the same as the address */
15 return addr;
16}
17
18struct dyn_arch_ftrace {
19 struct module *mod;
20};
21#endif /* CONFIG_DYNAMIC_FTRACE */
22#endif /* __ASSEMBLY__ */
11 23
12#endif 24#endif
13 25
diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index e5f14b13ccf0..08454880a2c0 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -34,11 +34,19 @@ struct mod_arch_specific {
34#ifdef __powerpc64__ 34#ifdef __powerpc64__
35 unsigned int stubs_section; /* Index of stubs section in module */ 35 unsigned int stubs_section; /* Index of stubs section in module */
36 unsigned int toc_section; /* What section is the TOC? */ 36 unsigned int toc_section; /* What section is the TOC? */
37#else 37#ifdef CONFIG_DYNAMIC_FTRACE
38 unsigned long toc;
39 unsigned long tramp;
40#endif
41
42#else /* powerpc64 */
38 /* Indices of PLT sections within module. */ 43 /* Indices of PLT sections within module. */
39 unsigned int core_plt_section; 44 unsigned int core_plt_section;
40 unsigned int init_plt_section; 45 unsigned int init_plt_section;
46#ifdef CONFIG_DYNAMIC_FTRACE
47 unsigned long tramp;
41#endif 48#endif
49#endif /* powerpc64 */
42 50
43 /* List of BUG addresses, source line numbers and filenames */ 51 /* List of BUG addresses, source line numbers and filenames */
44 struct list_head bug_list; 52 struct list_head bug_list;
@@ -68,6 +76,12 @@ struct mod_arch_specific {
68# endif /* MODULE */ 76# endif /* MODULE */
69#endif 77#endif
70 78
79#ifdef CONFIG_DYNAMIC_FTRACE
80# ifdef MODULE
81 asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous");
82# endif /* MODULE */
83#endif
84
71 85
72struct exception_table_entry; 86struct exception_table_entry;
73void sort_ex_table(struct exception_table_entry *start, 87void sort_ex_table(struct exception_table_entry *start,
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 92673b43858d..d17edb4a2f9d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER
17CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog 17CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
18CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog 18CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
19CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog 19CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
20CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
20 21
21ifdef CONFIG_DYNAMIC_FTRACE 22ifdef CONFIG_DYNAMIC_FTRACE
22# dynamic ftrace setup. 23# dynamic ftrace setup.
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 7ecc0d1855c3..6f7eb7e00c79 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -1162,39 +1162,17 @@ machine_check_in_rtas:
1162#ifdef CONFIG_DYNAMIC_FTRACE 1162#ifdef CONFIG_DYNAMIC_FTRACE
1163_GLOBAL(mcount) 1163_GLOBAL(mcount)
1164_GLOBAL(_mcount) 1164_GLOBAL(_mcount)
1165 stwu r1,-48(r1) 1165 /*
1166 stw r3, 12(r1) 1166 * It is required that _mcount on PPC32 must preserve the
1167 stw r4, 16(r1) 1167 * link register. But we have r0 to play with. We use r0
1168 stw r5, 20(r1) 1168 * to push the return address back to the caller of mcount
1169 stw r6, 24(r1) 1169 * into the ctr register, restore the link register and
1170 mflr r3 1170 * then jump back using the ctr register.
1171 stw r7, 28(r1) 1171 */
1172 mfcr r5 1172 mflr r0
1173 stw r8, 32(r1)
1174 stw r9, 36(r1)
1175 stw r10,40(r1)
1176 stw r3, 44(r1)
1177 stw r5, 8(r1)
1178 subi r3, r3, MCOUNT_INSN_SIZE
1179 .globl mcount_call
1180mcount_call:
1181 bl ftrace_stub
1182 nop
1183 lwz r6, 8(r1)
1184 lwz r0, 44(r1)
1185 lwz r3, 12(r1)
1186 mtctr r0 1173 mtctr r0
1187 lwz r4, 16(r1) 1174 lwz r0, 4(r1)
1188 mtcr r6
1189 lwz r5, 20(r1)
1190 lwz r6, 24(r1)
1191 lwz r0, 52(r1)
1192 lwz r7, 28(r1)
1193 lwz r8, 32(r1)
1194 mtlr r0 1175 mtlr r0
1195 lwz r9, 36(r1)
1196 lwz r10,40(r1)
1197 addi r1, r1, 48
1198 bctr 1176 bctr
1199 1177
1200_GLOBAL(ftrace_caller) 1178_GLOBAL(ftrace_caller)
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index e0bcf9354286..383ed6eb0085 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -894,18 +894,6 @@ _GLOBAL(enter_prom)
894#ifdef CONFIG_DYNAMIC_FTRACE 894#ifdef CONFIG_DYNAMIC_FTRACE
895_GLOBAL(mcount) 895_GLOBAL(mcount)
896_GLOBAL(_mcount) 896_GLOBAL(_mcount)
897 /* Taken from output of objdump from lib64/glibc */
898 mflr r3
899 stdu r1, -112(r1)
900 std r3, 128(r1)
901 subi r3, r3, MCOUNT_INSN_SIZE
902 .globl mcount_call
903mcount_call:
904 bl ftrace_stub
905 nop
906 ld r0, 128(r1)
907 mtlr r0
908 addi r1, r1, 112
909 blr 897 blr
910 898
911_GLOBAL(ftrace_caller) 899_GLOBAL(ftrace_caller)
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index f4b006ed0ab1..5355244c99ff 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -9,22 +9,30 @@
9 9
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/uaccess.h>
13#include <linux/module.h>
12#include <linux/ftrace.h> 14#include <linux/ftrace.h>
13#include <linux/percpu.h> 15#include <linux/percpu.h>
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/list.h> 17#include <linux/list.h>
16 18
17#include <asm/cacheflush.h> 19#include <asm/cacheflush.h>
20#include <asm/code-patching.h>
18#include <asm/ftrace.h> 21#include <asm/ftrace.h>
19 22
23#if 0
24#define DEBUGP printk
25#else
26#define DEBUGP(fmt , ...) do { } while (0)
27#endif
20 28
21static unsigned int ftrace_nop = 0x60000000; 29static unsigned int ftrace_nop = PPC_NOP_INSTR;
22 30
23#ifdef CONFIG_PPC32 31#ifdef CONFIG_PPC32
24# define GET_ADDR(addr) addr 32# define GET_ADDR(addr) addr
25#else 33#else
26/* PowerPC64's functions are data that points to the functions */ 34/* PowerPC64's functions are data that points to the functions */
27# define GET_ADDR(addr) *(unsigned long *)addr 35# define GET_ADDR(addr) (*(unsigned long *)addr)
28#endif 36#endif
29 37
30 38
@@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr)
33 return (int)(addr - ip); 41 return (int)(addr - ip);
34} 42}
35 43
36unsigned char *ftrace_nop_replace(void) 44static unsigned char *ftrace_nop_replace(void)
37{ 45{
38 return (char *)&ftrace_nop; 46 return (char *)&ftrace_nop;
39} 47}
40 48
41unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) 49static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
42{ 50{
43 static unsigned int op; 51 static unsigned int op;
44 52
@@ -68,49 +76,422 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
68# define _ASM_PTR " .long " 76# define _ASM_PTR " .long "
69#endif 77#endif
70 78
71int 79static int
72ftrace_modify_code(unsigned long ip, unsigned char *old_code, 80ftrace_modify_code(unsigned long ip, unsigned char *old_code,
73 unsigned char *new_code) 81 unsigned char *new_code)
74{ 82{
75 unsigned replaced; 83 unsigned char replaced[MCOUNT_INSN_SIZE];
76 unsigned old = *(unsigned *)old_code;
77 unsigned new = *(unsigned *)new_code;
78 int faulted = 0;
79 84
80 /* 85 /*
81 * Note: Due to modules and __init, code can 86 * Note: Due to modules and __init, code can
82 * disappear and change, we need to protect against faulting 87 * disappear and change, we need to protect against faulting
83 * as well as code changing. 88 * as well as code changing. We do this by using the
89 * probe_kernel_* functions.
84 * 90 *
85 * No real locking needed, this code is run through 91 * No real locking needed, this code is run through
86 * kstop_machine. 92 * kstop_machine, or before SMP starts.
87 */ 93 */
88 asm volatile ( 94
89 "1: lwz %1, 0(%2)\n" 95 /* read the text we want to modify */
90 " cmpw %1, %5\n" 96 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
91 " bne 2f\n" 97 return -EFAULT;
92 " stwu %3, 0(%2)\n" 98
93 "2:\n" 99 /* Make sure it is what we expect it to be */
94 ".section .fixup, \"ax\"\n" 100 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
95 "3: li %0, 1\n" 101 return -EINVAL;
96 " b 2b\n" 102
97 ".previous\n" 103 /* replace the text with the new text */
98 ".section __ex_table,\"a\"\n" 104 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
99 _ASM_ALIGN "\n" 105 return -EPERM;
100 _ASM_PTR "1b, 3b\n" 106
101 ".previous" 107 flush_icache_range(ip, ip + 8);
102 : "=r"(faulted), "=r"(replaced) 108
103 : "r"(ip), "r"(new), 109 return 0;
104 "0"(faulted), "r"(old) 110}
105 : "memory"); 111
106 112/*
107 if (replaced != old && replaced != new) 113 * Helper functions that are the same for both PPC64 and PPC32.
108 faulted = 2; 114 */
109 115static int test_24bit_addr(unsigned long ip, unsigned long addr)
110 if (!faulted) 116{
111 flush_icache_range(ip, ip + 8); 117
112 118 /* use the create_branch to verify that this offset can be branched */
113 return faulted; 119 return create_branch((unsigned int *)ip, addr, 0);
120}
121
122static int is_bl_op(unsigned int op)
123{
124 return (op & 0xfc000003) == 0x48000001;
125}
126
127static unsigned long find_bl_target(unsigned long ip, unsigned int op)
128{
129 static int offset;
130
131 offset = (op & 0x03fffffc);
132 /* make it signed */
133 if (offset & 0x02000000)
134 offset |= 0xfe000000;
135
136 return ip + (long)offset;
137}
138
139#ifdef CONFIG_PPC64
140static int
141__ftrace_make_nop(struct module *mod,
142 struct dyn_ftrace *rec, unsigned long addr)
143{
144 unsigned int op;
145 unsigned int jmp[5];
146 unsigned long ptr;
147 unsigned long ip = rec->ip;
148 unsigned long tramp;
149 int offset;
150
151 /* read where this goes */
152 if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
153 return -EFAULT;
154
155 /* Make sure that that this is still a 24bit jump */
156 if (!is_bl_op(op)) {
157 printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
158 return -EINVAL;
159 }
160
161 /* lets find where the pointer goes */
162 tramp = find_bl_target(ip, op);
163
164 /*
165 * On PPC64 the trampoline looks like:
166 * 0x3d, 0x82, 0x00, 0x00, addis r12,r2, <high>
167 * 0x39, 0x8c, 0x00, 0x00, addi r12,r12, <low>
168 * Where the bytes 2,3,6 and 7 make up the 32bit offset
169 * to the TOC that holds the pointer.
170 * to jump to.
171 * 0xf8, 0x41, 0x00, 0x28, std r2,40(r1)
172 * 0xe9, 0x6c, 0x00, 0x20, ld r11,32(r12)
173 * The actually address is 32 bytes from the offset
174 * into the TOC.
175 * 0xe8, 0x4c, 0x00, 0x28, ld r2,40(r12)
176 */
177
178 DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
179
180 /* Find where the trampoline jumps to */
181 if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
182 printk(KERN_ERR "Failed to read %lx\n", tramp);
183 return -EFAULT;
184 }
185
186 DEBUGP(" %08x %08x", jmp[0], jmp[1]);
187
188 /* verify that this is what we expect it to be */
189 if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
190 ((jmp[1] & 0xffff0000) != 0x398c0000) ||
191 (jmp[2] != 0xf8410028) ||
192 (jmp[3] != 0xe96c0020) ||
193 (jmp[4] != 0xe84c0028)) {
194 printk(KERN_ERR "Not a trampoline\n");
195 return -EINVAL;
196 }
197
198 offset = (unsigned)((unsigned short)jmp[0]) << 16 |
199 (unsigned)((unsigned short)jmp[1]);
200
201 DEBUGP(" %x ", offset);
202
203 /* get the address this jumps too */
204 tramp = mod->arch.toc + offset + 32;
205 DEBUGP("toc: %lx", tramp);
206
207 if (probe_kernel_read(jmp, (void *)tramp, 8)) {
208 printk(KERN_ERR "Failed to read %lx\n", tramp);
209 return -EFAULT;
210 }
211
212 DEBUGP(" %08x %08x\n", jmp[0], jmp[1]);
213
214 ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
215
216 /* This should match what was called */
217 if (ptr != GET_ADDR(addr)) {
218 printk(KERN_ERR "addr does not match %lx\n", ptr);
219 return -EINVAL;
220 }
221
222 /*
223 * We want to nop the line, but the next line is
224 * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1)
225 * This needs to be turned to a nop too.
226 */
227 if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
228 return -EFAULT;
229
230 if (op != 0xe8410028) {
231 printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
232 return -EINVAL;
233 }
234
235 /*
236 * Milton Miller pointed out that we can not blindly do nops.
237 * If a task was preempted when calling a trace function,
238 * the nops will remove the way to restore the TOC in r2
239 * and the r2 TOC will get corrupted.
240 */
241
242 /*
243 * Replace:
244 * bl <tramp> <==== will be replaced with "b 1f"
245 * ld r2,40(r1)
246 * 1:
247 */
248 op = 0x48000008; /* b +8 */
249
250 if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
251 return -EPERM;
252
253
254 flush_icache_range(ip, ip + 8);
255
256 return 0;
257}
258
259#else /* !PPC64 */
260static int
261__ftrace_make_nop(struct module *mod,
262 struct dyn_ftrace *rec, unsigned long addr)
263{
264 unsigned int op;
265 unsigned int jmp[4];
266 unsigned long ip = rec->ip;
267 unsigned long tramp;
268
269 if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
270 return -EFAULT;
271
272 /* Make sure that that this is still a 24bit jump */
273 if (!is_bl_op(op)) {
274 printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
275 return -EINVAL;
276 }
277
278 /* lets find where the pointer goes */
279 tramp = find_bl_target(ip, op);
280
281 /*
282 * On PPC32 the trampoline looks like:
283 * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha
284 * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l
285 * 0x7d, 0x69, 0x03, 0xa6 mtctr r11
286 * 0x4e, 0x80, 0x04, 0x20 bctr
287 */
288
289 DEBUGP("ip:%lx jumps to %lx", ip, tramp);
290
291 /* Find where the trampoline jumps to */
292 if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
293 printk(KERN_ERR "Failed to read %lx\n", tramp);
294 return -EFAULT;
295 }
296
297 DEBUGP(" %08x %08x ", jmp[0], jmp[1]);
298
299 /* verify that this is what we expect it to be */
300 if (((jmp[0] & 0xffff0000) != 0x3d600000) ||
301 ((jmp[1] & 0xffff0000) != 0x396b0000) ||
302 (jmp[2] != 0x7d6903a6) ||
303 (jmp[3] != 0x4e800420)) {
304 printk(KERN_ERR "Not a trampoline\n");
305 return -EINVAL;
306 }
307
308 tramp = (jmp[1] & 0xffff) |
309 ((jmp[0] & 0xffff) << 16);
310 if (tramp & 0x8000)
311 tramp -= 0x10000;
312
313 DEBUGP(" %x ", tramp);
314
315 if (tramp != addr) {
316 printk(KERN_ERR
317 "Trampoline location %08lx does not match addr\n",
318 tramp);
319 return -EINVAL;
320 }
321
322 op = PPC_NOP_INSTR;
323
324 if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
325 return -EPERM;
326
327 flush_icache_range(ip, ip + 8);
328
329 return 0;
330}
331#endif /* PPC64 */
332
333int ftrace_make_nop(struct module *mod,
334 struct dyn_ftrace *rec, unsigned long addr)
335{
336 unsigned char *old, *new;
337 unsigned long ip = rec->ip;
338
339 /*
340 * If the calling address is more that 24 bits away,
341 * then we had to use a trampoline to make the call.
342 * Otherwise just update the call site.
343 */
344 if (test_24bit_addr(ip, addr)) {
345 /* within range */
346 old = ftrace_call_replace(ip, addr);
347 new = ftrace_nop_replace();
348 return ftrace_modify_code(ip, old, new);
349 }
350
351 /*
352 * Out of range jumps are called from modules.
353 * We should either already have a pointer to the module
354 * or it has been passed in.
355 */
356 if (!rec->arch.mod) {
357 if (!mod) {
358 printk(KERN_ERR "No module loaded addr=%lx\n",
359 addr);
360 return -EFAULT;
361 }
362 rec->arch.mod = mod;
363 } else if (mod) {
364 if (mod != rec->arch.mod) {
365 printk(KERN_ERR
366 "Record mod %p not equal to passed in mod %p\n",
367 rec->arch.mod, mod);
368 return -EINVAL;
369 }
370 /* nothing to do if mod == rec->arch.mod */
371 } else
372 mod = rec->arch.mod;
373
374 return __ftrace_make_nop(mod, rec, addr);
375
376}
377
378#ifdef CONFIG_PPC64
379static int
380__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
381{
382 unsigned int op[2];
383 unsigned long ip = rec->ip;
384
385 /* read where this goes */
386 if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
387 return -EFAULT;
388
389 /*
390 * It should be pointing to two nops or
391 * b +8; ld r2,40(r1)
392 */
393 if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
394 ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) {
395 printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
396 return -EINVAL;
397 }
398
399 /* If we never set up a trampoline to ftrace_caller, then bail */
400 if (!rec->arch.mod->arch.tramp) {
401 printk(KERN_ERR "No ftrace trampoline\n");
402 return -EINVAL;
403 }
404
405 /* create the branch to the trampoline */
406 op[0] = create_branch((unsigned int *)ip,
407 rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
408 if (!op[0]) {
409 printk(KERN_ERR "REL24 out of range!\n");
410 return -EINVAL;
411 }
412
413 /* ld r2,40(r1) */
414 op[1] = 0xe8410028;
415
416 DEBUGP("write to %lx\n", rec->ip);
417
418 if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
419 return -EPERM;
420
421 flush_icache_range(ip, ip + 8);
422
423 return 0;
424}
425#else
426static int
427__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
428{
429 unsigned int op;
430 unsigned long ip = rec->ip;
431
432 /* read where this goes */
433 if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
434 return -EFAULT;
435
436 /* It should be pointing to a nop */
437 if (op != PPC_NOP_INSTR) {
438 printk(KERN_ERR "Expected NOP but have %x\n", op);
439 return -EINVAL;
440 }
441
442 /* If we never set up a trampoline to ftrace_caller, then bail */
443 if (!rec->arch.mod->arch.tramp) {
444 printk(KERN_ERR "No ftrace trampoline\n");
445 return -EINVAL;
446 }
447
448 /* create the branch to the trampoline */
449 op = create_branch((unsigned int *)ip,
450 rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
451 if (!op) {
452 printk(KERN_ERR "REL24 out of range!\n");
453 return -EINVAL;
454 }
455
456 DEBUGP("write to %lx\n", rec->ip);
457
458 if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
459 return -EPERM;
460
461 flush_icache_range(ip, ip + 8);
462
463 return 0;
464}
465#endif /* CONFIG_PPC64 */
466
467int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
468{
469 unsigned char *old, *new;
470 unsigned long ip = rec->ip;
471
472 /*
473 * If the calling address is more that 24 bits away,
474 * then we had to use a trampoline to make the call.
475 * Otherwise just update the call site.
476 */
477 if (test_24bit_addr(ip, addr)) {
478 /* within range */
479 old = ftrace_nop_replace();
480 new = ftrace_call_replace(ip, addr);
481 return ftrace_modify_code(ip, old, new);
482 }
483
484 /*
485 * Out of range jumps are called from modules.
486 * Being that we are converting from nop, it had better
487 * already have a module defined.
488 */
489 if (!rec->arch.mod) {
490 printk(KERN_ERR "No module loaded\n");
491 return -EINVAL;
492 }
493
494 return __ftrace_make_call(rec, addr);
114} 495}
115 496
116int ftrace_update_ftrace_func(ftrace_func_t func) 497int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -128,10 +509,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
128 509
129int __init ftrace_dyn_arch_init(void *data) 510int __init ftrace_dyn_arch_init(void *data)
130{ 511{
131 /* This is running in kstop_machine */ 512 /* caller expects data to be zero */
513 unsigned long *p = data;
132 514
133 ftrace_mcount_set(data); 515 *p = 0;
134 516
135 return 0; 517 return 0;
136} 518}
137
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 31982d05d81a..88d9c1d5e5fb 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -69,10 +69,15 @@ void cpu_idle(void)
69 smp_mb(); 69 smp_mb();
70 local_irq_disable(); 70 local_irq_disable();
71 71
72 /* Don't trace irqs off for idle */
73 stop_critical_timings();
74
72 /* check again after disabling irqs */ 75 /* check again after disabling irqs */
73 if (!need_resched() && !cpu_should_die()) 76 if (!need_resched() && !cpu_should_die())
74 ppc_md.power_save(); 77 ppc_md.power_save();
75 78
79 start_critical_timings();
80
76 local_irq_enable(); 81 local_irq_enable();
77 set_thread_flag(TIF_POLLING_NRFLAG); 82 set_thread_flag(TIF_POLLING_NRFLAG);
78 83
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 2df91a03462a..f832773fc28e 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/ftrace.h>
25#include <linux/cache.h> 26#include <linux/cache.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27#include <linux/sort.h> 28#include <linux/sort.h>
@@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
53 r_addend = rela[i].r_addend; 54 r_addend = rela[i].r_addend;
54 } 55 }
55 56
57#ifdef CONFIG_DYNAMIC_FTRACE
58 _count_relocs++; /* add one for ftrace_caller */
59#endif
56 return _count_relocs; 60 return _count_relocs;
57} 61}
58 62
@@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
306 return -ENOEXEC; 310 return -ENOEXEC;
307 } 311 }
308 } 312 }
313#ifdef CONFIG_DYNAMIC_FTRACE
314 module->arch.tramp =
315 do_plt_call(module->module_core,
316 (unsigned long)ftrace_caller,
317 sechdrs, module);
318#endif
309 return 0; 319 return 0;
310} 320}
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 1af2377e4992..8992b031a7b6 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -20,6 +20,7 @@
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/err.h> 21#include <linux/err.h>
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/ftrace.h>
23#include <linux/bug.h> 24#include <linux/bug.h>
24#include <asm/module.h> 25#include <asm/module.h>
25#include <asm/firmware.h> 26#include <asm/firmware.h>
@@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
163 } 164 }
164 } 165 }
165 166
167#ifdef CONFIG_DYNAMIC_FTRACE
168 /* make the trampoline to the ftrace_caller */
169 relocs++;
170#endif
171
166 DEBUGP("Looks like a total of %lu stubs, max\n", relocs); 172 DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
167 return relocs * sizeof(struct ppc64_stub_entry); 173 return relocs * sizeof(struct ppc64_stub_entry);
168} 174}
@@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
441 } 447 }
442 } 448 }
443 449
450#ifdef CONFIG_DYNAMIC_FTRACE
451 me->arch.toc = my_r2(sechdrs, me);
452 me->arch.tramp = stub_for_addr(sechdrs,
453 (unsigned long)ftrace_caller,
454 me);
455#endif
456
444 return 0; 457 return 0;
445} 458}
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d69912c07ce7..8db35278a4b4 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y)
6EXTRA_CFLAGS += -mno-minimal-toc 6EXTRA_CFLAGS += -mno-minimal-toc
7endif 7endif
8 8
9CFLAGS_REMOVE_code-patching.o = -pg
10CFLAGS_REMOVE_feature-fixups.o = -pg
11
9obj-y := string.o alloc.o \ 12obj-y := string.o alloc.o \
10 checksum_$(CONFIG_WORD_SIZE).o 13 checksum_$(CONFIG_WORD_SIZE).o
11obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o 14obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac22bb7719f7..45c86fb94132 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,11 +29,14 @@ config X86
29 select HAVE_FTRACE_MCOUNT_RECORD 29 select HAVE_FTRACE_MCOUNT_RECORD
30 select HAVE_DYNAMIC_FTRACE 30 select HAVE_DYNAMIC_FTRACE
31 select HAVE_FUNCTION_TRACER 31 select HAVE_FUNCTION_TRACER
32 select HAVE_FUNCTION_GRAPH_TRACER
33 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
32 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 34 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
33 select HAVE_ARCH_KGDB if !X86_VOYAGER 35 select HAVE_ARCH_KGDB if !X86_VOYAGER
34 select HAVE_ARCH_TRACEHOOK 36 select HAVE_ARCH_TRACEHOOK
35 select HAVE_GENERIC_DMA_COHERENT if X86_32 37 select HAVE_GENERIC_DMA_COHERENT if X86_32
36 select HAVE_EFFICIENT_UNALIGNED_ACCESS 38 select HAVE_EFFICIENT_UNALIGNED_ACCESS
39 select USER_STACKTRACE_SUPPORT
37 40
38config ARCH_DEFCONFIG 41config ARCH_DEFCONFIG
39 string 42 string
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index b815664fe370..85a78575956c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -515,6 +515,7 @@ config CPU_SUP_UMC_32
515config X86_DS 515config X86_DS
516 def_bool X86_PTRACE_BTS 516 def_bool X86_PTRACE_BTS
517 depends on X86_DEBUGCTLMSR 517 depends on X86_DEBUGCTLMSR
518 select HAVE_HW_BRANCH_TRACER
518 519
519config X86_PTRACE_BTS 520config X86_PTRACE_BTS
520 bool "Branch Trace Store" 521 bool "Branch Trace Store"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 2a3dfbd5e677..fa013f529b74 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,14 +186,10 @@ config IOMMU_LEAK
186 Add a simple leak tracer to the IOMMU code. This is useful when you 186 Add a simple leak tracer to the IOMMU code. This is useful when you
187 are debugging a buggy device driver that leaks IOMMU mappings. 187 are debugging a buggy device driver that leaks IOMMU mappings.
188 188
189config MMIOTRACE_HOOKS
190 bool
191
192config MMIOTRACE 189config MMIOTRACE
193 bool "Memory mapped IO tracing" 190 bool "Memory mapped IO tracing"
194 depends on DEBUG_KERNEL && PCI 191 depends on DEBUG_KERNEL && PCI
195 select TRACING 192 select TRACING
196 select MMIOTRACE_HOOKS
197 help 193 help
198 Mmiotrace traces Memory Mapped I/O access and is meant for 194 Mmiotrace traces Memory Mapped I/O access and is meant for
199 debugging and reverse engineering. It is called from the ioremap 195 debugging and reverse engineering. It is called from the ioremap
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a95008457ea4..ee0ea3a96c11 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -6,14 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It does not do:
15 * - get_task_struct on all parameter tasks 14 * - security checking (is the caller allowed to trace the task)
16 * - current is allowed to trace parameter tasks 15 * - buffer allocation (memory accounting)
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -26,11 +25,51 @@
26 25
27#include <linux/types.h> 26#include <linux/types.h>
28#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/err.h>
29 29
30 30
31#ifdef CONFIG_X86_DS 31#ifdef CONFIG_X86_DS
32 32
33struct task_struct; 33struct task_struct;
34struct ds_context;
35struct ds_tracer;
36struct bts_tracer;
37struct pebs_tracer;
38
39typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
40typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
41
42
43/*
44 * A list of features plus corresponding macros to talk about them in
45 * the ds_request function's flags parameter.
46 *
47 * We use the enum to index an array of corresponding control bits;
48 * we use the macro to index a flags bit-vector.
49 */
50enum ds_feature {
51 dsf_bts = 0,
52 dsf_bts_kernel,
53#define BTS_KERNEL (1 << dsf_bts_kernel)
54 /* trace kernel-mode branches */
55
56 dsf_bts_user,
57#define BTS_USER (1 << dsf_bts_user)
58 /* trace user-mode branches */
59
60 dsf_bts_overflow,
61 dsf_bts_max,
62 dsf_pebs = dsf_bts_max,
63
64 dsf_pebs_max,
65 dsf_ctl_max = dsf_pebs_max,
66 dsf_bts_timestamps = dsf_ctl_max,
67#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
68 /* add timestamps into BTS trace */
69
70#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
71};
72
34 73
35/* 74/*
36 * Request BTS or PEBS 75 * Request BTS or PEBS
@@ -38,163 +77,169 @@ struct task_struct;
38 * Due to alignement constraints, the actual buffer may be slightly 77 * Due to alignement constraints, the actual buffer may be slightly
39 * smaller than the requested or provided buffer. 78 * smaller than the requested or provided buffer.
40 * 79 *
41 * Returns 0 on success; -Eerrno otherwise 80 * Returns a pointer to a tracer structure on success, or
81 * ERR_PTR(errcode) on failure.
82 *
83 * The interrupt threshold is independent from the overflow callback
84 * to allow users to use their own overflow interrupt handling mechanism.
42 * 85 *
43 * task: the task to request recording for; 86 * task: the task to request recording for;
44 * NULL for per-cpu recording on the current cpu 87 * NULL for per-cpu recording on the current cpu
45 * base: the base pointer for the (non-pageable) buffer; 88 * base: the base pointer for the (non-pageable) buffer;
46 * NULL if buffer allocation requested 89 * size: the size of the provided buffer in bytes
47 * size: the size of the requested or provided buffer
48 * ovfl: pointer to a function to be called on buffer overflow; 90 * ovfl: pointer to a function to be called on buffer overflow;
49 * NULL if cyclic buffer requested 91 * NULL if cyclic buffer requested
92 * th: the interrupt threshold in records from the end of the buffer;
93 * -1 if no interrupt threshold is requested.
94 * flags: a bit-mask of the above flags
50 */ 95 */
51typedef void (*ds_ovfl_callback_t)(struct task_struct *); 96extern struct bts_tracer *ds_request_bts(struct task_struct *task,
52extern int ds_request_bts(struct task_struct *task, void *base, size_t size, 97 void *base, size_t size,
53 ds_ovfl_callback_t ovfl); 98 bts_ovfl_callback_t ovfl,
54extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, 99 size_t th, unsigned int flags);
55 ds_ovfl_callback_t ovfl); 100extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
101 void *base, size_t size,
102 pebs_ovfl_callback_t ovfl,
103 size_t th, unsigned int flags);
56 104
57/* 105/*
58 * Release BTS or PEBS resources 106 * Release BTS or PEBS resources
107 * Suspend and resume BTS or PEBS tracing
59 * 108 *
60 * Frees buffers allocated on ds_request. 109 * tracer: the tracer handle returned from ds_request_~()
61 *
62 * Returns 0 on success; -Eerrno otherwise
63 *
64 * task: the task to release resources for;
65 * NULL to release resources for the current cpu
66 */ 110 */
67extern int ds_release_bts(struct task_struct *task); 111extern void ds_release_bts(struct bts_tracer *tracer);
68extern int ds_release_pebs(struct task_struct *task); 112extern void ds_suspend_bts(struct bts_tracer *tracer);
113extern void ds_resume_bts(struct bts_tracer *tracer);
114extern void ds_release_pebs(struct pebs_tracer *tracer);
115extern void ds_suspend_pebs(struct pebs_tracer *tracer);
116extern void ds_resume_pebs(struct pebs_tracer *tracer);
69 117
70/*
71 * Return the (array) index of the write pointer.
72 * (assuming an array of BTS/PEBS records)
73 *
74 * Returns -Eerrno on error
75 *
76 * task: the task to access;
77 * NULL to access the current cpu
78 * pos (out): if not NULL, will hold the result
79 */
80extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
81extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
82 118
83/* 119/*
84 * Return the (array) index one record beyond the end of the array. 120 * The raw DS buffer state as it is used for BTS and PEBS recording.
85 * (assuming an array of BTS/PEBS records)
86 *
87 * Returns -Eerrno on error
88 * 121 *
89 * task: the task to access; 122 * This is the low-level, arch-dependent interface for working
90 * NULL to access the current cpu 123 * directly on the raw trace data.
91 * pos (out): if not NULL, will hold the result
92 */ 124 */
93extern int ds_get_bts_end(struct task_struct *task, size_t *pos); 125struct ds_trace {
94extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); 126 /* the number of bts/pebs records */
127 size_t n;
128 /* the size of a bts/pebs record in bytes */
129 size_t size;
130 /* pointers into the raw buffer:
131 - to the first entry */
132 void *begin;
133 /* - one beyond the last entry */
134 void *end;
135 /* - one beyond the newest entry */
136 void *top;
137 /* - the interrupt threshold */
138 void *ith;
139 /* flags given on ds_request() */
140 unsigned int flags;
141};
95 142
96/* 143/*
97 * Provide a pointer to the BTS/PEBS record at parameter index. 144 * An arch-independent view on branch trace data.
98 * (assuming an array of BTS/PEBS records)
99 *
100 * The pointer points directly into the buffer. The user is
101 * responsible for copying the record.
102 *
103 * Returns the size of a single record on success; -Eerrno on error
104 *
105 * task: the task to access;
106 * NULL to access the current cpu
107 * index: the index of the requested record
108 * record (out): pointer to the requested record
109 */ 145 */
110extern int ds_access_bts(struct task_struct *task, 146enum bts_qualifier {
111 size_t index, const void **record); 147 bts_invalid,
112extern int ds_access_pebs(struct task_struct *task, 148#define BTS_INVALID bts_invalid
113 size_t index, const void **record); 149
150 bts_branch,
151#define BTS_BRANCH bts_branch
152
153 bts_task_arrives,
154#define BTS_TASK_ARRIVES bts_task_arrives
155
156 bts_task_departs,
157#define BTS_TASK_DEPARTS bts_task_departs
158
159 bts_qual_bit_size = 4,
160 bts_qual_max = (1 << bts_qual_bit_size),
161};
162
163struct bts_struct {
164 __u64 qualifier;
165 union {
166 /* BTS_BRANCH */
167 struct {
168 __u64 from;
169 __u64 to;
170 } lbr;
171 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
172 struct {
173 __u64 jiffies;
174 pid_t pid;
175 } timestamp;
176 } variant;
177};
114 178
115/*
116 * Write one or more BTS/PEBS records at the write pointer index and
117 * advance the write pointer.
118 *
119 * If size is not a multiple of the record size, trailing bytes are
120 * zeroed out.
121 *
122 * May result in one or more overflow notifications.
123 *
124 * If called during overflow handling, that is, with index >=
125 * interrupt threshold, the write will wrap around.
126 *
127 * An overflow notification is given if and when the interrupt
128 * threshold is reached during or after the write.
129 *
130 * Returns the number of bytes written or -Eerrno.
131 *
132 * task: the task to access;
133 * NULL to access the current cpu
134 * buffer: the buffer to write
135 * size: the size of the buffer
136 */
137extern int ds_write_bts(struct task_struct *task,
138 const void *buffer, size_t size);
139extern int ds_write_pebs(struct task_struct *task,
140 const void *buffer, size_t size);
141 179
142/* 180/*
143 * Same as ds_write_bts/pebs, but omit ownership checks. 181 * The BTS state.
144 * 182 *
145 * This is needed to have some other task than the owner of the 183 * This gives access to the raw DS state and adds functions to provide
146 * BTS/PEBS buffer or the parameter task itself write into the 184 * an arch-independent view of the BTS data.
147 * respective buffer.
148 */ 185 */
149extern int ds_unchecked_write_bts(struct task_struct *task, 186struct bts_trace {
150 const void *buffer, size_t size); 187 struct ds_trace ds;
151extern int ds_unchecked_write_pebs(struct task_struct *task, 188
152 const void *buffer, size_t size); 189 int (*read)(struct bts_tracer *tracer, const void *at,
190 struct bts_struct *out);
191 int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
192};
193
153 194
154/* 195/*
155 * Reset the write pointer of the BTS/PEBS buffer. 196 * The PEBS state.
156 *
157 * Returns 0 on success; -Eerrno on error
158 * 197 *
159 * task: the task to access; 198 * This gives access to the raw DS state and the PEBS-specific counter
160 * NULL to access the current cpu 199 * reset value.
161 */ 200 */
162extern int ds_reset_bts(struct task_struct *task); 201struct pebs_trace {
163extern int ds_reset_pebs(struct task_struct *task); 202 struct ds_trace ds;
203
204 /* the PEBS reset value */
205 unsigned long long reset_value;
206};
207
164 208
165/* 209/*
166 * Clear the BTS/PEBS buffer and reset the write pointer. 210 * Read the BTS or PEBS trace.
167 * The entire buffer will be zeroed out.
168 * 211 *
169 * Returns 0 on success; -Eerrno on error 212 * Returns a view on the trace collected for the parameter tracer.
170 * 213 *
171 * task: the task to access; 214 * The view remains valid as long as the traced task is not running or
172 * NULL to access the current cpu 215 * the tracer is suspended.
216 * Writes into the trace buffer are not reflected.
217 *
218 * tracer: the tracer handle returned from ds_request_~()
173 */ 219 */
174extern int ds_clear_bts(struct task_struct *task); 220extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
175extern int ds_clear_pebs(struct task_struct *task); 221extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
222
176 223
177/* 224/*
178 * Provide the PEBS counter reset value. 225 * Reset the write pointer of the BTS/PEBS buffer.
179 * 226 *
180 * Returns 0 on success; -Eerrno on error 227 * Returns 0 on success; -Eerrno on error
181 * 228 *
182 * task: the task to access; 229 * tracer: the tracer handle returned from ds_request_~()
183 * NULL to access the current cpu
184 * value (out): the counter reset value
185 */ 230 */
186extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); 231extern int ds_reset_bts(struct bts_tracer *tracer);
232extern int ds_reset_pebs(struct pebs_tracer *tracer);
187 233
188/* 234/*
189 * Set the PEBS counter reset value. 235 * Set the PEBS counter reset value.
190 * 236 *
191 * Returns 0 on success; -Eerrno on error 237 * Returns 0 on success; -Eerrno on error
192 * 238 *
193 * task: the task to access; 239 * tracer: the tracer handle returned from ds_request_pebs()
194 * NULL to access the current cpu
195 * value: the new counter reset value 240 * value: the new counter reset value
196 */ 241 */
197extern int ds_set_pebs_reset(struct task_struct *task, u64 value); 242extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
198 243
199/* 244/*
200 * Initialization 245 * Initialization
@@ -202,39 +247,17 @@ extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
202struct cpuinfo_x86; 247struct cpuinfo_x86;
203extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); 248extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
204 249
205
206
207/* 250/*
208 * The DS context - part of struct thread_struct. 251 * Context switch work
209 */ 252 */
210struct ds_context { 253extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
211 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
212 unsigned char *ds;
213 /* the owner of the BTS and PEBS configuration, respectively */
214 struct task_struct *owner[2];
215 /* buffer overflow notification function for BTS and PEBS */
216 ds_ovfl_callback_t callback[2];
217 /* the original buffer address */
218 void *buffer[2];
219 /* the number of allocated pages for on-request allocated buffers */
220 unsigned int pages[2];
221 /* use count */
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
230
231/* called by exit_thread() to free leftover contexts */
232extern void ds_free(struct ds_context *context);
233 254
234#else /* CONFIG_X86_DS */ 255#else /* CONFIG_X86_DS */
235 256
236struct cpuinfo_x86; 257struct cpuinfo_x86;
237static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} 258static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
259static inline void ds_switch_to(struct task_struct *prev,
260 struct task_struct *next) {}
238 261
239#endif /* CONFIG_X86_DS */ 262#endif /* CONFIG_X86_DS */
240#endif /* _ASM_X86_DS_H */ 263#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9e8bc29b8b17..b55b4a7fbefd 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -1,6 +1,33 @@
1#ifndef _ASM_X86_FTRACE_H 1#ifndef _ASM_X86_FTRACE_H
2#define _ASM_X86_FTRACE_H 2#define _ASM_X86_FTRACE_H
3 3
4#ifdef __ASSEMBLY__
5
6 .macro MCOUNT_SAVE_FRAME
7 /* taken from glibc */
8 subq $0x38, %rsp
9 movq %rax, (%rsp)
10 movq %rcx, 8(%rsp)
11 movq %rdx, 16(%rsp)
12 movq %rsi, 24(%rsp)
13 movq %rdi, 32(%rsp)
14 movq %r8, 40(%rsp)
15 movq %r9, 48(%rsp)
16 .endm
17
18 .macro MCOUNT_RESTORE_FRAME
19 movq 48(%rsp), %r9
20 movq 40(%rsp), %r8
21 movq 32(%rsp), %rdi
22 movq 24(%rsp), %rsi
23 movq 16(%rsp), %rdx
24 movq 8(%rsp), %rcx
25 movq (%rsp), %rax
26 addq $0x38, %rsp
27 .endm
28
29#endif
30
4#ifdef CONFIG_FUNCTION_TRACER 31#ifdef CONFIG_FUNCTION_TRACER
5#define MCOUNT_ADDR ((long)(mcount)) 32#define MCOUNT_ADDR ((long)(mcount))
6#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -17,8 +44,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
17 */ 44 */
18 return addr - 1; 45 return addr - 1;
19} 46}
20#endif
21 47
48#ifdef CONFIG_DYNAMIC_FTRACE
49
50struct dyn_arch_ftrace {
51 /* No extra data needed for x86 */
52};
53
54#endif /* CONFIG_DYNAMIC_FTRACE */
55#endif /* __ASSEMBLY__ */
22#endif /* CONFIG_FUNCTION_TRACER */ 56#endif /* CONFIG_FUNCTION_TRACER */
23 57
58#ifdef CONFIG_FUNCTION_GRAPH_TRACER
59
60#ifndef __ASSEMBLY__
61
62/*
63 * Stack of return addresses for functions
64 * of a thread.
65 * Used in struct thread_info
66 */
67struct ftrace_ret_stack {
68 unsigned long ret;
69 unsigned long func;
70 unsigned long long calltime;
71};
72
73/*
74 * Primary handler of a function return.
75 * It relays on ftrace_return_to_handler.
76 * Defined in entry_32/64.S
77 */
78extern void return_to_handler(void);
79
80#endif /* __ASSEMBLY__ */
81#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
82
24#endif /* _ASM_X86_FTRACE_H */ 83#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ca01e383269..aa5914f8e501 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -752,6 +752,19 @@ extern void switch_to_new_gdt(void);
752extern void cpu_init(void); 752extern void cpu_init(void);
753extern void init_gdt(int cpu); 753extern void init_gdt(int cpu);
754 754
755static inline unsigned long get_debugctlmsr(void)
756{
757 unsigned long debugctlmsr = 0;
758
759#ifndef CONFIG_X86_DEBUGCTLMSR
760 if (boot_cpu_data.x86 < 6)
761 return 0;
762#endif
763 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
764
765 return debugctlmsr;
766}
767
755static inline void update_debugctlmsr(unsigned long debugctlmsr) 768static inline void update_debugctlmsr(unsigned long debugctlmsr)
756{ 769{
757#ifndef CONFIG_X86_DEBUGCTLMSR 770#ifndef CONFIG_X86_DEBUGCTLMSR
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index eefb0594b058..fbf744215911 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -6,7 +6,6 @@
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8#ifdef __KERNEL__ 8#ifdef __KERNEL__
9#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */
10#include <asm/segment.h> 9#include <asm/segment.h>
11#endif 10#endif
12 11
@@ -128,34 +127,6 @@ struct pt_regs {
128#endif /* !__i386__ */ 127#endif /* !__i386__ */
129 128
130 129
131#ifdef CONFIG_X86_PTRACE_BTS
132/* a branch trace record entry
133 *
134 * In order to unify the interface between various processor versions,
135 * we use the below data structure for all processors.
136 */
137enum bts_qualifier {
138 BTS_INVALID = 0,
139 BTS_BRANCH,
140 BTS_TASK_ARRIVES,
141 BTS_TASK_DEPARTS
142};
143
144struct bts_struct {
145 __u64 qualifier;
146 union {
147 /* BTS_BRANCH */
148 struct {
149 __u64 from_ip;
150 __u64 to_ip;
151 } lbr;
152 /* BTS_TASK_ARRIVES or
153 BTS_TASK_DEPARTS */
154 __u64 jiffies;
155 } variant;
156};
157#endif /* CONFIG_X86_PTRACE_BTS */
158
159#ifdef __KERNEL__ 130#ifdef __KERNEL__
160 131
161#include <linux/init.h> 132#include <linux/init.h>
@@ -163,13 +134,6 @@ struct bts_struct {
163struct cpuinfo_x86; 134struct cpuinfo_x86;
164struct task_struct; 135struct task_struct;
165 136
166#ifdef CONFIG_X86_PTRACE_BTS
167extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
168extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
169#else
170#define ptrace_bts_init_intel(config) do {} while (0)
171#endif /* CONFIG_X86_PTRACE_BTS */
172
173extern unsigned long profile_pc(struct pt_regs *regs); 137extern unsigned long profile_pc(struct pt_regs *regs);
174 138
175extern unsigned long 139extern unsigned long
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..bf8113d16a33 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -20,6 +20,8 @@
20struct task_struct; 20struct task_struct;
21struct exec_domain; 21struct exec_domain;
22#include <asm/processor.h> 22#include <asm/processor.h>
23#include <asm/ftrace.h>
24#include <asm/atomic.h>
23 25
24struct thread_info { 26struct thread_info {
25 struct task_struct *task; /* main task structure */ 27 struct task_struct *task; /* main task structure */
@@ -91,7 +93,6 @@ struct thread_info {
91#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 93#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
92#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 94#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
93#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 95#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
94#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
95 96
96#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 97#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
97#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 98#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -113,7 +114,6 @@ struct thread_info {
113#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 114#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
114#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 115#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
115#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 116#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
116#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
117 117
118/* work to do in syscall_trace_enter() */ 118/* work to do in syscall_trace_enter() */
119#define _TIF_WORK_SYSCALL_ENTRY \ 119#define _TIF_WORK_SYSCALL_ENTRY \
@@ -139,8 +139,7 @@ struct thread_info {
139 139
140/* flags to check in __switch_to() */ 140/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \ 141#define _TIF_WORK_CTXSW \
142 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ 142 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
143 _TIF_NOTSC)
144 143
145#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 144#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
146#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 145#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b62a7667828e..1cad9318d217 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,7 @@ CFLAGS_tsc.o := $(nostackp)
25 25
26obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 26obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
27obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 27obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
28obj-y += time_$(BITS).o ioport.o ldt.o 28obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
29obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 29obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
30obj-$(CONFIG_X86_VISWS) += visws_quirks.o 30obj-$(CONFIG_X86_VISWS) += visws_quirks.o
31obj-$(CONFIG_X86_32) += probe_roms_32.o 31obj-$(CONFIG_X86_32) += probe_roms_32.o
@@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
65obj-$(CONFIG_X86_IO_APIC) += io_apic.o 65obj-$(CONFIG_X86_IO_APIC) += io_apic.o
66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
68obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
68obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 69obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 70obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 71obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f94879b525..b946ac19753b 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/dmi.h> 31#include <linux/dmi.h>
32#include <linux/dmar.h> 32#include <linux/dmar.h>
33#include <linux/ftrace.h>
33 34
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35#include <asm/smp.h> 36#include <asm/smp.h>
@@ -800,7 +801,7 @@ static void local_apic_timer_interrupt(void)
800 * [ if a single-CPU system runs an SMP kernel then we call the local 801 * [ if a single-CPU system runs an SMP kernel then we call the local
801 * interrupt as well. Thus we cannot inline the local irq ... ] 802 * interrupt as well. Thus we cannot inline the local irq ... ]
802 */ 803 */
803void smp_apic_timer_interrupt(struct pt_regs *regs) 804void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
804{ 805{
805 struct pt_regs *old_regs = set_irq_regs(regs); 806 struct pt_regs *old_regs = set_irq_regs(regs);
806 807
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467d..88ea02dcb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/ftrace.h>
36 37
37#include <linux/acpi.h> 38#include <linux/acpi.h>
38#include <acpi/processor.h> 39#include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
391 unsigned int next_perf_state = 0; /* Index into perf table */ 392 unsigned int next_perf_state = 0; /* Index into perf table */
392 unsigned int i; 393 unsigned int i;
393 int result = 0; 394 int result = 0;
395 struct power_trace it;
394 396
395 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); 397 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
396 398
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
427 } 429 }
428 } 430 }
429 431
432 trace_power_mark(&it, POWER_PSTATE, next_perf_state);
433
430 switch (data->cpu_feature) { 434 switch (data->cpu_feature) {
431 case SYSTEM_INTEL_MSR_CAPABLE: 435 case SYSTEM_INTEL_MSR_CAPABLE:
432 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 436 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b6118d55..cd413d9a0218 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/msr.h> 12#include <asm/msr.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ptrace.h>
15#include <asm/ds.h> 14#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17 16
@@ -307,10 +306,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
307 set_cpu_cap(c, X86_FEATURE_P4); 306 set_cpu_cap(c, X86_FEATURE_P4);
308 if (c->x86 == 6) 307 if (c->x86 == 6)
309 set_cpu_cap(c, X86_FEATURE_P3); 308 set_cpu_cap(c, X86_FEATURE_P3);
310
311 if (cpu_has_bts)
312 ptrace_bts_init_intel(c);
313
314#endif 309#endif
315 310
316 detect_extended_topology(c); 311 detect_extended_topology(c);
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index a2d1176c38ee..98d271e60e08 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,14 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It does not do:
15 * - get_task_struct on all parameter tasks 14 * - security checking (is the caller allowed to trace the task)
16 * - current is allowed to trace parameter tasks 15 * - buffer allocation (memory accounting)
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,22 +27,69 @@
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/sched.h> 28#include <linux/sched.h>
30#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/kernel.h>
31 31
32 32
33/* 33/*
34 * The configuration for a particular DS hardware implementation. 34 * The configuration for a particular DS hardware implementation.
35 */ 35 */
36struct ds_configuration { 36struct ds_configuration {
37 /* the size of the DS structure in bytes */ 37 /* the name of the configuration */
38 unsigned char sizeof_ds; 38 const char *name;
39 /* the size of one pointer-typed field in the DS structure in bytes; 39 /* the size of one pointer-typed field in the DS structure and
40 this covers the first 8 fields related to buffer management. */ 40 in the BTS and PEBS buffers in bytes;
41 this covers the first 8 DS fields related to buffer management. */
41 unsigned char sizeof_field; 42 unsigned char sizeof_field;
42 /* the size of a BTS/PEBS record in bytes */ 43 /* the size of a BTS/PEBS record in bytes */
43 unsigned char sizeof_rec[2]; 44 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed
46 * by enum ds_feature */
47 unsigned long ctl[dsf_ctl_max];
44}; 48};
45static struct ds_configuration ds_cfg; 49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
46 50
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
52
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56
57#define BTS_CONTROL \
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
59 ds_cfg.ctl[dsf_bts_overflow])
60
61
62/*
63 * A BTS or PEBS tracer.
64 *
65 * This holds the configuration of the tracer and serves as a handle
66 * to identify tracers.
67 */
68struct ds_tracer {
69 /* the DS context (partially) owned by this tracer */
70 struct ds_context *context;
71 /* the buffer provided on ds_request() and its size in bytes */
72 void *buffer;
73 size_t size;
74};
75
76struct bts_tracer {
77 /* the common DS part */
78 struct ds_tracer ds;
79 /* the trace including the DS configuration */
80 struct bts_trace trace;
81 /* buffer overflow notification function */
82 bts_ovfl_callback_t ovfl;
83};
84
85struct pebs_tracer {
86 /* the common DS part */
87 struct ds_tracer ds;
88 /* the trace including the DS configuration */
89 struct pebs_trace trace;
90 /* buffer overflow notification function */
91 pebs_ovfl_callback_t ovfl;
92};
47 93
48/* 94/*
49 * Debug Store (DS) save area configuration (see Intel64 and IA32 95 * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -109,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
109 155
110 156
111/* 157/*
112 * Locking is done only for allocating BTS or PEBS resources and for 158 * Locking is done only for allocating BTS or PEBS resources.
113 * guarding context and buffer memory allocation.
114 *
115 * Most functions require the current task to own the ds context part
116 * they are going to access. All the locking is done when validating
117 * access to the context.
118 */ 159 */
119static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); 160static DEFINE_SPINLOCK(ds_lock);
120
121/*
122 * Validate that the current task is allowed to access the BTS/PEBS
123 * buffer of the parameter task.
124 *
125 * Returns 0, if access is granted; -Eerrno, otherwise.
126 */
127static inline int ds_validate_access(struct ds_context *context,
128 enum ds_qualifier qual)
129{
130 if (!context)
131 return -EPERM;
132
133 if (context->owner[qual] == current)
134 return 0;
135
136 return -EPERM;
137}
138 161
139 162
140/* 163/*
@@ -150,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context,
150 * >0 number of per-thread tracers 173 * >0 number of per-thread tracers
151 * <0 number of per-cpu tracers 174 * <0 number of per-cpu tracers
152 * 175 *
153 * The below functions to get and put tracers and to check the
154 * allocation type require the ds_lock to be held by the caller.
155 *
156 * Tracers essentially gives the number of ds contexts for a certain 176 * Tracers essentially gives the number of ds contexts for a certain
157 * type of allocation. 177 * type of allocation.
158 */ 178 */
159static long tracers; 179static atomic_t tracers = ATOMIC_INIT(0);
160 180
161static inline void get_tracer(struct task_struct *task) 181static inline void get_tracer(struct task_struct *task)
162{ 182{
163 tracers += (task ? 1 : -1); 183 if (task)
184 atomic_inc(&tracers);
185 else
186 atomic_dec(&tracers);
164} 187}
165 188
166static inline void put_tracer(struct task_struct *task) 189static inline void put_tracer(struct task_struct *task)
167{ 190{
168 tracers -= (task ? 1 : -1); 191 if (task)
192 atomic_dec(&tracers);
193 else
194 atomic_inc(&tracers);
169} 195}
170 196
171static inline int check_tracer(struct task_struct *task) 197static inline int check_tracer(struct task_struct *task)
172{ 198{
173 return (task ? (tracers >= 0) : (tracers <= 0)); 199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
174} 202}
175 203
176 204
@@ -183,99 +211,70 @@ static inline int check_tracer(struct task_struct *task)
183 * 211 *
184 * Contexts are use-counted. They are allocated on first access and 212 * Contexts are use-counted. They are allocated on first access and
185 * deallocated when the last user puts the context. 213 * deallocated when the last user puts the context.
186 *
187 * We distinguish between an allocating and a non-allocating get of a
188 * context:
189 * - the allocating get is used for requesting BTS/PEBS resources. It
190 * requires the caller to hold the global ds_lock.
191 * - the non-allocating get is used for all other cases. A
192 * non-existing context indicates an error. It acquires and releases
193 * the ds_lock itself for obtaining the context.
194 *
195 * A context and its DS configuration are allocated and deallocated
196 * together. A context always has a DS configuration of the
197 * appropriate size.
198 */ 214 */
199static DEFINE_PER_CPU(struct ds_context *, system_context); 215struct ds_context {
200 216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
201#define this_system_context per_cpu(system_context, smp_processor_id()) 217 unsigned char ds[MAX_SIZEOF_DS];
202 218 /* the owner of the BTS and PEBS configuration, respectively */
203/* 219 struct bts_tracer *bts_master;
204 * Returns the pointer to the parameter task's context or to the 220 struct pebs_tracer *pebs_master;
205 * system-wide context, if task is NULL. 221 /* use count */
206 * 222 unsigned long count;
207 * Increases the use count of the returned context, if not NULL. 223 /* a pointer to the context location inside the thread_struct
208 */ 224 * or the per_cpu context array */
209static inline struct ds_context *ds_get_context(struct task_struct *task) 225 struct ds_context **this;
210{ 226 /* a pointer to the task owning this context, or NULL, if the
211 struct ds_context *context; 227 * context is owned by a cpu */
212 unsigned long irq; 228 struct task_struct *task;
229};
213 230
214 spin_lock_irqsave(&ds_lock, irq); 231static DEFINE_PER_CPU(struct ds_context *, system_context_array);
215 232
216 context = (task ? task->thread.ds_ctx : this_system_context); 233#define system_context per_cpu(system_context_array, smp_processor_id())
217 if (context)
218 context->count++;
219 234
220 spin_unlock_irqrestore(&ds_lock, irq);
221
222 return context;
223}
224 235
225/* 236static inline struct ds_context *ds_get_context(struct task_struct *task)
226 * Same as ds_get_context, but allocates the context and it's DS
227 * structure, if necessary; returns NULL; if out of memory.
228 */
229static inline struct ds_context *ds_alloc_context(struct task_struct *task)
230{ 237{
231 struct ds_context **p_context = 238 struct ds_context **p_context =
232 (task ? &task->thread.ds_ctx : &this_system_context); 239 (task ? &task->thread.ds_ctx : &system_context);
233 struct ds_context *context = *p_context; 240 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL;
234 unsigned long irq; 242 unsigned long irq;
235 243
236 if (!context) { 244 /* Chances are small that we already have a context. */
237 context = kzalloc(sizeof(*context), GFP_KERNEL); 245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
238 if (!context) 246 if (!new_context)
239 return NULL; 247 return NULL;
240
241 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
242 if (!context->ds) {
243 kfree(context);
244 return NULL;
245 }
246 248
247 spin_lock_irqsave(&ds_lock, irq); 249 spin_lock_irqsave(&ds_lock, irq);
248 250
249 if (*p_context) { 251 context = *p_context;
250 kfree(context->ds); 252 if (!context) {
251 kfree(context); 253 context = new_context;
252 254
253 context = *p_context; 255 context->this = p_context;
254 } else { 256 context->task = task;
255 *p_context = context; 257 context->count = 0;
256 258
257 context->this = p_context; 259 if (task)
258 context->task = task; 260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
259 261
260 if (task) 262 if (!task || (task == current))
261 set_tsk_thread_flag(task, TIF_DS_AREA_MSR); 263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
262 264
263 if (!task || (task == current)) 265 *p_context = context;
264 wrmsrl(MSR_IA32_DS_AREA,
265 (unsigned long)context->ds);
266 }
267 spin_unlock_irqrestore(&ds_lock, irq);
268 } 266 }
269 267
270 context->count++; 268 context->count++;
271 269
270 spin_unlock_irqrestore(&ds_lock, irq);
271
272 if (context != new_context)
273 kfree(new_context);
274
272 return context; 275 return context;
273} 276}
274 277
275/*
276 * Decreases the use count of the parameter context, if not NULL.
277 * Deallocates the context, if the use count reaches zero.
278 */
279static inline void ds_put_context(struct ds_context *context) 278static inline void ds_put_context(struct ds_context *context)
280{ 279{
281 unsigned long irq; 280 unsigned long irq;
@@ -285,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context)
285 284
286 spin_lock_irqsave(&ds_lock, irq); 285 spin_lock_irqsave(&ds_lock, irq);
287 286
288 if (--context->count) 287 if (--context->count) {
289 goto out; 288 spin_unlock_irqrestore(&ds_lock, irq);
289 return;
290 }
290 291
291 *(context->this) = NULL; 292 *(context->this) = NULL;
292 293
@@ -296,135 +297,263 @@ static inline void ds_put_context(struct ds_context *context)
296 if (!context->task || (context->task == current)) 297 if (!context->task || (context->task == current))
297 wrmsrl(MSR_IA32_DS_AREA, 0); 298 wrmsrl(MSR_IA32_DS_AREA, 0);
298 299
299 put_tracer(context->task); 300 spin_unlock_irqrestore(&ds_lock, irq);
300 301
301 /* free any leftover buffers from tracers that did not
302 * deallocate them properly. */
303 kfree(context->buffer[ds_bts]);
304 kfree(context->buffer[ds_pebs]);
305 kfree(context->ds);
306 kfree(context); 302 kfree(context);
307 out:
308 spin_unlock_irqrestore(&ds_lock, irq);
309} 303}
310 304
311 305
312/* 306/*
313 * Handle a buffer overflow 307 * Call the tracer's callback on a buffer overflow.
314 * 308 *
315 * task: the task whose buffers are overflowing;
316 * NULL for a buffer overflow on the current cpu
317 * context: the ds context 309 * context: the ds context
318 * qual: the buffer type 310 * qual: the buffer type
319 */ 311 */
320static void ds_overflow(struct task_struct *task, struct ds_context *context, 312static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
321 enum ds_qualifier qual)
322{ 313{
323 if (!context) 314 switch (qual) {
324 return; 315 case ds_bts:
325 316 if (context->bts_master &&
326 if (context->callback[qual]) 317 context->bts_master->ovfl)
327 (*context->callback[qual])(task); 318 context->bts_master->ovfl(context->bts_master);
328 319 break;
329 /* todo: do some more overflow handling */ 320 case ds_pebs:
321 if (context->pebs_master &&
322 context->pebs_master->ovfl)
323 context->pebs_master->ovfl(context->pebs_master);
324 break;
325 }
330} 326}
331 327
332 328
333/* 329/*
334 * Allocate a non-pageable buffer of the parameter size. 330 * Write raw data into the BTS or PEBS buffer.
335 * Checks the memory and the locked memory rlimit.
336 * 331 *
337 * Returns the buffer, if successful; 332 * The remainder of any partially written record is zeroed out.
338 * NULL, if out of memory or rlimit exceeded.
339 * 333 *
340 * size: the requested buffer size in bytes 334 * context: the DS context
341 * pages (out): if not NULL, contains the number of pages reserved 335 * qual: the buffer type
336 * record: the data to write
337 * size: the size of the data
342 */ 338 */
343static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) 339static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size)
344{ 341{
345 unsigned long rlim, vm, pgsz; 342 int bytes_written = 0;
346 void *buffer;
347 343
348 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 344 if (!record)
345 return -EINVAL;
349 346
350 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 347 while (size) {
351 vm = current->mm->total_vm + pgsz; 348 unsigned long base, index, end, write_end, int_th;
352 if (rlim < vm) 349 unsigned long write_size, adj_write_size;
353 return NULL;
354 350
355 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 351 /*
356 vm = current->mm->locked_vm + pgsz; 352 * write as much as possible without producing an
357 if (rlim < vm) 353 * overflow interrupt.
358 return NULL; 354 *
355 * interrupt_threshold must either be
356 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum
358 *
359 * index points to a valid record.
360 */
361 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index);
363 end = ds_get(context->ds, qual, ds_absolute_maximum);
364 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
359 365
360 buffer = kzalloc(size, GFP_KERNEL); 366 write_end = min(end, int_th);
361 if (!buffer)
362 return NULL;
363 367
364 current->mm->total_vm += pgsz; 368 /* if we are already beyond the interrupt threshold,
365 current->mm->locked_vm += pgsz; 369 * we fill the entire buffer */
370 if (write_end <= index)
371 write_end = end;
366 372
367 if (pages) 373 if (write_end <= index)
368 *pages = pgsz; 374 break;
375
376 write_size = min((unsigned long) size, write_end - index);
377 memcpy((void *)index, record, write_size);
378
379 record = (const char *)record + write_size;
380 size -= write_size;
381 bytes_written += write_size;
382
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual];
369 385
370 return buffer; 386 /* zero out trailing bytes */
387 memset((char *)index + write_size, 0,
388 adj_write_size - write_size);
389 index += adj_write_size;
390
391 if (index >= end)
392 index = base;
393 ds_set(context->ds, qual, ds_index, index);
394
395 if (index >= int_th)
396 ds_overflow(context, qual);
397 }
398
399 return bytes_written;
371} 400}
372 401
373static int ds_request(struct task_struct *task, void *base, size_t size, 402
374 ds_ovfl_callback_t ovfl, enum ds_qualifier qual) 403/*
404 * Branch Trace Store (BTS) uses the following format. Different
405 * architectures vary in the size of those fields.
406 * - source linear address
407 * - destination linear address
408 * - flags
409 *
410 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode.
412 *
413 * We compute the base address for the first 8 fields based on:
414 * - the field size stored in the DS configuration
415 * - the relative field position
416 *
417 * In order to store additional information in the BTS buffer, we use
418 * a special source address to indicate that the record requires
419 * special interpretation.
420 *
421 * Netburst indicated via a bit in the flags field whether the branch
422 * was predicted; this is ignored.
423 *
424 * We use two levels of abstraction:
425 * - the raw data level defined here
426 * - an arch-independent level defined in ds.h
427 */
428
429enum bts_field {
430 bts_from,
431 bts_to,
432 bts_flags,
433
434 bts_qual = bts_from,
435 bts_jiffies = bts_to,
436 bts_pid = bts_flags,
437
438 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440};
441
442static inline unsigned long bts_get(const char *base, enum bts_field field)
375{ 443{
376 struct ds_context *context; 444 base += (ds_cfg.sizeof_field * field);
377 unsigned long buffer, adj; 445 return *(unsigned long *)base;
378 const unsigned long alignment = (1 << 3); 446}
379 unsigned long irq;
380 int error = 0;
381 447
382 if (!ds_cfg.sizeof_ds) 448static inline void bts_set(char *base, enum bts_field field, unsigned long val)
383 return -EOPNOTSUPP; 449{
450 base += (ds_cfg.sizeof_field * field);;
451 (*(unsigned long *)base) = val;
452}
384 453
385 /* we require some space to do alignment adjustments below */ 454
386 if (size < (alignment + ds_cfg.sizeof_rec[qual])) 455/*
456 * The raw BTS data is architecture dependent.
457 *
458 * For higher-level users, we give an arch-independent view.
459 * - ds.h defines struct bts_struct
460 * - bts_read translates one raw bts record into a bts_struct
461 * - bts_write translates one bts_struct into the raw format and
462 * writes it into the top of the parameter tracer's buffer.
463 *
464 * return: bytes read/written on success; -Eerrno, otherwise
465 */
466static int bts_read(struct bts_tracer *tracer, const void *at,
467 struct bts_struct *out)
468{
469 if (!tracer)
387 return -EINVAL; 470 return -EINVAL;
388 471
389 /* buffer overflow notification is not yet implemented */ 472 if (at < tracer->trace.ds.begin)
390 if (ovfl) 473 return -EINVAL;
391 return -EOPNOTSUPP;
392 474
475 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
476 return -EINVAL;
393 477
394 context = ds_alloc_context(task); 478 memset(out, 0, sizeof(*out));
395 if (!context) 479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
396 return -ENOMEM; 480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
482 out->variant.timestamp.pid = bts_get(at, bts_pid);
483 } else {
484 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from);
486 out->variant.lbr.to = bts_get(at, bts_to);
487
488 if (!out->variant.lbr.from && !out->variant.lbr.to)
489 out->qualifier = bts_invalid;
490 }
397 491
398 spin_lock_irqsave(&ds_lock, irq); 492 return ds_cfg.sizeof_rec[ds_bts];
493}
399 494
400 error = -EPERM; 495static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
401 if (!check_tracer(task)) 496{
402 goto out_unlock; 497 unsigned char raw[MAX_SIZEOF_BTS];
403 498
404 get_tracer(task); 499 if (!tracer)
500 return -EINVAL;
405 501
406 error = -EALREADY; 502 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
407 if (context->owner[qual] == current) 503 return -EOVERFLOW;
408 goto out_put_tracer;
409 error = -EPERM;
410 if (context->owner[qual] != NULL)
411 goto out_put_tracer;
412 context->owner[qual] = current;
413 504
414 spin_unlock_irqrestore(&ds_lock, irq); 505 switch (in->qualifier) {
506 case bts_invalid:
507 bts_set(raw, bts_from, 0);
508 bts_set(raw, bts_to, 0);
509 bts_set(raw, bts_flags, 0);
510 break;
511 case bts_branch:
512 bts_set(raw, bts_from, in->variant.lbr.from);
513 bts_set(raw, bts_to, in->variant.lbr.to);
514 bts_set(raw, bts_flags, 0);
515 break;
516 case bts_task_arrives:
517 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid);
521 break;
522 default:
523 return -EINVAL;
524 }
415 525
526 return ds_write(tracer->ds.context, ds_bts, raw,
527 ds_cfg.sizeof_rec[ds_bts]);
528}
416 529
417 error = -ENOMEM;
418 if (!base) {
419 base = ds_allocate_buffer(size, &context->pages[qual]);
420 if (!base)
421 goto out_release;
422 530
423 context->buffer[qual] = base; 531static void ds_write_config(struct ds_context *context,
424 } 532 struct ds_trace *cfg, enum ds_qualifier qual)
425 error = 0; 533{
534 unsigned char *ds = context->ds;
426 535
427 context->callback[qual] = ovfl; 536 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
537 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
538 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
539 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
540}
541
542static void ds_read_config(struct ds_context *context,
543 struct ds_trace *cfg, enum ds_qualifier qual)
544{
545 unsigned char *ds = context->ds;
546
547 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
548 cfg->top = (void *)ds_get(ds, qual, ds_index);
549 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
550 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
551}
552
553static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
554 void *base, size_t size, size_t ith,
555 unsigned int flags) {
556 unsigned long buffer, adj;
428 557
429 /* adjust the buffer address and size to meet alignment 558 /* adjust the buffer address and size to meet alignment
430 * constraints: 559 * constraints:
@@ -436,410 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size,
436 */ 565 */
437 buffer = (unsigned long)base; 566 buffer = (unsigned long)base;
438 567
439 adj = ALIGN(buffer, alignment) - buffer; 568 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
440 buffer += adj; 569 buffer += adj;
441 size -= adj; 570 size -= adj;
442 571
443 size /= ds_cfg.sizeof_rec[qual]; 572 trace->n = size / ds_cfg.sizeof_rec[qual];
444 size *= ds_cfg.sizeof_rec[qual]; 573 trace->size = ds_cfg.sizeof_rec[qual];
445
446 ds_set(context->ds, qual, ds_buffer_base, buffer);
447 ds_set(context->ds, qual, ds_index, buffer);
448 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
449
450 if (ovfl) {
451 /* todo: select a suitable interrupt threshold */
452 } else
453 ds_set(context->ds, qual,
454 ds_interrupt_threshold, buffer + size + 1);
455 574
456 /* we keep the context until ds_release */ 575 size = (trace->n * trace->size);
457 return error;
458
459 out_release:
460 context->owner[qual] = NULL;
461 ds_put_context(context);
462 put_tracer(task);
463 return error;
464 576
465 out_put_tracer: 577 trace->begin = (void *)buffer;
466 spin_unlock_irqrestore(&ds_lock, irq); 578 trace->top = trace->begin;
467 ds_put_context(context); 579 trace->end = (void *)(buffer + size);
468 put_tracer(task); 580 /* The value for 'no threshold' is -1, which will set the
469 return error; 581 * threshold outside of the buffer, just like we want it.
582 */
583 trace->ith = (void *)(buffer + size - ith);
470 584
471 out_unlock: 585 trace->flags = flags;
472 spin_unlock_irqrestore(&ds_lock, irq);
473 ds_put_context(context);
474 return error;
475} 586}
476 587
477int ds_request_bts(struct task_struct *task, void *base, size_t size,
478 ds_ovfl_callback_t ovfl)
479{
480 return ds_request(task, base, size, ovfl, ds_bts);
481}
482 588
483int ds_request_pebs(struct task_struct *task, void *base, size_t size, 589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
484 ds_ovfl_callback_t ovfl) 590 enum ds_qualifier qual, struct task_struct *task,
485{ 591 void *base, size_t size, size_t th, unsigned int flags)
486 return ds_request(task, base, size, ovfl, ds_pebs);
487}
488
489static int ds_release(struct task_struct *task, enum ds_qualifier qual)
490{ 592{
491 struct ds_context *context; 593 struct ds_context *context;
492 int error; 594 int error;
493 595
494 context = ds_get_context(task); 596 error = -EINVAL;
495 error = ds_validate_access(context, qual); 597 if (!base)
496 if (error < 0)
497 goto out; 598 goto out;
498 599
499 kfree(context->buffer[qual]); 600 /* we require some space to do alignment adjustments below */
500 context->buffer[qual] = NULL; 601 error = -EINVAL;
501 602 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
502 current->mm->total_vm -= context->pages[qual]; 603 goto out;
503 current->mm->locked_vm -= context->pages[qual];
504 context->pages[qual] = 0;
505 context->owner[qual] = NULL;
506
507 /*
508 * we put the context twice:
509 * once for the ds_get_context
510 * once for the corresponding ds_request
511 */
512 ds_put_context(context);
513 out:
514 ds_put_context(context);
515 return error;
516}
517 604
518int ds_release_bts(struct task_struct *task) 605 if (th != (size_t)-1) {
519{ 606 th *= ds_cfg.sizeof_rec[qual];
520 return ds_release(task, ds_bts);
521}
522 607
523int ds_release_pebs(struct task_struct *task) 608 error = -EINVAL;
524{ 609 if (size <= th)
525 return ds_release(task, ds_pebs); 610 goto out;
526} 611 }
527 612
528static int ds_get_index(struct task_struct *task, size_t *pos, 613 tracer->buffer = base;
529 enum ds_qualifier qual) 614 tracer->size = size;
530{
531 struct ds_context *context;
532 unsigned long base, index;
533 int error;
534 615
616 error = -ENOMEM;
535 context = ds_get_context(task); 617 context = ds_get_context(task);
536 error = ds_validate_access(context, qual); 618 if (!context)
537 if (error < 0)
538 goto out; 619 goto out;
620 tracer->context = context;
539 621
540 base = ds_get(context->ds, qual, ds_buffer_base); 622 ds_init_ds_trace(trace, qual, base, size, th, flags);
541 index = ds_get(context->ds, qual, ds_index);
542 623
543 error = ((index - base) / ds_cfg.sizeof_rec[qual]); 624 error = 0;
544 if (pos)
545 *pos = error;
546 out: 625 out:
547 ds_put_context(context);
548 return error; 626 return error;
549} 627}
550 628
551int ds_get_bts_index(struct task_struct *task, size_t *pos) 629struct bts_tracer *ds_request_bts(struct task_struct *task,
552{ 630 void *base, size_t size,
553 return ds_get_index(task, pos, ds_bts); 631 bts_ovfl_callback_t ovfl, size_t th,
554} 632 unsigned int flags)
555
556int ds_get_pebs_index(struct task_struct *task, size_t *pos)
557{
558 return ds_get_index(task, pos, ds_pebs);
559}
560
561static int ds_get_end(struct task_struct *task, size_t *pos,
562 enum ds_qualifier qual)
563{ 633{
564 struct ds_context *context; 634 struct bts_tracer *tracer;
565 unsigned long base, end; 635 unsigned long irq;
566 int error; 636 int error;
567 637
568 context = ds_get_context(task); 638 error = -EOPNOTSUPP;
569 error = ds_validate_access(context, qual); 639 if (!ds_cfg.ctl[dsf_bts])
570 if (error < 0)
571 goto out; 640 goto out;
572 641
573 base = ds_get(context->ds, qual, ds_buffer_base); 642 /* buffer overflow notification is not yet implemented */
574 end = ds_get(context->ds, qual, ds_absolute_maximum); 643 error = -EOPNOTSUPP;
644 if (ovfl)
645 goto out;
575 646
576 error = ((end - base) / ds_cfg.sizeof_rec[qual]); 647 error = -ENOMEM;
577 if (pos) 648 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
578 *pos = error; 649 if (!tracer)
579 out: 650 goto out;
580 ds_put_context(context); 651 tracer->ovfl = ovfl;
581 return error;
582}
583 652
584int ds_get_bts_end(struct task_struct *task, size_t *pos) 653 error = ds_request(&tracer->ds, &tracer->trace.ds,
585{ 654 ds_bts, task, base, size, th, flags);
586 return ds_get_end(task, pos, ds_bts); 655 if (error < 0)
587} 656 goto out_tracer;
588 657
589int ds_get_pebs_end(struct task_struct *task, size_t *pos)
590{
591 return ds_get_end(task, pos, ds_pebs);
592}
593 658
594static int ds_access(struct task_struct *task, size_t index, 659 spin_lock_irqsave(&ds_lock, irq);
595 const void **record, enum ds_qualifier qual)
596{
597 struct ds_context *context;
598 unsigned long base, idx;
599 int error;
600 660
601 if (!record) 661 error = -EPERM;
602 return -EINVAL; 662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
603 665
604 context = ds_get_context(task); 666 error = -EPERM;
605 error = ds_validate_access(context, qual); 667 if (tracer->ds.context->bts_master)
606 if (error < 0) 668 goto out_put_tracer;
607 goto out; 669 tracer->ds.context->bts_master = tracer;
608 670
609 base = ds_get(context->ds, qual, ds_buffer_base); 671 spin_unlock_irqrestore(&ds_lock, irq);
610 idx = base + (index * ds_cfg.sizeof_rec[qual]);
611 672
612 error = -EINVAL;
613 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
614 goto out;
615 673
616 *record = (const void *)idx; 674 tracer->trace.read = bts_read;
617 error = ds_cfg.sizeof_rec[qual]; 675 tracer->trace.write = bts_write;
618 out:
619 ds_put_context(context);
620 return error;
621}
622 676
623int ds_access_bts(struct task_struct *task, size_t index, const void **record) 677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
624{ 678 ds_resume_bts(tracer);
625 return ds_access(task, index, record, ds_bts);
626}
627 679
628int ds_access_pebs(struct task_struct *task, size_t index, const void **record) 680 return tracer;
629{ 681
630 return ds_access(task, index, record, ds_pebs); 682 out_put_tracer:
683 put_tracer(task);
684 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq);
686 ds_put_context(tracer->ds.context);
687 out_tracer:
688 kfree(tracer);
689 out:
690 return ERR_PTR(error);
631} 691}
632 692
633static int ds_write(struct task_struct *task, const void *record, size_t size, 693struct pebs_tracer *ds_request_pebs(struct task_struct *task,
634 enum ds_qualifier qual, int force) 694 void *base, size_t size,
695 pebs_ovfl_callback_t ovfl, size_t th,
696 unsigned int flags)
635{ 697{
636 struct ds_context *context; 698 struct pebs_tracer *tracer;
699 unsigned long irq;
637 int error; 700 int error;
638 701
639 if (!record) 702 /* buffer overflow notification is not yet implemented */
640 return -EINVAL; 703 error = -EOPNOTSUPP;
704 if (ovfl)
705 goto out;
641 706
642 error = -EPERM; 707 error = -ENOMEM;
643 context = ds_get_context(task); 708 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
644 if (!context) 709 if (!tracer)
645 goto out; 710 goto out;
711 tracer->ovfl = ovfl;
646 712
647 if (!force) { 713 error = ds_request(&tracer->ds, &tracer->trace.ds,
648 error = ds_validate_access(context, qual); 714 ds_pebs, task, base, size, th, flags);
649 if (error < 0) 715 if (error < 0)
650 goto out; 716 goto out_tracer;
651 }
652 717
653 error = 0; 718 spin_lock_irqsave(&ds_lock, irq);
654 while (size) {
655 unsigned long base, index, end, write_end, int_th;
656 unsigned long write_size, adj_write_size;
657 719
658 /* 720 error = -EPERM;
659 * write as much as possible without producing an 721 if (!check_tracer(task))
660 * overflow interrupt. 722 goto out_unlock;
661 * 723 get_tracer(task);
662 * interrupt_threshold must either be
663 * - bigger than absolute_maximum or
664 * - point to a record between buffer_base and absolute_maximum
665 *
666 * index points to a valid record.
667 */
668 base = ds_get(context->ds, qual, ds_buffer_base);
669 index = ds_get(context->ds, qual, ds_index);
670 end = ds_get(context->ds, qual, ds_absolute_maximum);
671 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
672 724
673 write_end = min(end, int_th); 725 error = -EPERM;
726 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer;
728 tracer->ds.context->pebs_master = tracer;
674 729
675 /* if we are already beyond the interrupt threshold, 730 spin_unlock_irqrestore(&ds_lock, irq);
676 * we fill the entire buffer */
677 if (write_end <= index)
678 write_end = end;
679 731
680 if (write_end <= index) 732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
681 goto out; 733 ds_resume_pebs(tracer);
682 734
683 write_size = min((unsigned long) size, write_end - index); 735 return tracer;
684 memcpy((void *)index, record, write_size);
685 736
686 record = (const char *)record + write_size; 737 out_put_tracer:
687 size -= write_size; 738 put_tracer(task);
688 error += write_size; 739 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq);
741 ds_put_context(tracer->ds.context);
742 out_tracer:
743 kfree(tracer);
744 out:
745 return ERR_PTR(error);
746}
689 747
690 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 748void ds_release_bts(struct bts_tracer *tracer)
691 adj_write_size *= ds_cfg.sizeof_rec[qual]; 749{
750 if (!tracer)
751 return;
692 752
693 /* zero out trailing bytes */ 753 ds_suspend_bts(tracer);
694 memset((char *)index + write_size, 0,
695 adj_write_size - write_size);
696 index += adj_write_size;
697 754
698 if (index >= end) 755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
699 index = base; 756 tracer->ds.context->bts_master = NULL;
700 ds_set(context->ds, qual, ds_index, index);
701 757
702 if (index >= int_th) 758 put_tracer(tracer->ds.context->task);
703 ds_overflow(task, context, qual); 759 ds_put_context(tracer->ds.context);
704 }
705 760
706 out: 761 kfree(tracer);
707 ds_put_context(context);
708 return error;
709} 762}
710 763
711int ds_write_bts(struct task_struct *task, const void *record, size_t size) 764void ds_suspend_bts(struct bts_tracer *tracer)
712{ 765{
713 return ds_write(task, record, size, ds_bts, /* force = */ 0); 766 struct task_struct *task;
714}
715 767
716int ds_write_pebs(struct task_struct *task, const void *record, size_t size) 768 if (!tracer)
717{ 769 return;
718 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
719}
720 770
721int ds_unchecked_write_bts(struct task_struct *task, 771 task = tracer->ds.context->task;
722 const void *record, size_t size)
723{
724 return ds_write(task, record, size, ds_bts, /* force = */ 1);
725}
726 772
727int ds_unchecked_write_pebs(struct task_struct *task, 773 if (!task || (task == current))
728 const void *record, size_t size) 774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
729{ 775
730 return ds_write(task, record, size, ds_pebs, /* force = */ 1); 776 if (task) {
777 task->thread.debugctlmsr &= ~BTS_CONTROL;
778
779 if (!task->thread.debugctlmsr)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
781 }
731} 782}
732 783
733static int ds_reset_or_clear(struct task_struct *task, 784void ds_resume_bts(struct bts_tracer *tracer)
734 enum ds_qualifier qual, int clear)
735{ 785{
736 struct ds_context *context; 786 struct task_struct *task;
737 unsigned long base, end; 787 unsigned long control;
738 int error;
739 788
740 context = ds_get_context(task); 789 if (!tracer)
741 error = ds_validate_access(context, qual); 790 return;
742 if (error < 0)
743 goto out;
744 791
745 base = ds_get(context->ds, qual, ds_buffer_base); 792 task = tracer->ds.context->task;
746 end = ds_get(context->ds, qual, ds_absolute_maximum);
747 793
748 if (clear) 794 control = ds_cfg.ctl[dsf_bts];
749 memset((void *)base, 0, end - base); 795 if (!(tracer->trace.ds.flags & BTS_KERNEL))
796 control |= ds_cfg.ctl[dsf_bts_kernel];
797 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user];
750 799
751 ds_set(context->ds, qual, ds_index, base); 800 if (task) {
801 task->thread.debugctlmsr |= control;
802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
803 }
752 804
753 error = 0; 805 if (!task || (task == current))
754 out: 806 update_debugctlmsr(get_debugctlmsr() | control);
755 ds_put_context(context);
756 return error;
757} 807}
758 808
759int ds_reset_bts(struct task_struct *task) 809void ds_release_pebs(struct pebs_tracer *tracer)
760{ 810{
761 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); 811 if (!tracer)
812 return;
813
814 ds_suspend_pebs(tracer);
815
816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
817 tracer->ds.context->pebs_master = NULL;
818
819 put_tracer(tracer->ds.context->task);
820 ds_put_context(tracer->ds.context);
821
822 kfree(tracer);
762} 823}
763 824
764int ds_reset_pebs(struct task_struct *task) 825void ds_suspend_pebs(struct pebs_tracer *tracer)
765{ 826{
766 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); 827
767} 828}
768 829
769int ds_clear_bts(struct task_struct *task) 830void ds_resume_pebs(struct pebs_tracer *tracer)
770{ 831{
771 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); 832
772} 833}
773 834
774int ds_clear_pebs(struct task_struct *task) 835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
775{ 836{
776 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); 837 if (!tracer)
838 return NULL;
839
840 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
841 return &tracer->trace;
777} 842}
778 843
779int ds_get_pebs_reset(struct task_struct *task, u64 *value) 844const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
780{ 845{
781 struct ds_context *context; 846 if (!tracer)
782 int error; 847 return NULL;
783 848
784 if (!value) 849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value =
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
852
853 return &tracer->trace;
854}
855
856int ds_reset_bts(struct bts_tracer *tracer)
857{
858 if (!tracer)
785 return -EINVAL; 859 return -EINVAL;
786 860
787 context = ds_get_context(task); 861 tracer->trace.ds.top = tracer->trace.ds.begin;
788 error = ds_validate_access(context, ds_pebs);
789 if (error < 0)
790 goto out;
791 862
792 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); 863 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
864 (unsigned long)tracer->trace.ds.top);
793 865
794 error = 0; 866 return 0;
795 out:
796 ds_put_context(context);
797 return error;
798} 867}
799 868
800int ds_set_pebs_reset(struct task_struct *task, u64 value) 869int ds_reset_pebs(struct pebs_tracer *tracer)
801{ 870{
802 struct ds_context *context; 871 if (!tracer)
803 int error; 872 return -EINVAL;
804 873
805 context = ds_get_context(task); 874 tracer->trace.ds.top = tracer->trace.ds.begin;
806 error = ds_validate_access(context, ds_pebs);
807 if (error < 0)
808 goto out;
809 875
810 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; 876 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
877 (unsigned long)tracer->trace.ds.top);
811 878
812 error = 0; 879 return 0;
813 out: 880}
814 ds_put_context(context); 881
815 return error; 882int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
883{
884 if (!tracer)
885 return -EINVAL;
886
887 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
888
889 return 0;
816} 890}
817 891
818static const struct ds_configuration ds_cfg_var = { 892static const struct ds_configuration ds_cfg_netburst = {
819 .sizeof_ds = sizeof(long) * 12, 893 .name = "netburst",
820 .sizeof_field = sizeof(long), 894 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
821 .sizeof_rec[ds_bts] = sizeof(long) * 3, 895 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6),
897
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
822#ifdef __i386__ 900#ifdef __i386__
823 .sizeof_rec[ds_pebs] = sizeof(long) * 10 901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
824#else 902#else
825 .sizeof_rec[ds_pebs] = sizeof(long) * 18 903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
826#endif 904#endif
827}; 905};
828static const struct ds_configuration ds_cfg_64 = { 906static const struct ds_configuration ds_cfg_pentium_m = {
829 .sizeof_ds = 8 * 12, 907 .name = "pentium m",
830 .sizeof_field = 8, 908 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
831 .sizeof_rec[ds_bts] = 8 * 3, 909
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
832#ifdef __i386__ 912#ifdef __i386__
833 .sizeof_rec[ds_pebs] = 8 * 10 913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
834#else 914#else
835 .sizeof_rec[ds_pebs] = 8 * 18 915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
836#endif 916#endif
837}; 917};
918static const struct ds_configuration ds_cfg_core2 = {
919 .name = "core 2",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10),
923
924 .sizeof_field = 8,
925 .sizeof_rec[ds_bts] = 8 * 3,
926 .sizeof_rec[ds_pebs] = 8 * 18,
927};
838 928
839static inline void 929static void
840ds_configure(const struct ds_configuration *cfg) 930ds_configure(const struct ds_configuration *cfg)
841{ 931{
932 memset(&ds_cfg, 0, sizeof(ds_cfg));
842 ds_cfg = *cfg; 933 ds_cfg = *cfg;
934
935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
936
937 if (!cpu_has_bts) {
938 ds_cfg.ctl[dsf_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n");
940 }
941 if (!cpu_has_pebs)
942 printk(KERN_INFO "[ds] pebs not available\n");
943
944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
843} 945}
844 946
845void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,16 +949,15 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
847 switch (c->x86) { 949 switch (c->x86) {
848 case 0x6: 950 case 0x6:
849 switch (c->x86_model) { 951 switch (c->x86_model) {
952 case 0 ... 0xC:
953 /* sorry, don't know about them */
954 break;
850 case 0xD: 955 case 0xD:
851 case 0xE: /* Pentium M */ 956 case 0xE: /* Pentium M */
852 ds_configure(&ds_cfg_var); 957 ds_configure(&ds_cfg_pentium_m);
853 break; 958 break;
854 case 0xF: /* Core2 */ 959 default: /* Core2, Atom, ... */
855 case 0x1C: /* Atom */ 960 ds_configure(&ds_cfg_core2);
856 ds_configure(&ds_cfg_64);
857 break;
858 default:
859 /* sorry, don't know about them */
860 break; 961 break;
861 } 962 }
862 break; 963 break;
@@ -865,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
865 case 0x0: 966 case 0x0:
866 case 0x1: 967 case 0x1:
867 case 0x2: /* Netburst */ 968 case 0x2: /* Netburst */
868 ds_configure(&ds_cfg_var); 969 ds_configure(&ds_cfg_netburst);
869 break; 970 break;
870 default: 971 default:
871 /* sorry, don't know about them */ 972 /* sorry, don't know about them */
@@ -878,12 +979,41 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
878 } 979 }
879} 980}
880 981
881void ds_free(struct ds_context *context) 982/*
882{ 983 * Change the DS configuration from tracing prev to tracing next.
883 /* This is called when the task owning the parameter context 984 */
884 * is dying. There should not be any user of that context left 985void ds_switch_to(struct task_struct *prev, struct task_struct *next)
885 * to disturb us, anymore. */ 986{
886 unsigned long leftovers = context->count; 987 struct ds_context *prev_ctx = prev->thread.ds_ctx;
887 while (leftovers--) 988 struct ds_context *next_ctx = next->thread.ds_ctx;
888 ds_put_context(context); 989
990 if (prev_ctx) {
991 update_debugctlmsr(0);
992
993 if (prev_ctx->bts_master &&
994 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
995 struct bts_struct ts = {
996 .qualifier = bts_task_departs,
997 .variant.timestamp.jiffies = jiffies_64,
998 .variant.timestamp.pid = prev->pid
999 };
1000 bts_write(prev_ctx->bts_master, &ts);
1001 }
1002 }
1003
1004 if (next_ctx) {
1005 if (next_ctx->bts_master &&
1006 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1007 struct bts_struct ts = {
1008 .qualifier = bts_task_arrives,
1009 .variant.timestamp.jiffies = jiffies_64,
1010 .variant.timestamp.pid = next->pid
1011 };
1012 bts_write(next_ctx->bts_master, &ts);
1013 }
1014
1015 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1016 }
1017
1018 update_debugctlmsr(next->thread.debugctlmsr);
889} 1019}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 000000000000..6b1f6f6f8661
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#include "dumpstack.h"
21
22int panic_on_unrecovered_nmi;
23unsigned int code_bytes = 64;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33#ifdef CONFIG_FUNCTION_GRAPH_TRACER
34static void
35print_ftrace_graph_addr(unsigned long addr, void *data,
36 const struct stacktrace_ops *ops,
37 struct thread_info *tinfo, int *graph)
38{
39 struct task_struct *task = tinfo->task;
40 unsigned long ret_addr;
41 int index = task->curr_ret_stack;
42
43 if (addr != (unsigned long)return_to_handler)
44 return;
45
46 if (!task->ret_stack || index < *graph)
47 return;
48
49 index -= *graph;
50 ret_addr = task->ret_stack[index].ret;
51
52 ops->address(data, ret_addr, 1);
53
54 (*graph)++;
55}
56#else
57static inline void
58print_ftrace_graph_addr(unsigned long addr, void *data,
59 const struct stacktrace_ops *ops,
60 struct thread_info *tinfo, int *graph)
61{ }
62#endif
63
64/*
65 * x86-64 can have up to three kernel stacks:
66 * process stack
67 * interrupt stack
68 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
69 */
70
71static inline int valid_stack_ptr(struct thread_info *tinfo,
72 void *p, unsigned int size, void *end)
73{
74 void *t = tinfo;
75 if (end) {
76 if (p < end && p >= (end-THREAD_SIZE))
77 return 1;
78 else
79 return 0;
80 }
81 return p > t && p < t + THREAD_SIZE - size;
82}
83
84unsigned long
85print_context_stack(struct thread_info *tinfo,
86 unsigned long *stack, unsigned long bp,
87 const struct stacktrace_ops *ops, void *data,
88 unsigned long *end, int *graph)
89{
90 struct stack_frame *frame = (struct stack_frame *)bp;
91
92 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
93 unsigned long addr;
94
95 addr = *stack;
96 if (__kernel_text_address(addr)) {
97 if ((unsigned long) stack == bp + sizeof(long)) {
98 ops->address(data, addr, 1);
99 frame = frame->next_frame;
100 bp = (unsigned long) frame;
101 } else {
102 ops->address(data, addr, bp == 0);
103 }
104 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
105 }
106 stack++;
107 }
108 return bp;
109}
110
111
112static void
113print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
114{
115 printk(data);
116 print_symbol(msg, symbol);
117 printk("\n");
118}
119
120static void print_trace_warning(void *data, char *msg)
121{
122 printk("%s%s\n", (char *)data, msg);
123}
124
125static int print_trace_stack(void *data, char *name)
126{
127 printk("%s <%s> ", (char *)data, name);
128 return 0;
129}
130
131/*
132 * Print one address/symbol entries per line.
133 */
134static void print_trace_address(void *data, unsigned long addr, int reliable)
135{
136 touch_nmi_watchdog();
137 printk(data);
138 printk_address(addr, reliable);
139}
140
141static const struct stacktrace_ops print_trace_ops = {
142 .warning = print_trace_warning,
143 .warning_symbol = print_trace_warning_symbol,
144 .stack = print_trace_stack,
145 .address = print_trace_address,
146};
147
148void
149show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
150 unsigned long *stack, unsigned long bp, char *log_lvl)
151{
152 printk("%sCall Trace:\n", log_lvl);
153 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
154}
155
156void show_trace(struct task_struct *task, struct pt_regs *regs,
157 unsigned long *stack, unsigned long bp)
158{
159 show_trace_log_lvl(task, regs, stack, bp, "");
160}
161
162void show_stack(struct task_struct *task, unsigned long *sp)
163{
164 show_stack_log_lvl(task, NULL, sp, 0, "");
165}
166
167/*
168 * The architecture-independent dump_stack generator
169 */
170void dump_stack(void)
171{
172 unsigned long bp = 0;
173 unsigned long stack;
174
175#ifdef CONFIG_FRAME_POINTER
176 if (!bp)
177 get_bp(bp);
178#endif
179
180 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
181 current->pid, current->comm, print_tainted(),
182 init_utsname()->release,
183 (int)strcspn(init_utsname()->version, " "),
184 init_utsname()->version);
185 show_trace(NULL, NULL, &stack, bp);
186}
187EXPORT_SYMBOL(dump_stack);
188
189static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
190static int die_owner = -1;
191static unsigned int die_nest_count;
192
193unsigned __kprobes long oops_begin(void)
194{
195 int cpu;
196 unsigned long flags;
197
198 oops_enter();
199
200 /* racy, but better than risking deadlock. */
201 raw_local_irq_save(flags);
202 cpu = smp_processor_id();
203 if (!__raw_spin_trylock(&die_lock)) {
204 if (cpu == die_owner)
205 /* nested oops. should stop eventually */;
206 else
207 __raw_spin_lock(&die_lock);
208 }
209 die_nest_count++;
210 die_owner = cpu;
211 console_verbose();
212 bust_spinlocks(1);
213 return flags;
214}
215
216void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
217{
218 if (regs && kexec_should_crash(current))
219 crash_kexec(regs);
220
221 bust_spinlocks(0);
222 die_owner = -1;
223 add_taint(TAINT_DIE);
224 die_nest_count--;
225 if (!die_nest_count)
226 /* Nest count reaches zero, release the lock. */
227 __raw_spin_unlock(&die_lock);
228 raw_local_irq_restore(flags);
229 oops_exit();
230
231 if (!signr)
232 return;
233 if (in_interrupt())
234 panic("Fatal exception in interrupt");
235 if (panic_on_oops)
236 panic("Fatal exception");
237 do_exit(signr);
238}
239
240int __kprobes __die(const char *str, struct pt_regs *regs, long err)
241{
242#ifdef CONFIG_X86_32
243 unsigned short ss;
244 unsigned long sp;
245#endif
246 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
247#ifdef CONFIG_PREEMPT
248 printk("PREEMPT ");
249#endif
250#ifdef CONFIG_SMP
251 printk("SMP ");
252#endif
253#ifdef CONFIG_DEBUG_PAGEALLOC
254 printk("DEBUG_PAGEALLOC");
255#endif
256 printk("\n");
257 sysfs_printk_last_file();
258 if (notify_die(DIE_OOPS, str, regs, err,
259 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
260 return 1;
261
262 show_registers(regs);
263#ifdef CONFIG_X86_32
264 sp = (unsigned long) (&regs->sp);
265 savesegment(ss, ss);
266 if (user_mode(regs)) {
267 sp = regs->sp;
268 ss = regs->ss & 0xffff;
269 }
270 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
271 print_symbol("%s", regs->ip);
272 printk(" SS:ESP %04x:%08lx\n", ss, sp);
273#else
274 /* Executive summary in case the oops scrolled away */
275 printk(KERN_ALERT "RIP ");
276 printk_address(regs->ip, 1);
277 printk(" RSP <%016lx>\n", regs->sp);
278#endif
279 return 0;
280}
281
282/*
283 * This is gone through when something in the kernel has done something bad
284 * and is about to be terminated:
285 */
286void die(const char *str, struct pt_regs *regs, long err)
287{
288 unsigned long flags = oops_begin();
289 int sig = SIGSEGV;
290
291 if (!user_mode_vm(regs))
292 report_bug(regs->ip, regs);
293
294 if (__die(str, regs, err))
295 sig = 0;
296 oops_end(flags, regs, sig);
297}
298
299void notrace __kprobes
300die_nmi(char *str, struct pt_regs *regs, int do_panic)
301{
302 unsigned long flags;
303
304 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
305 return;
306
307 /*
308 * We are in trouble anyway, lets at least try
309 * to get a message out.
310 */
311 flags = oops_begin();
312 printk(KERN_EMERG "%s", str);
313 printk(" on CPU%d, ip %08lx, registers:\n",
314 smp_processor_id(), regs->ip);
315 show_registers(regs);
316 oops_end(flags, regs, 0);
317 if (do_panic || panic_on_oops)
318 panic("Non maskable interrupt");
319 nmi_exit();
320 local_irq_enable();
321 do_exit(SIGBUS);
322}
323
324static int __init oops_setup(char *s)
325{
326 if (!s)
327 return -EINVAL;
328 if (!strcmp(s, "panic"))
329 panic_on_oops = 1;
330 return 0;
331}
332early_param("oops", oops_setup);
333
334static int __init kstack_setup(char *s)
335{
336 if (!s)
337 return -EINVAL;
338 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
339 return 0;
340}
341early_param("kstack", kstack_setup);
342
343static int __init code_bytes_setup(char *s)
344{
345 code_bytes = simple_strtoul(s, NULL, 0);
346 if (code_bytes > 8192)
347 code_bytes = 8192;
348
349 return 1;
350}
351__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 000000000000..da87590b8698
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl);
26
27extern void
28show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl);
30
31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33
34/* The form of the top of the frame on the stack */
35struct stack_frame {
36 struct stack_frame *next_frame;
37 unsigned long return_address;
38};
39#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b3614752197b..d593cd1f58dc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 8 20#include "dumpstack.h"
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78 21
79void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data) 24 const struct stacktrace_ops *ops, void *data)
82{ 25{
26 int graph = 0;
27
83 if (!task) 28 if (!task)
84 task = current; 29 task = current;
85 30
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
107 52
108 context = (struct thread_info *) 53 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 54 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL); 55 bp = print_context_stack(context, stack, bp, ops,
56 data, NULL, &graph);
111 57
112 stack = (unsigned long *)context->previous_esp; 58 stack = (unsigned long *)context->previous_esp;
113 if (!stack) 59 if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
119} 65}
120EXPORT_SYMBOL(dump_trace); 66EXPORT_SYMBOL(dump_trace);
121 67
122static void 68void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entries per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 69show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl) 70 unsigned long *sp, unsigned long bp, char *log_lvl)
175{ 71{
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 92 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197} 93}
198 94
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226 95
227void show_registers(struct pt_regs *regs) 96void show_registers(struct pt_regs *regs)
228{ 97{
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
283 return ud2 == 0x0b0f; 152 return ud2 == 0x0b0f;
284} 153}
285 154
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, lets at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in kernel we are probably nested up pretty bad
411 * and might aswell get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7da8a7..c302d0707048 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 4 20#include "dumpstack.h"
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33 21
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp) 23 unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 101 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */ 102 */
115 103
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs, 104void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp, 105 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0; 110 unsigned used = 0;
168 struct thread_info *tinfo; 111 struct thread_info *tinfo;
112 int graph = 0;
169 113
170 if (!task) 114 if (!task)
171 task = current; 115 task = current;
@@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
206 break; 150 break;
207 151
208 bp = print_context_stack(tinfo, stack, bp, ops, 152 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end); 153 data, estack_end, &graph);
210 ops->stack(data, "<EOE>"); 154 ops->stack(data, "<EOE>");
211 /* 155 /*
212 * We link to the next stack via the 156 * We link to the next stack via the
@@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
225 if (ops->stack(data, "IRQ") < 0) 169 if (ops->stack(data, "IRQ") < 0)
226 break; 170 break;
227 bp = print_context_stack(tinfo, stack, bp, 171 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end); 172 ops, data, irqstack_end, &graph);
229 /* 173 /*
230 * We link to the next stack (which would be 174 * We link to the next stack (which would be
231 * the process stack normally) the last 175 * the process stack normally) the last
@@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
243 /* 187 /*
244 * This handles the process stack: 188 * This handles the process stack:
245 */ 189 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); 190 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
247 put_cpu(); 191 put_cpu();
248} 192}
249EXPORT_SYMBOL(dump_trace); 193EXPORT_SYMBOL(dump_trace);
250 194
251static void 195void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entries per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 196show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl) 197 unsigned long *sp, unsigned long bp, char *log_lvl)
304{ 198{
@@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 236 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343} 237}
344 238
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs) 239void show_registers(struct pt_regs *regs)
373{ 240{
374 int i; 241 int i;
@@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
429 return ud2 == 0x0b0f; 296 return ud2 == 0x0b0f;
430} 297}
431 298
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, lets at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca1..43ceb3f454bf 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
1157END(mcount) 1157END(mcount)
1158 1158
1159ENTRY(ftrace_caller) 1159ENTRY(ftrace_caller)
1160 cmpl $0, function_trace_stop
1161 jne ftrace_stub
1162
1160 pushl %eax 1163 pushl %eax
1161 pushl %ecx 1164 pushl %ecx
1162 pushl %edx 1165 pushl %edx
@@ -1171,6 +1174,11 @@ ftrace_call:
1171 popl %edx 1174 popl %edx
1172 popl %ecx 1175 popl %ecx
1173 popl %eax 1176 popl %eax
1177#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1178.globl ftrace_graph_call
1179ftrace_graph_call:
1180 jmp ftrace_stub
1181#endif
1174 1182
1175.globl ftrace_stub 1183.globl ftrace_stub
1176ftrace_stub: 1184ftrace_stub:
@@ -1180,8 +1188,18 @@ END(ftrace_caller)
1180#else /* ! CONFIG_DYNAMIC_FTRACE */ 1188#else /* ! CONFIG_DYNAMIC_FTRACE */
1181 1189
1182ENTRY(mcount) 1190ENTRY(mcount)
1191 cmpl $0, function_trace_stop
1192 jne ftrace_stub
1193
1183 cmpl $ftrace_stub, ftrace_trace_function 1194 cmpl $ftrace_stub, ftrace_trace_function
1184 jnz trace 1195 jnz trace
1196#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1197 cmpl $ftrace_stub, ftrace_graph_return
1198 jnz ftrace_graph_caller
1199
1200 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
1201 jnz ftrace_graph_caller
1202#endif
1185.globl ftrace_stub 1203.globl ftrace_stub
1186ftrace_stub: 1204ftrace_stub:
1187 ret 1205 ret
@@ -1200,12 +1218,43 @@ trace:
1200 popl %edx 1218 popl %edx
1201 popl %ecx 1219 popl %ecx
1202 popl %eax 1220 popl %eax
1203
1204 jmp ftrace_stub 1221 jmp ftrace_stub
1205END(mcount) 1222END(mcount)
1206#endif /* CONFIG_DYNAMIC_FTRACE */ 1223#endif /* CONFIG_DYNAMIC_FTRACE */
1207#endif /* CONFIG_FUNCTION_TRACER */ 1224#endif /* CONFIG_FUNCTION_TRACER */
1208 1225
1226#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1227ENTRY(ftrace_graph_caller)
1228 cmpl $0, function_trace_stop
1229 jne ftrace_stub
1230
1231 pushl %eax
1232 pushl %ecx
1233 pushl %edx
1234 movl 0xc(%esp), %edx
1235 lea 0x4(%ebp), %eax
1236 subl $MCOUNT_INSN_SIZE, %edx
1237 call prepare_ftrace_return
1238 popl %edx
1239 popl %ecx
1240 popl %eax
1241 ret
1242END(ftrace_graph_caller)
1243
1244.globl return_to_handler
1245return_to_handler:
1246 pushl $0
1247 pushl %eax
1248 pushl %ecx
1249 pushl %edx
1250 call ftrace_return_to_handler
1251 movl %eax, 0xc(%esp)
1252 popl %edx
1253 popl %ecx
1254 popl %eax
1255 ret
1256#endif
1257
1209.section .rodata,"a" 1258.section .rodata,"a"
1210#include "syscall_table_32.S" 1259#include "syscall_table_32.S"
1211 1260
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a6..303dd84d2a98 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -68,16 +68,10 @@ ENTRY(mcount)
68END(mcount) 68END(mcount)
69 69
70ENTRY(ftrace_caller) 70ENTRY(ftrace_caller)
71 cmpl $0, function_trace_stop
72 jne ftrace_stub
71 73
72 /* taken from glibc */ 74 MCOUNT_SAVE_FRAME
73 subq $0x38, %rsp
74 movq %rax, (%rsp)
75 movq %rcx, 8(%rsp)
76 movq %rdx, 16(%rsp)
77 movq %rsi, 24(%rsp)
78 movq %rdi, 32(%rsp)
79 movq %r8, 40(%rsp)
80 movq %r9, 48(%rsp)
81 75
82 movq 0x38(%rsp), %rdi 76 movq 0x38(%rsp), %rdi
83 movq 8(%rbp), %rsi 77 movq 8(%rbp), %rsi
@@ -87,14 +81,13 @@ ENTRY(ftrace_caller)
87ftrace_call: 81ftrace_call:
88 call ftrace_stub 82 call ftrace_stub
89 83
90 movq 48(%rsp), %r9 84 MCOUNT_RESTORE_FRAME
91 movq 40(%rsp), %r8 85
92 movq 32(%rsp), %rdi 86#ifdef CONFIG_FUNCTION_GRAPH_TRACER
93 movq 24(%rsp), %rsi 87.globl ftrace_graph_call
94 movq 16(%rsp), %rdx 88ftrace_graph_call:
95 movq 8(%rsp), %rcx 89 jmp ftrace_stub
96 movq (%rsp), %rax 90#endif
97 addq $0x38, %rsp
98 91
99.globl ftrace_stub 92.globl ftrace_stub
100ftrace_stub: 93ftrace_stub:
@@ -103,15 +96,63 @@ END(ftrace_caller)
103 96
104#else /* ! CONFIG_DYNAMIC_FTRACE */ 97#else /* ! CONFIG_DYNAMIC_FTRACE */
105ENTRY(mcount) 98ENTRY(mcount)
99 cmpl $0, function_trace_stop
100 jne ftrace_stub
101
106 cmpq $ftrace_stub, ftrace_trace_function 102 cmpq $ftrace_stub, ftrace_trace_function
107 jnz trace 103 jnz trace
104
105#ifdef CONFIG_FUNCTION_GRAPH_TRACER
106 cmpq $ftrace_stub, ftrace_graph_return
107 jnz ftrace_graph_caller
108
109 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
110 jnz ftrace_graph_caller
111#endif
112
108.globl ftrace_stub 113.globl ftrace_stub
109ftrace_stub: 114ftrace_stub:
110 retq 115 retq
111 116
112trace: 117trace:
113 /* taken from glibc */ 118 MCOUNT_SAVE_FRAME
114 subq $0x38, %rsp 119
120 movq 0x38(%rsp), %rdi
121 movq 8(%rbp), %rsi
122 subq $MCOUNT_INSN_SIZE, %rdi
123
124 call *ftrace_trace_function
125
126 MCOUNT_RESTORE_FRAME
127
128 jmp ftrace_stub
129END(mcount)
130#endif /* CONFIG_DYNAMIC_FTRACE */
131#endif /* CONFIG_FUNCTION_TRACER */
132
133#ifdef CONFIG_FUNCTION_GRAPH_TRACER
134ENTRY(ftrace_graph_caller)
135 cmpl $0, function_trace_stop
136 jne ftrace_stub
137
138 MCOUNT_SAVE_FRAME
139
140 leaq 8(%rbp), %rdi
141 movq 0x38(%rsp), %rsi
142 subq $MCOUNT_INSN_SIZE, %rsi
143
144 call prepare_ftrace_return
145
146 MCOUNT_RESTORE_FRAME
147
148 retq
149END(ftrace_graph_caller)
150
151
152.globl return_to_handler
153return_to_handler:
154 subq $80, %rsp
155
115 movq %rax, (%rsp) 156 movq %rax, (%rsp)
116 movq %rcx, 8(%rsp) 157 movq %rcx, 8(%rsp)
117 movq %rdx, 16(%rsp) 158 movq %rdx, 16(%rsp)
@@ -119,13 +160,14 @@ trace:
119 movq %rdi, 32(%rsp) 160 movq %rdi, 32(%rsp)
120 movq %r8, 40(%rsp) 161 movq %r8, 40(%rsp)
121 movq %r9, 48(%rsp) 162 movq %r9, 48(%rsp)
163 movq %r10, 56(%rsp)
164 movq %r11, 64(%rsp)
122 165
123 movq 0x38(%rsp), %rdi 166 call ftrace_return_to_handler
124 movq 8(%rbp), %rsi
125 subq $MCOUNT_INSN_SIZE, %rdi
126
127 call *ftrace_trace_function
128 167
168 movq %rax, 72(%rsp)
169 movq 64(%rsp), %r11
170 movq 56(%rsp), %r10
129 movq 48(%rsp), %r9 171 movq 48(%rsp), %r9
130 movq 40(%rsp), %r8 172 movq 40(%rsp), %r8
131 movq 32(%rsp), %rdi 173 movq 32(%rsp), %rdi
@@ -133,12 +175,10 @@ trace:
133 movq 16(%rsp), %rdx 175 movq 16(%rsp), %rdx
134 movq 8(%rsp), %rcx 176 movq 8(%rsp), %rcx
135 movq (%rsp), %rax 177 movq (%rsp), %rax
136 addq $0x38, %rsp 178 addq $72, %rsp
179 retq
180#endif
137 181
138 jmp ftrace_stub
139END(mcount)
140#endif /* CONFIG_DYNAMIC_FTRACE */
141#endif /* CONFIG_FUNCTION_TRACER */
142 182
143#ifndef CONFIG_PREEMPT 183#ifndef CONFIG_PREEMPT
144#define retint_kernel retint_restore_args 184#define retint_kernel retint_restore_args
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9bf..1b43086b097a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
20#include <asm/ftrace.h> 21#include <asm/ftrace.h>
22#include <linux/ftrace.h>
21#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/nmi.h>
22 25
23 26
24static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; 27#ifdef CONFIG_DYNAMIC_FTRACE
25 28
26union ftrace_code_union { 29union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 30 char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
31 } __attribute__((packed)); 34 } __attribute__((packed));
32}; 35};
33 36
34
35static int ftrace_calc_offset(long ip, long addr) 37static int ftrace_calc_offset(long ip, long addr)
36{ 38{
37 return (int)(addr - ip); 39 return (int)(addr - ip);
38} 40}
39 41
40unsigned char *ftrace_nop_replace(void) 42static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
41{
42 return ftrace_nop;
43}
44
45unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{ 43{
47 static union ftrace_code_union calc; 44 static union ftrace_code_union calc;
48 45
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
56 return calc.code; 53 return calc.code;
57} 54}
58 55
59int 56/*
57 * Modifying code must take extra care. On an SMP machine, if
58 * the code being modified is also being executed on another CPU
59 * that CPU will have undefined results and possibly take a GPF.
60 * We use kstop_machine to stop other CPUS from exectuing code.
61 * But this does not stop NMIs from happening. We still need
62 * to protect against that. We separate out the modification of
63 * the code to take care of this.
64 *
65 * Two buffers are added: An IP buffer and a "code" buffer.
66 *
67 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code
70 * 3) Wait for any running NMIs to finish.
71 * 4) Write the code
72 * 5) clear the flag.
73 * 6) Wait for any running NMIs to finish.
74 *
75 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write
77 * and if it is, it will write what is in the IP and "code" buffers.
78 *
79 * The trick is, it does not matter if everyone is writing the same
80 * content to the code location. Also, if a CPU is executing code
81 * it is OK to write to that code location if the contents being written
82 * are the same as what exists.
83 */
84
85static atomic_t in_nmi = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */
90
91static unsigned nmi_wait_count;
92static atomic_t nmi_update_count = ATOMIC_INIT(0);
93
94int ftrace_arch_read_dyn_info(char *buf, int size)
95{
96 int r;
97
98 r = snprintf(buf, size, "%u %u",
99 nmi_wait_count,
100 atomic_read(&nmi_update_count));
101 return r;
102}
103
104static void ftrace_mod_code(void)
105{
106 /*
107 * Yes, more than one CPU process can be writing to mod_code_status.
108 * (and the code itself)
109 * But if one were to fail, then they all should, and if one were
110 * to succeed, then they all should.
111 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE);
114}
115
116void ftrace_nmi_enter(void)
117{
118 atomic_inc(&in_nmi);
119 /* Must have in_nmi seen before reading write flag */
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code();
123 atomic_inc(&nmi_update_count);
124 }
125}
126
127void ftrace_nmi_exit(void)
128{
129 /* Finish all executions before clearing in_nmi */
130 smp_wmb();
131 atomic_dec(&in_nmi);
132}
133
134static void wait_for_nmi(void)
135{
136 int waited = 0;
137
138 while (atomic_read(&in_nmi)) {
139 waited = 1;
140 cpu_relax();
141 }
142
143 if (waited)
144 nmi_wait_count++;
145}
146
147static int
148do_ftrace_mod_code(unsigned long ip, void *new_code)
149{
150 mod_code_ip = (void *)ip;
151 mod_code_newcode = new_code;
152
153 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb();
160
161 wait_for_nmi();
162
163 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb();
165
166 ftrace_mod_code();
167
168 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb();
175
176 wait_for_nmi();
177
178 return mod_code_status;
179}
180
181
182
183
184static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
185
186static unsigned char *ftrace_nop_replace(void)
187{
188 return ftrace_nop;
189}
190
191static int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 192ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 193 unsigned char *new_code)
62{ 194{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
81 return -EINVAL; 213 return -EINVAL;
82 214
83 /* replace the text with the new text */ 215 /* replace the text with the new text */
84 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) 216 if (do_ftrace_mod_code(ip, new_code))
85 return -EPERM; 217 return -EPERM;
86 218
87 sync_core(); 219 sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
89 return 0; 221 return 0;
90} 222}
91 223
224int ftrace_make_nop(struct module *mod,
225 struct dyn_ftrace *rec, unsigned long addr)
226{
227 unsigned char *new, *old;
228 unsigned long ip = rec->ip;
229
230 old = ftrace_call_replace(ip, addr);
231 new = ftrace_nop_replace();
232
233 return ftrace_modify_code(rec->ip, old, new);
234}
235
236int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
237{
238 unsigned char *new, *old;
239 unsigned long ip = rec->ip;
240
241 old = ftrace_nop_replace();
242 new = ftrace_call_replace(ip, addr);
243
244 return ftrace_modify_code(rec->ip, old, new);
245}
246
92int ftrace_update_ftrace_func(ftrace_func_t func) 247int ftrace_update_ftrace_func(ftrace_func_t func)
93{ 248{
94 unsigned long ip = (unsigned long)(&ftrace_call); 249 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
165 320
166 return 0; 321 return 0;
167} 322}
323#endif
324
325#ifdef CONFIG_FUNCTION_GRAPH_TRACER
326
327#ifdef CONFIG_DYNAMIC_FTRACE
328extern void ftrace_graph_call(void);
329
330static int ftrace_mod_jmp(unsigned long ip,
331 int old_offset, int new_offset)
332{
333 unsigned char code[MCOUNT_INSN_SIZE];
334
335 if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
336 return -EFAULT;
337
338 if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
339 return -EINVAL;
340
341 *(int *)(&code[1]) = new_offset;
342
343 if (do_ftrace_mod_code(ip, &code))
344 return -EPERM;
345
346 return 0;
347}
348
349int ftrace_enable_ftrace_graph_caller(void)
350{
351 unsigned long ip = (unsigned long)(&ftrace_graph_call);
352 int old_offset, new_offset;
353
354 old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
355 new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
356
357 return ftrace_mod_jmp(ip, old_offset, new_offset);
358}
359
360int ftrace_disable_ftrace_graph_caller(void)
361{
362 unsigned long ip = (unsigned long)(&ftrace_graph_call);
363 int old_offset, new_offset;
364
365 old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
366 new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
367
368 return ftrace_mod_jmp(ip, old_offset, new_offset);
369}
370
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */
391
392/* Add a function return address to the trace stack on thread info.*/
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
417/* Retrieve a function return address to the trace stack on thread info.*/
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
427 /* Might as well panic, otherwise we have no where to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/*
466 * Hook the return address and push it in the stack of return addrs
467 * in current thread info.
468 */
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{
471 unsigned long old;
472 unsigned long long calltime;
473 int faulted;
474 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long)
476 &return_to_handler;
477
478 /* Nmi's are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi)))
480 return;
481
482 if (unlikely(atomic_read(&current->tracing_graph_pause)))
483 return;
484
485 /*
486 * Protect against fault, even if it shouldn't
487 * happen. This tool is too much intrusive to
488 * ignore such a protection.
489 */
490 asm volatile(
491 "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
492 "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
493 " movl $0, %[faulted]\n"
494
495 ".section .fixup, \"ax\"\n"
496 "3: movl $1, %[faulted]\n"
497 ".previous\n"
498
499 _ASM_EXTABLE(1b, 3b)
500 _ASM_EXTABLE(2b, 3b)
501
502 : [parent_replaced] "=r" (parent), [old] "=r" (old),
503 [faulted] "=r" (faulted)
504 : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
505 : "memory"
506 );
507
508 if (unlikely(faulted)) {
509 ftrace_graph_stop();
510 WARN_ON(1);
511 return;
512 }
513
514 if (unlikely(!__kernel_text_address(old))) {
515 ftrace_graph_stop();
516 *parent = old;
517 WARN_ON(1);
518 return;
519 }
520
521 calltime = cpu_clock(raw_smp_processor_id());
522
523 if (push_return_trace(old, calltime,
524 self_addr, &trace.depth) == -EBUSY) {
525 *parent = old;
526 return;
527 }
528
529 trace.func = self_addr;
530
531 /* Only trace if the calling function expects to */
532 if (!ftrace_graph_entry(&trace)) {
533 current->curr_ret_stack--;
534 *parent = old;
535 }
536}
537#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a0..11c65e811ffe 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,6 +13,7 @@
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/ftrace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io_apic.h> 18#include <asm/io_apic.h>
18#include <asm/idle.h> 19#include <asm/idle.h>
@@ -47,7 +48,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
47 * SMP cross-CPU interrupts have their own specific 48 * SMP cross-CPU interrupts have their own specific
48 * handlers). 49 * handlers).
49 */ 50 */
50asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 51asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
51{ 52{
52 struct pt_regs *old_regs = set_irq_regs(regs); 53 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc; 54 struct irq_desc *desc;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d8..c27af49a4ede 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/clockchips.h> 9#include <linux/clockchips.h>
10#include <linux/ftrace.h>
10#include <asm/system.h> 11#include <asm/system.h>
11 12
12unsigned long idle_halt; 13unsigned long idle_halt;
@@ -100,6 +101,9 @@ static inline int hlt_use_halt(void)
100void default_idle(void) 101void default_idle(void)
101{ 102{
102 if (hlt_use_halt()) { 103 if (hlt_use_halt()) {
104 struct power_trace it;
105
106 trace_power_start(&it, POWER_CSTATE, 1);
103 current_thread_info()->status &= ~TS_POLLING; 107 current_thread_info()->status &= ~TS_POLLING;
104 /* 108 /*
105 * TS_POLLING-cleared state must be visible before we 109 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +116,7 @@ void default_idle(void)
112 else 116 else
113 local_irq_enable(); 117 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING; 118 current_thread_info()->status |= TS_POLLING;
119 trace_power_end(&it);
115 } else { 120 } else {
116 local_irq_enable(); 121 local_irq_enable();
117 /* loop is done by the caller */ 122 /* loop is done by the caller */
@@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
154 */ 159 */
155void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 160void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
156{ 161{
162 struct power_trace it;
163
164 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
157 if (!need_resched()) { 165 if (!need_resched()) {
158 __monitor((void *)&current_thread_info()->flags, 0, 0); 166 __monitor((void *)&current_thread_info()->flags, 0, 0);
159 smp_mb(); 167 smp_mb();
160 if (!need_resched()) 168 if (!need_resched())
161 __mwait(ax, cx); 169 __mwait(ax, cx);
162 } 170 }
171 trace_power_end(&it);
163} 172}
164 173
165/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 174/* Default MONITOR/MWAIT with no hints, used for default C1 state */
166static void mwait_idle(void) 175static void mwait_idle(void)
167{ 176{
177 struct power_trace it;
168 if (!need_resched()) { 178 if (!need_resched()) {
179 trace_power_start(&it, POWER_CSTATE, 1);
169 __monitor((void *)&current_thread_info()->flags, 0, 0); 180 __monitor((void *)&current_thread_info()->flags, 0, 0);
170 smp_mb(); 181 smp_mb();
171 if (!need_resched()) 182 if (!need_resched())
172 __sti_mwait(0, 0); 183 __sti_mwait(0, 0);
173 else 184 else
174 local_irq_enable(); 185 local_irq_enable();
186 trace_power_end(&it);
175 } else 187 } else
176 local_irq_enable(); 188 local_irq_enable();
177} 189}
@@ -183,9 +195,13 @@ static void mwait_idle(void)
183 */ 195 */
184static void poll_idle(void) 196static void poll_idle(void)
185{ 197{
198 struct power_trace it;
199
200 trace_power_start(&it, POWER_CSTATE, 0);
186 local_irq_enable(); 201 local_irq_enable();
187 while (!need_resched()) 202 while (!need_resched())
188 cpu_relax(); 203 cpu_relax();
204 trace_power_end(&it);
189} 205}
190 206
191/* 207/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..605eff9a8ac0 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h> 40#include <linux/dmi.h>
41#include <linux/ftrace.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
@@ -251,11 +252,14 @@ void exit_thread(void)
251 put_cpu(); 252 put_cpu();
252 } 253 }
253#ifdef CONFIG_X86_DS 254#ifdef CONFIG_X86_DS
254 /* Free any DS contexts that have not been properly released. */ 255 /* Free any BTS tracers that have not been properly released. */
255 if (unlikely(current->thread.ds_ctx)) { 256 if (unlikely(current->bts)) {
256 /* we clear debugctl to make sure DS is not used. */ 257 ds_release_bts(current->bts);
257 update_debugctlmsr(0); 258 current->bts = NULL;
258 ds_free(current->thread.ds_ctx); 259
260 kfree(current->bts_buffer);
261 current->bts_buffer = NULL;
262 current->bts_size = 0;
259 } 263 }
260#endif /* CONFIG_X86_DS */ 264#endif /* CONFIG_X86_DS */
261} 265}
@@ -419,48 +423,19 @@ int set_tsc_mode(unsigned int val)
419 return 0; 423 return 0;
420} 424}
421 425
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
451static noinline void 426static noinline void
452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 427__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
453 struct tss_struct *tss) 428 struct tss_struct *tss)
454{ 429{
455 struct thread_struct *prev, *next; 430 struct thread_struct *prev, *next;
456 unsigned long debugctl;
457 431
458 prev = &prev_p->thread; 432 prev = &prev_p->thread;
459 next = &next_p->thread; 433 next = &next_p->thread;
460 434
461 debugctl = update_debugctl(prev, next, prev->debugctlmsr); 435 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
462 436 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
463 if (next->debugctlmsr != debugctl) 437 ds_switch_to(prev_p, next_p);
438 else if (next->debugctlmsr != prev->debugctlmsr)
464 update_debugctlmsr(next->debugctlmsr); 439 update_debugctlmsr(next->debugctlmsr);
465 440
466 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 441 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -482,15 +457,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
482 hard_enable_TSC(); 457 hard_enable_TSC();
483 } 458 }
484 459
485#ifdef CONFIG_X86_PTRACE_BTS
486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
488
489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
491#endif /* CONFIG_X86_PTRACE_BTS */
492
493
494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 460 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
495 /* 461 /*
496 * Disable the bitmap via an invalid offset. We still cache 462 * Disable the bitmap via an invalid offset. We still cache
@@ -548,7 +514,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
548 * the task-switch, and shows up in ret_from_fork in entry.S, 514 * the task-switch, and shows up in ret_from_fork in entry.S,
549 * for example. 515 * for example.
550 */ 516 */
551struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 517__notrace_funcgraph struct task_struct *
518__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552{ 519{
553 struct thread_struct *prev = &prev_p->thread, 520 struct thread_struct *prev = &prev_p->thread,
554 *next = &next_p->thread; 521 *next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b6..1cfd2a4bf853 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h> 40#include <linux/uaccess.h>
41#include <linux/io.h> 41#include <linux/io.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include <asm/system.h> 45#include <asm/system.h>
@@ -236,11 +237,14 @@ void exit_thread(void)
236 put_cpu(); 237 put_cpu();
237 } 238 }
238#ifdef CONFIG_X86_DS 239#ifdef CONFIG_X86_DS
239 /* Free any DS contexts that have not been properly released. */ 240 /* Free any BTS tracers that have not been properly released. */
240 if (unlikely(t->ds_ctx)) { 241 if (unlikely(current->bts)) {
241 /* we clear debugctl to make sure DS is not used. */ 242 ds_release_bts(current->bts);
242 update_debugctlmsr(0); 243 current->bts = NULL;
243 ds_free(t->ds_ctx); 244
245 kfree(current->bts_buffer);
246 current->bts_buffer = NULL;
247 current->bts_size = 0;
244 } 248 }
245#endif /* CONFIG_X86_DS */ 249#endif /* CONFIG_X86_DS */
246} 250}
@@ -470,35 +474,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
470 struct tss_struct *tss) 474 struct tss_struct *tss)
471{ 475{
472 struct thread_struct *prev, *next; 476 struct thread_struct *prev, *next;
473 unsigned long debugctl;
474 477
475 prev = &prev_p->thread, 478 prev = &prev_p->thread,
476 next = &next_p->thread; 479 next = &next_p->thread;
477 480
478 debugctl = prev->debugctlmsr; 481 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
479 482 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
480#ifdef CONFIG_X86_DS 483 ds_switch_to(prev_p, next_p);
481 { 484 else if (next->debugctlmsr != prev->debugctlmsr)
482 unsigned long ds_prev = 0, ds_next = 0;
483
484 if (prev->ds_ctx)
485 ds_prev = (unsigned long)prev->ds_ctx->ds;
486 if (next->ds_ctx)
487 ds_next = (unsigned long)next->ds_ctx->ds;
488
489 if (ds_next != ds_prev) {
490 /*
491 * We clear debugctl to make sure DS
492 * is not in use when we change it:
493 */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsrl(MSR_IA32_DS_AREA, ds_next);
497 }
498 }
499#endif /* CONFIG_X86_DS */
500
501 if (next->debugctlmsr != debugctl)
502 update_debugctlmsr(next->debugctlmsr); 485 update_debugctlmsr(next->debugctlmsr);
503 486
504 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 487 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -533,14 +516,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
533 */ 516 */
534 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 517 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
535 } 518 }
536
537#ifdef CONFIG_X86_PTRACE_BTS
538 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
539 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
540
541 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
542 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
543#endif /* CONFIG_X86_PTRACE_BTS */
544} 519}
545 520
546/* 521/*
@@ -551,8 +526,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
551 * - could test fs/gs bitsliced 526 * - could test fs/gs bitsliced
552 * 527 *
553 * Kprobes not supported here. Set the probe on schedule instead. 528 * Kprobes not supported here. Set the probe on schedule instead.
529 * Function graph tracer not supported too.
554 */ 530 */
555struct task_struct * 531__notrace_funcgraph struct task_struct *
556__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 532__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
557{ 533{
558 struct thread_struct *prev = &prev_p->thread; 534 struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10d..45e9855da2d2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -581,158 +581,73 @@ static int ioperm_get(struct task_struct *target,
581} 581}
582 582
583#ifdef CONFIG_X86_PTRACE_BTS 583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
640 base += (bts_cfg.sizeof_field * field);;
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index, 584static int ptrace_bts_read_record(struct task_struct *child, size_t index,
664 struct bts_struct __user *out) 585 struct bts_struct __user *out)
665{ 586{
666 struct bts_struct ret; 587 const struct bts_trace *trace;
667 const void *bts_record; 588 struct bts_struct bts;
668 size_t bts_index, bts_end; 589 const unsigned char *at;
669 int error; 590 int error;
670 591
671 error = ds_get_bts_end(child, &bts_end); 592 trace = ds_read_bts(child->bts);
672 if (error < 0) 593 if (!trace)
673 return error; 594 return -EPERM;
674 595
675 if (bts_end <= index) 596 at = trace->ds.top - ((index + 1) * trace->ds.size);
676 return -EINVAL; 597 if ((void *)at < trace->ds.begin)
677 598 at += (trace->ds.n * trace->ds.size);
678 error = ds_get_bts_index(child, &bts_index);
679 if (error < 0)
680 return error;
681 599
682 /* translate the ptrace bts index into the ds bts index */ 600 if (!trace->read)
683 bts_index += bts_end - (index + 1); 601 return -EOPNOTSUPP;
684 if (bts_end <= bts_index)
685 bts_index -= bts_end;
686 602
687 error = ds_access_bts(child, bts_index, &bts_record); 603 error = trace->read(child->bts, at, &bts);
688 if (error < 0) 604 if (error < 0)
689 return error; 605 return error;
690 606
691 ptrace_bts_translate_record(&ret, bts_record); 607 if (copy_to_user(out, &bts, sizeof(bts)))
692
693 if (copy_to_user(out, &ret, sizeof(ret)))
694 return -EFAULT; 608 return -EFAULT;
695 609
696 return sizeof(ret); 610 return sizeof(bts);
697} 611}
698 612
699static int ptrace_bts_drain(struct task_struct *child, 613static int ptrace_bts_drain(struct task_struct *child,
700 long size, 614 long size,
701 struct bts_struct __user *out) 615 struct bts_struct __user *out)
702{ 616{
703 struct bts_struct ret; 617 const struct bts_trace *trace;
704 const unsigned char *raw; 618 const unsigned char *at;
705 size_t end, i; 619 int error, drained = 0;
706 int error;
707 620
708 error = ds_get_bts_index(child, &end); 621 trace = ds_read_bts(child->bts);
709 if (error < 0) 622 if (!trace)
710 return error; 623 return -EPERM;
711 624
712 if (size < (end * sizeof(struct bts_struct))) 625 if (!trace->read)
626 return -EOPNOTSUPP;
627
628 if (size < (trace->ds.top - trace->ds.begin))
713 return -EIO; 629 return -EIO;
714 630
715 error = ds_access_bts(child, 0, (const void **)&raw); 631 for (at = trace->ds.begin; (void *)at < trace->ds.top;
716 if (error < 0) 632 out++, drained++, at += trace->ds.size) {
717 return error; 633 struct bts_struct bts;
634 int error;
718 635
719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { 636 error = trace->read(child->bts, at, &bts);
720 ptrace_bts_translate_record(&ret, raw); 637 if (error < 0)
638 return error;
721 639
722 if (copy_to_user(out, &ret, sizeof(ret))) 640 if (copy_to_user(out, &bts, sizeof(bts)))
723 return -EFAULT; 641 return -EFAULT;
724 } 642 }
725 643
726 error = ds_clear_bts(child); 644 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
645
646 error = ds_reset_bts(child->bts);
727 if (error < 0) 647 if (error < 0)
728 return error; 648 return error;
729 649
730 return end; 650 return drained;
731}
732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736} 651}
737 652
738static int ptrace_bts_config(struct task_struct *child, 653static int ptrace_bts_config(struct task_struct *child,
@@ -740,114 +655,89 @@ static int ptrace_bts_config(struct task_struct *child,
740 const struct ptrace_bts_config __user *ucfg) 655 const struct ptrace_bts_config __user *ucfg)
741{ 656{
742 struct ptrace_bts_config cfg; 657 struct ptrace_bts_config cfg;
743 int error = 0; 658 unsigned int flags = 0;
744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748 659
749 error = -EIO;
750 if (cfg_size < sizeof(cfg)) 660 if (cfg_size < sizeof(cfg))
751 goto errout; 661 return -EIO;
752 662
753 error = -EFAULT;
754 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 663 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
755 goto errout; 664 return -EFAULT;
756
757 error = -EINVAL;
758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
759 !(cfg.flags & PTRACE_BTS_O_ALLOC))
760 goto errout;
761 665
762 if (cfg.flags & PTRACE_BTS_O_ALLOC) { 666 if (child->bts) {
763 ds_ovfl_callback_t ovfl = NULL; 667 ds_release_bts(child->bts);
764 unsigned int sig = 0; 668 child->bts = NULL;
669 }
765 670
766 /* we ignore the error in case we were not tracing child */ 671 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
767 (void)ds_release_bts(child); 672 if (!cfg.signal)
673 return -EINVAL;
768 674
769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 675 return -EOPNOTSUPP;
770 if (!cfg.signal)
771 goto errout;
772 676
773 sig = cfg.signal; 677 child->thread.bts_ovfl_signal = cfg.signal;
774 ovfl = ptrace_bts_ovfl; 678 }
775 }
776 679
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); 680 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
778 if (error < 0) 681 (cfg.size != child->bts_size)) {
779 goto errout; 682 kfree(child->bts_buffer);
780 683
781 child->thread.bts_ovfl_signal = sig; 684 child->bts_size = cfg.size;
685 child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
686 if (!child->bts_buffer) {
687 child->bts_size = 0;
688 return -ENOMEM;
689 }
782 } 690 }
783 691
784 error = -EINVAL;
785 if (!child->thread.ds_ctx && cfg.flags)
786 goto errout;
787
788 if (cfg.flags & PTRACE_BTS_O_TRACE) 692 if (cfg.flags & PTRACE_BTS_O_TRACE)
789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask; 693 flags |= BTS_USER;
790 else
791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
792 694
793 if (cfg.flags & PTRACE_BTS_O_SCHED) 695 if (cfg.flags & PTRACE_BTS_O_SCHED)
794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 696 flags |= BTS_TIMESTAMPS;
795 else
796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
797 697
798 error = sizeof(cfg); 698 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
699 /* ovfl = */ NULL, /* th = */ (size_t)-1,
700 flags);
701 if (IS_ERR(child->bts)) {
702 int error = PTR_ERR(child->bts);
799 703
800out: 704 kfree(child->bts_buffer);
801 if (child->thread.debugctlmsr) 705 child->bts = NULL;
802 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 706 child->bts_buffer = NULL;
803 else 707 child->bts_size = 0;
804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
805 708
806 return error; 709 return error;
710 }
807 711
808errout: 712 return sizeof(cfg);
809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
811 goto out;
812} 713}
813 714
814static int ptrace_bts_status(struct task_struct *child, 715static int ptrace_bts_status(struct task_struct *child,
815 long cfg_size, 716 long cfg_size,
816 struct ptrace_bts_config __user *ucfg) 717 struct ptrace_bts_config __user *ucfg)
817{ 718{
719 const struct bts_trace *trace;
818 struct ptrace_bts_config cfg; 720 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
822 721
823 if (cfg_size < sizeof(cfg)) 722 if (cfg_size < sizeof(cfg))
824 return -EIO; 723 return -EIO;
825 724
826 error = ds_get_bts_end(child, &end); 725 trace = ds_read_bts(child->bts);
827 if (error < 0) 726 if (!trace)
828 return error; 727 return -EPERM;
829
830 error = ds_access_bts(child, /* index = */ 0, &base);
831 if (error < 0)
832 return error;
833
834 error = ds_access_bts(child, /* index = */ end, &max);
835 if (error < 0)
836 return error;
837 728
838 memset(&cfg, 0, sizeof(cfg)); 729 memset(&cfg, 0, sizeof(cfg));
839 cfg.size = (max - base); 730 cfg.size = trace->ds.end - trace->ds.begin;
840 cfg.signal = child->thread.bts_ovfl_signal; 731 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct); 732 cfg.bts_size = sizeof(struct bts_struct);
842 733
843 if (cfg.signal) 734 if (cfg.signal)
844 cfg.flags |= PTRACE_BTS_O_SIGNAL; 735 cfg.flags |= PTRACE_BTS_O_SIGNAL;
845 736
846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 737 if (trace->ds.flags & BTS_USER)
847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
848 cfg.flags |= PTRACE_BTS_O_TRACE; 738 cfg.flags |= PTRACE_BTS_O_TRACE;
849 739
850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 740 if (trace->ds.flags & BTS_TIMESTAMPS)
851 cfg.flags |= PTRACE_BTS_O_SCHED; 741 cfg.flags |= PTRACE_BTS_O_SCHED;
852 742
853 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 743 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -856,108 +746,28 @@ static int ptrace_bts_status(struct task_struct *child,
856 return sizeof(cfg); 746 return sizeof(cfg);
857} 747}
858 748
859static int ptrace_bts_write_record(struct task_struct *child, 749static int ptrace_bts_clear(struct task_struct *child)
860 const struct bts_struct *in)
861{ 750{
862 unsigned char bts_record[BTS_MAX_RECORD_SIZE]; 751 const struct bts_trace *trace;
863 752
864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); 753 trace = ds_read_bts(child->bts);
754 if (!trace)
755 return -EPERM;
865 756
866 memset(bts_record, 0, bts_cfg.sizeof_bts); 757 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
867 switch (in->qualifier) {
868 case BTS_INVALID:
869 break;
870
871 case BTS_BRANCH:
872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
875
876 case BTS_TASK_ARRIVES:
877 case BTS_TASK_DEPARTS:
878 bts_set(bts_record, bts_from, bts_escape);
879 bts_set(bts_record, bts_qual, in->qualifier);
880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
882
883 default:
884 return -EINVAL;
885 }
886 758
887 /* The writing task will be the switched-to task on a context 759 return ds_reset_bts(child->bts);
888 * switch. It needs to write into the switched-from task's BTS
889 * buffer. */
890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
891} 760}
892 761
893void ptrace_bts_take_timestamp(struct task_struct *tsk, 762static int ptrace_bts_size(struct task_struct *child)
894 enum bts_qualifier qualifier)
895{ 763{
896 struct bts_struct rec = { 764 const struct bts_trace *trace;
897 .qualifier = qualifier,
898 .variant.jiffies = jiffies_64
899 };
900
901 ptrace_bts_write_record(tsk, &rec);
902}
903 765
904static const struct bts_configuration bts_cfg_netburst = { 766 trace = ds_read_bts(child->bts);
905 .sizeof_bts = sizeof(long) * 3, 767 if (!trace)
906 .sizeof_field = sizeof(long), 768 return -EPERM;
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
908};
909 769
910static const struct bts_configuration bts_cfg_pentium_m = { 770 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
911 .sizeof_bts = sizeof(long) * 3,
912 .sizeof_field = sizeof(long),
913 .debugctl_mask = (1<<6)|(1<<7)
914};
915
916static const struct bts_configuration bts_cfg_core2 = {
917 .sizeof_bts = 8 * 3,
918 .sizeof_field = 8,
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
920};
921
922static inline void bts_configure(const struct bts_configuration *cfg)
923{
924 bts_cfg = *cfg;
925}
926
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
928{
929 switch (c->x86) {
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961} 771}
962#endif /* CONFIG_X86_PTRACE_BTS */ 772#endif /* CONFIG_X86_PTRACE_BTS */
963 773
@@ -973,13 +783,14 @@ void ptrace_disable(struct task_struct *child)
973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 783 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
974#endif 784#endif
975#ifdef CONFIG_X86_PTRACE_BTS 785#ifdef CONFIG_X86_PTRACE_BTS
976 (void)ds_release_bts(child); 786 if (child->bts) {
977 787 ds_release_bts(child->bts);
978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; 788 child->bts = NULL;
979 if (!child->thread.debugctlmsr)
980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
981 789
982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 790 kfree(child->bts_buffer);
791 child->bts_buffer = NULL;
792 child->bts_size = 0;
793 }
983#endif /* CONFIG_X86_PTRACE_BTS */ 794#endif /* CONFIG_X86_PTRACE_BTS */
984} 795}
985 796
@@ -1112,7 +923,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1112 break; 923 break;
1113 924
1114 case PTRACE_BTS_SIZE: 925 case PTRACE_BTS_SIZE:
1115 ret = ds_get_bts_index(child, /* pos = */ NULL); 926 ret = ptrace_bts_size(child);
1116 break; 927 break;
1117 928
1118 case PTRACE_BTS_GET: 929 case PTRACE_BTS_GET:
@@ -1121,7 +932,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1121 break; 932 break;
1122 933
1123 case PTRACE_BTS_CLEAR: 934 case PTRACE_BTS_CLEAR:
1124 ret = ds_clear_bts(child); 935 ret = ptrace_bts_clear(child);
1125 break; 936 break;
1126 937
1127 case PTRACE_BTS_DRAIN: 938 case PTRACE_BTS_DRAIN:
@@ -1384,6 +1195,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1384 1195
1385 case PTRACE_GET_THREAD_AREA: 1196 case PTRACE_GET_THREAD_AREA:
1386 case PTRACE_SET_THREAD_AREA: 1197 case PTRACE_SET_THREAD_AREA:
1198#ifdef CONFIG_X86_PTRACE_BTS
1199 case PTRACE_BTS_CONFIG:
1200 case PTRACE_BTS_STATUS:
1201 case PTRACE_BTS_SIZE:
1202 case PTRACE_BTS_GET:
1203 case PTRACE_BTS_CLEAR:
1204 case PTRACE_BTS_DRAIN:
1205#endif /* CONFIG_X86_PTRACE_BTS */
1387 return arch_ptrace(child, request, addr, data); 1206 return arch_ptrace(child, request, addr, data);
1388 1207
1389 default: 1208 default:
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..10786af95545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h>
9#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
10 11
11static void save_stack_warning(void *data, char *msg) 12static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
83 trace->entries[trace->nr_entries++] = ULONG_MAX; 84 trace->entries[trace->nr_entries++] = ULONG_MAX;
84} 85}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 86EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
87
88/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
89
90struct stack_frame {
91 const void __user *next_fp;
92 unsigned long ret_addr;
93};
94
95static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
96{
97 int ret;
98
99 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
100 return 0;
101
102 ret = 1;
103 pagefault_disable();
104 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
105 ret = 0;
106 pagefault_enable();
107
108 return ret;
109}
110
111static inline void __save_stack_trace_user(struct stack_trace *trace)
112{
113 const struct pt_regs *regs = task_pt_regs(current);
114 const void __user *fp = (const void __user *)regs->bp;
115
116 if (trace->nr_entries < trace->max_entries)
117 trace->entries[trace->nr_entries++] = regs->ip;
118
119 while (trace->nr_entries < trace->max_entries) {
120 struct stack_frame frame;
121
122 frame.next_fp = NULL;
123 frame.ret_addr = 0;
124 if (!copy_stack_frame(fp, &frame))
125 break;
126 if ((unsigned long)fp < regs->sp)
127 break;
128 if (frame.ret_addr) {
129 trace->entries[trace->nr_entries++] =
130 frame.ret_addr;
131 }
132 if (fp == frame.next_fp)
133 break;
134 fp = frame.next_fp;
135 }
136}
137
138void save_stack_trace_user(struct stack_trace *trace)
139{
140 /*
141 * Trace user stack if we are not a kernel thread
142 */
143 if (current->mm) {
144 __save_stack_trace_user(trace);
145 }
146 if (trace->nr_entries < trace->max_entries)
147 trace->entries[trace->nr_entries++] = ULONG_MAX;
148}
149
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc2..82c67559dde7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
44 SCHED_TEXT 44 SCHED_TEXT
45 LOCK_TEXT 45 LOCK_TEXT
46 KPROBES_TEXT 46 KPROBES_TEXT
47 IRQENTRY_TEXT
47 *(.fixup) 48 *(.fixup)
48 *(.gnu.warning) 49 *(.gnu.warning)
49 _etext = .; /* End of text section */ 50 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405b..1a614c0e6bef 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -35,6 +35,7 @@ SECTIONS
35 SCHED_TEXT 35 SCHED_TEXT
36 LOCK_TEXT 36 LOCK_TEXT
37 KPROBES_TEXT 37 KPROBES_TEXT
38 IRQENTRY_TEXT
38 *(.fixup) 39 *(.fixup)
39 *(.gnu.warning) 40 *(.gnu.warning)
40 _etext = .; /* End of text section */ 41 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86d..6f3d3d4cd973 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 17 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
18 */ 18 */
19 19
20/* Disable profiling for userspace code: */
21#define DISABLE_BRANCH_PROFILING
22
20#include <linux/time.h> 23#include <linux/time.h>
21#include <linux/init.h> 24#include <linux/init.h>
22#include <linux/kernel.h> 25#include <linux/kernel.h>
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fea4565ff576..d8cc96a2738f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 11obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o 12mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 13obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15 14
16obj-$(CONFIG_NUMA) += numa_$(BITS).o 15obj-$(CONFIG_NUMA) += numa_$(BITS).o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730fa246..21e996a70d68 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -53,7 +53,7 @@
53 53
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{ 55{
56#ifdef CONFIG_MMIOTRACE_HOOKS 56#ifdef CONFIG_MMIOTRACE
57 if (unlikely(is_kmmio_active())) 57 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1) 58 if (kmmio_handler(regs, addr) == 1)
59 return -1; 59 return -1;
@@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
413 unsigned long error_code) 413 unsigned long error_code)
414{ 414{
415 unsigned long flags = oops_begin(); 415 unsigned long flags = oops_begin();
416 int sig = SIGKILL;
416 struct task_struct *tsk; 417 struct task_struct *tsk;
417 418
418 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
423 tsk->thread.trap_no = 14; 424 tsk->thread.trap_no = 14;
424 tsk->thread.error_code = error_code; 425 tsk->thread.error_code = error_code;
425 if (__die("Bad pagetable", regs, error_code)) 426 if (__die("Bad pagetable", regs, error_code))
426 regs = NULL; 427 sig = 0;
427 oops_end(flags, regs, SIGKILL); 428 oops_end(flags, regs, sig);
428} 429}
429#endif 430#endif
430 431
@@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
590 int fault; 591 int fault;
591#ifdef CONFIG_X86_64 592#ifdef CONFIG_X86_64
592 unsigned long flags; 593 unsigned long flags;
594 int sig;
593#endif 595#endif
594 596
595 tsk = current; 597 tsk = current;
@@ -849,11 +851,12 @@ no_context:
849 bust_spinlocks(0); 851 bust_spinlocks(0);
850 do_exit(SIGKILL); 852 do_exit(SIGKILL);
851#else 853#else
854 sig = SIGKILL;
852 if (__die("Oops", regs, error_code)) 855 if (__die("Oops", regs, error_code))
853 regs = NULL; 856 sig = 0;
854 /* Executive summary in case the body of the oops scrolled away */ 857 /* Executive summary in case the body of the oops scrolled away */
855 printk(KERN_EMERG "CR2: %016lx\n", address); 858 printk(KERN_EMERG "CR2: %016lx\n", address);
856 oops_end(flags, regs, SIGKILL); 859 oops_end(flags, regs, sig);
857#endif 860#endif
858 861
859/* 862/*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d6..d9d35824c56f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
9 * Also alternative() doesn't work. 9 * Also alternative() doesn't work.
10 */ 10 */
11 11
12/* Disable profiling for userspace code: */
13#define DISABLE_BRANCH_PROFILING
14
12#include <linux/kernel.h> 15#include <linux/kernel.h>
13#include <linux/posix-timers.h> 16#include <linux/posix-timers.h>
14#include <linux/time.h> 17#include <linux/time.h>