-rw-r--r--  CREDITS | 2
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  arch/alpha/include/asm/perf_event.h | 6
-rw-r--r--  arch/alpha/kernel/irq_alpha.c | 2
-rw-r--r--  arch/alpha/kernel/perf_event.c | 9
-rw-r--r--  arch/arm/kernel/perf_event.c | 2
-rw-r--r--  arch/mips/kernel/perf_event_mipsxx.c | 2
-rw-r--r--  arch/powerpc/kernel/e500-pmu.c | 2
-rw-r--r--  arch/powerpc/kernel/mpc7450-pmu.c | 2
-rw-r--r--  arch/powerpc/kernel/power4-pmu.c | 2
-rw-r--r--  arch/powerpc/kernel/power5+-pmu.c | 2
-rw-r--r--  arch/powerpc/kernel/power5-pmu.c | 2
-rw-r--r--  arch/powerpc/kernel/power6-pmu.c | 2
-rw-r--r--  arch/powerpc/kernel/power7-pmu.c | 2
-rw-r--r--  arch/powerpc/kernel/ppc970-pmu.c | 2
-rw-r--r--  arch/sh/kernel/cpu/sh4/perf_event.c | 2
-rw-r--r--  arch/sh/kernel/cpu/sh4a/perf_event.c | 2
-rw-r--r--  arch/sparc/include/asm/perf_event.h | 4
-rw-r--r--  arch/sparc/kernel/nmi.c | 2
-rw-r--r--  arch/sparc/kernel/perf_event.c | 7
-rw-r--r--  arch/x86/include/asm/alternative.h | 7
-rw-r--r--  arch/x86/include/asm/kdebug.h | 2
-rw-r--r--  arch/x86/include/asm/nmi.h | 51
-rw-r--r--  arch/x86/include/asm/perf_event.h | 2
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 63
-rw-r--r--  arch/x86/include/asm/smpboot_hooks.h | 1
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 33
-rw-r--r--  arch/x86/include/asm/timer.h | 6
-rw-r--r--  arch/x86/kernel/alternative.c | 49
-rw-r--r--  arch/x86/kernel/apic/Makefile | 5
-rw-r--r--  arch/x86/kernel/apic/apic.c | 15
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 16
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 46
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 567
-rw-r--r--  arch/x86/kernel/cpu/common.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 24
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 642
-rw-r--r--  arch/x86/kernel/dumpstack.c | 12
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 25
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 24
-rw-r--r--  arch/x86/kernel/kprobes.c | 113
-rw-r--r--  arch/x86/kernel/process.c | 3
-rw-r--r--  arch/x86/kernel/smpboot.c | 22
-rw-r--r--  arch/x86/kernel/stacktrace.c | 8
-rw-r--r--  arch/x86/kernel/time.c | 18
-rw-r--r--  arch/x86/kernel/traps.c | 11
-rw-r--r--  arch/x86/mm/kmemcheck/error.c | 2
-rw-r--r--  arch/x86/oprofile/backtrace.c | 2
-rw-r--r--  arch/x86/oprofile/nmi_timer_int.c | 3
-rw-r--r--  drivers/acpi/acpica/nsinit.c | 2
-rw-r--r--  drivers/watchdog/hpwdt.c | 7
-rw-r--r--  include/linux/ftrace_event.h | 10
-rw-r--r--  include/linux/kprobes.h | 4
-rw-r--r--  include/linux/nmi.h | 8
-rw-r--r--  include/linux/perf_event.h | 7
-rw-r--r--  include/linux/sched.h | 4
-rw-r--r--  include/linux/stacktrace.h | 4
-rw-r--r--  include/linux/syscalls.h | 10
-rw-r--r--  include/linux/tracepoint.h | 4
-rw-r--r--  include/trace/events/syscalls.h | 4
-rw-r--r--  include/trace/ftrace.h | 7
-rw-r--r--  init/main.c | 1
-rw-r--r--  kernel/kprobes.c | 565
-rw-r--r--  kernel/perf_event.c | 200
-rw-r--r--  kernel/sysctl.c | 16
-rw-r--r--  kernel/trace/trace_event_perf.c | 31
-rw-r--r--  kernel/watchdog.c | 7
-rwxr-xr-x  scripts/tags.sh | 4
-rw-r--r--  tools/perf/Documentation/perf-annotate.txt | 37
-rw-r--r--  tools/perf/Documentation/perf-buildid-list.txt | 3
-rw-r--r--  tools/perf/Documentation/perf-diff.txt | 19
-rw-r--r--  tools/perf/Documentation/perf-kvm.txt | 8
-rw-r--r--  tools/perf/Documentation/perf-lock.txt | 15
-rw-r--r--  tools/perf/Documentation/perf-probe.txt | 2
-rw-r--r--  tools/perf/Documentation/perf-record.txt | 17
-rw-r--r--  tools/perf/Documentation/perf-report.txt | 49
-rw-r--r--  tools/perf/Documentation/perf-sched.txt | 18
-rw-r--r--  tools/perf/Documentation/perf-script-perl.txt (renamed from tools/perf/Documentation/perf-trace-perl.txt) | 28
-rw-r--r--  tools/perf/Documentation/perf-script-python.txt (renamed from tools/perf/Documentation/perf-trace-python.txt) | 88
-rw-r--r--  tools/perf/Documentation/perf-script.txt (renamed from tools/perf/Documentation/perf-trace.txt) | 61
-rw-r--r--  tools/perf/Documentation/perf-stat.txt | 44
-rw-r--r--  tools/perf/Documentation/perf-test.txt | 2
-rw-r--r--  tools/perf/Documentation/perf-top.txt | 28
-rw-r--r--  tools/perf/MANIFEST | 1
-rw-r--r--  tools/perf/Makefile | 17
-rw-r--r--  tools/perf/bench/mem-memcpy-arch.h | 12
-rw-r--r--  tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 4
-rw-r--r--  tools/perf/bench/mem-memcpy-x86-64-asm.S | 2
-rw-r--r--  tools/perf/bench/mem-memcpy.c | 219
-rw-r--r--  tools/perf/builtin-diff.c | 2
-rw-r--r--  tools/perf/builtin-lock.c | 6
-rw-r--r--  tools/perf/builtin-record.c | 16
-rw-r--r--  tools/perf/builtin-sched.c | 6
-rw-r--r--  tools/perf/builtin-script.c (renamed from tools/perf/builtin-trace.c) | 73
-rw-r--r--  tools/perf/builtin-stat.c | 325
-rw-r--r--  tools/perf/builtin-top.c | 6
-rw-r--r--  tools/perf/builtin.h | 2
-rw-r--r--  tools/perf/command-list.txt | 2
-rw-r--r--  tools/perf/feature-tests.mak | 4
-rw-r--r--  tools/perf/perf.c | 2
-rw-r--r--  tools/perf/scripts/python/Perf-Trace-Util/Context.c | 2
-rw-r--r--  tools/perf/util/debug.c | 42
-rw-r--r--  tools/perf/util/debug.h | 2
-rw-r--r--  tools/perf/util/event.c | 2
-rw-r--r--  tools/perf/util/header.c | 11
-rw-r--r--  tools/perf/util/header.h | 1
-rw-r--r--  tools/perf/util/include/asm/cpufeature.h | 9
-rw-r--r--  tools/perf/util/include/asm/dwarf2.h | 11
-rw-r--r--  tools/perf/util/include/linux/bitops.h | 5
-rw-r--r--  tools/perf/util/include/linux/linkage.h | 13
-rw-r--r--  tools/perf/util/parse-events.c | 12
-rw-r--r--  tools/perf/util/probe-finder.h | 6
-rw-r--r--  tools/perf/util/scripting-engines/trace-event-perl.c | 6
-rw-r--r--  tools/perf/util/scripting-engines/trace-event-python.c | 4
-rw-r--r--  tools/perf/util/session.c | 295
-rw-r--r--  tools/perf/util/session.h | 8
-rw-r--r--  tools/perf/util/symbol.c | 2
-rw-r--r--  tools/perf/util/ui/util.c | 16
118 files changed, 2008 insertions(+), 2283 deletions(-)
diff --git a/CREDITS b/CREDITS
index 41d8e63d5165..494b6e4746d7 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2365,8 +2365,6 @@ E: acme@redhat.com
 W: http://oops.ghostprotocols.net:81/blog/
 P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD 841A B6AB 4681 9224 DF01
 D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks
-S: R. Brasílio Itiberê, 4270/1010 - Água Verde
-S: 80240-060 - Curitiba - Paraná
 S: Brazil
 
 N: Karsten Merker
diff --git a/MAINTAINERS b/MAINTAINERS
index 1a1c27b9c557..ed192f11ad23 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4612,7 +4612,7 @@ PERFORMANCE EVENTS SUBSYSTEM
 M:	Peter Zijlstra <a.p.zijlstra@chello.nl>
 M:	Paul Mackerras <paulus@samba.org>
 M:	Ingo Molnar <mingo@elte.hu>
-M:	Arnaldo Carvalho de Melo <acme@redhat.com>
+M:	Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
 S:	Supported
 F:	kernel/perf_event*.c
 F:	include/linux/perf_event.h
diff --git a/arch/alpha/include/asm/perf_event.h b/arch/alpha/include/asm/perf_event.h
index fe792ca818f6..5996e7a6757e 100644
--- a/arch/alpha/include/asm/perf_event.h
+++ b/arch/alpha/include/asm/perf_event.h
@@ -1,10 +1,4 @@
 #ifndef __ASM_ALPHA_PERF_EVENT_H
 #define __ASM_ALPHA_PERF_EVENT_H
 
-#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
-#else
-static inline void init_hw_perf_events(void)	{ }
-#endif
-
 #endif /* __ASM_ALPHA_PERF_EVENT_H */
diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c
index 5f77afb88e89..4c8bb374eb0a 100644
--- a/arch/alpha/kernel/irq_alpha.c
+++ b/arch/alpha/kernel/irq_alpha.c
@@ -112,8 +112,6 @@ init_IRQ(void)
 	wrent(entInt, 0);
 
 	alpha_mv.init_irq();
-
-	init_hw_perf_events();
 }
 
 /*
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 1cc49683fb69..3283059b6e85 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/kdebug.h>
 #include <linux/mutex.h>
+#include <linux/init.h>
 
 #include <asm/hwrpb.h>
 #include <asm/atomic.h>
@@ -863,13 +864,13 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 /*
  * Init call to initialise performance events at kernel startup.
  */
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
 {
 	pr_info("Performance events: ");
 
 	if (!supported_cpu()) {
 		pr_cont("No support for your CPU.\n");
-		return;
+		return 0;
 	}
 
 	pr_cont("Supported CPU type!\n");
@@ -882,5 +883,7 @@ void __init init_hw_perf_events(void)
 	alpha_pmu = &ev67_pmu;
 
 	perf_pmu_register(&pmu);
-}
 
+	return 0;
+}
+early_initcall(init_hw_perf_events);
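The alpha hunks above show the conversion pattern this series applies on every architecture: the PMU initialiser stops being an extern hook called explicitly from arch init code and registers itself as an early_initcall, which requires it to return int. A minimal sketch of the resulting shape, condensed from the hunks above (not a literal copy of any one file):

	static int __init init_hw_perf_events(void)
	{
		/* Initcalls must return int; returning 0 even when no
		 * PMU is present lets boot continue with software events.
		 */
		if (!supported_cpu())
			return 0;

		perf_pmu_register(&pmu);
		return 0;
	}
	early_initcall(init_hw_perf_events);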
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 07a50357492a..d45f70e5f2ee 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -3038,7 +3038,7 @@ init_hw_perf_events(void)
 
 	return 0;
 }
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
 
 /*
  * Callchain handling code.
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 5c7c6fc07565..183e0d226669 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -1047,6 +1047,6 @@ init_hw_perf_events(void)
 
 	return 0;
 }
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
 
 #endif /* defined(CONFIG_CPU_MIPS32)... */
diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c
index 7c07de0d8943..b150b510510f 100644
--- a/arch/powerpc/kernel/e500-pmu.c
+++ b/arch/powerpc/kernel/e500-pmu.c
@@ -126,4 +126,4 @@ static int init_e500_pmu(void)
 	return register_fsl_emb_pmu(&e500_pmu);
 }
 
-arch_initcall(init_e500_pmu);
+early_initcall(init_e500_pmu);
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c
index 09d72028f317..2cc5e0301d0b 100644
--- a/arch/powerpc/kernel/mpc7450-pmu.c
+++ b/arch/powerpc/kernel/mpc7450-pmu.c
@@ -414,4 +414,4 @@ static int init_mpc7450_pmu(void)
 	return register_power_pmu(&mpc7450_pmu);
 }
 
-arch_initcall(init_mpc7450_pmu);
+early_initcall(init_mpc7450_pmu);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 2a361cdda635..ead8b3c2649e 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -613,4 +613,4 @@ static int init_power4_pmu(void)
 	return register_power_pmu(&power4_pmu);
 }
 
-arch_initcall(init_power4_pmu);
+early_initcall(init_power4_pmu);
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 199de527d411..eca0ac595cb6 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -682,4 +682,4 @@ static int init_power5p_pmu(void)
 	return register_power_pmu(&power5p_pmu);
 }
 
-arch_initcall(init_power5p_pmu);
+early_initcall(init_power5p_pmu);
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 98b6a729a9dd..d5ff0f64a5e6 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -621,4 +621,4 @@ static int init_power5_pmu(void)
 	return register_power_pmu(&power5_pmu);
 }
 
-arch_initcall(init_power5_pmu);
+early_initcall(init_power5_pmu);
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 84a607bda8fb..31603927e376 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -544,4 +544,4 @@ static int init_power6_pmu(void)
 	return register_power_pmu(&power6_pmu);
 }
 
-arch_initcall(init_power6_pmu);
+early_initcall(init_power6_pmu);
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 852f7b7f6b40..593740fcb799 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -369,4 +369,4 @@ static int init_power7_pmu(void)
 	return register_power_pmu(&power7_pmu);
 }
 
-arch_initcall(init_power7_pmu);
+early_initcall(init_power7_pmu);
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 3fee685de4df..9a6e093858fe 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -494,4 +494,4 @@ static int init_ppc970_pmu(void)
 	return register_power_pmu(&ppc970_pmu);
 }
 
-arch_initcall(init_ppc970_pmu);
+early_initcall(init_ppc970_pmu);
diff --git a/arch/sh/kernel/cpu/sh4/perf_event.c b/arch/sh/kernel/cpu/sh4/perf_event.c
index dbf3b4bb71fe..748955df018d 100644
--- a/arch/sh/kernel/cpu/sh4/perf_event.c
+++ b/arch/sh/kernel/cpu/sh4/perf_event.c
@@ -250,4 +250,4 @@ static int __init sh7750_pmu_init(void)
 
 	return register_sh_pmu(&sh7750_pmu);
 }
-arch_initcall(sh7750_pmu_init);
+early_initcall(sh7750_pmu_init);
diff --git a/arch/sh/kernel/cpu/sh4a/perf_event.c b/arch/sh/kernel/cpu/sh4a/perf_event.c
index 580276525731..17e6bebfede0 100644
--- a/arch/sh/kernel/cpu/sh4a/perf_event.c
+++ b/arch/sh/kernel/cpu/sh4a/perf_event.c
@@ -284,4 +284,4 @@ static int __init sh4a_pmu_init(void)
 
 	return register_sh_pmu(&sh4a_pmu);
 }
-arch_initcall(sh4a_pmu_init);
+early_initcall(sh4a_pmu_init);
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h
index 6e8bfa1786da..4d3dbe3703e9 100644
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -4,8 +4,6 @@
 #ifdef CONFIG_PERF_EVENTS
 #include <asm/ptrace.h>
 
-extern void init_hw_perf_events(void);
-
 #define perf_arch_fetch_caller_regs(regs, ip)		\
 do {							\
 	unsigned long _pstate, _asi, _pil, _i7, _fp;	\
@@ -26,8 +24,6 @@ do { \
 	(regs)->u_regs[UREG_I6] = _fp;			\
 	(regs)->u_regs[UREG_I7] = _i7;			\
 } while (0)
-#else
-static inline void init_hw_perf_events(void)	{ }
 #endif
 
 #endif
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a4bd7ba74c89..300f810142f5 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -270,8 +270,6 @@ int __init nmi_init(void)
 			atomic_set(&nmi_active, -1);
 		}
 	}
-	if (!err)
-		init_hw_perf_events();
 
 	return err;
 }
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 0d6deb55a2ae..75c5b1263970 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1307,20 +1307,23 @@ static bool __init supported_pmu(void)
 	return false;
 }
 
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
 {
 	pr_info("Performance events: ");
 
 	if (!supported_pmu()) {
 		pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
-		return;
+		return 0;
 	}
 
 	pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
 
 	perf_pmu_register(&pmu);
 	register_die_notifier(&perf_event_nmi_notifier);
+
+	return 0;
 }
+early_initcall(init_hw_perf_events);
 
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 76561d20ea2f..4a2adaa9aefc 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -180,8 +180,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
  * On the local CPU you need to be protected against NMI or MCE handlers seeing an
  * inconsistent instruction while you patch.
  */
+struct text_poke_param {
+	void *addr;
+	const void *opcode;
+	size_t len;
+};
+
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
 
 #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
 #define IDEAL_NOP_SIZE_5 5
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 5bdfca86581b..f23eb2528464 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -28,7 +28,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_registers(struct pt_regs *regs);
 extern void show_trace(struct task_struct *t, struct pt_regs *regs,
-		       unsigned long *sp, unsigned long bp);
+		       unsigned long *sp);
 extern void __show_regs(struct pt_regs *regs, int all);
 extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 932f0f86b4b7..3545838cddeb 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -7,39 +7,13 @@
 
 #ifdef ARCH_HAS_NMI_WATCHDOG
 
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it.  Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
-
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
 extern void release_perfctr_nmi(unsigned int);
 extern int reserve_evntsel_nmi(unsigned int);
 extern void release_evntsel_nmi(unsigned int);
 
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE	0
-#define NMI_IO_APIC	1
-#define NMI_LOCAL_APIC	2
-#define NMI_INVALID	3
-
 struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
@@ -47,33 +21,8 @@ extern int unknown_nmi_panic;
 
 void arch_trigger_all_cpu_backtrace(void);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
-
-static inline void localise_nmi_watchdog(void)
-{
-	if (nmi_watchdog == NMI_IO_APIC)
-		nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
-	/*
-	 * actually it should be:
-	 *	return (nmi_watchdog == NMI_LOCAL_APIC ||
-	 *		nmi_watchdog == NMI_IO_APIC)
-	 * but since they are power of two we could use a
-	 * cheaper way --cvg
-	 */
-	return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
 #endif
 
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
 void stop_nmi(void);
 void restart_nmi(void);
 
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 550e26b1dbb3..d9d4dae305f6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -125,7 +125,6 @@ union cpuid10_edx {
 #define IBS_OP_MAX_CNT_EXT	0x007FFFFFULL	/* not a register bit mask */
 
 #ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
 extern void perf_events_lapic_init(void);
 
 #define PERF_EVENT_INDEX_OFFSET			0
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 }
 
 #else
-static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
 #endif
 
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index a70cd216be5d..295e2ff18a6a 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -744,14 +744,6 @@ enum P4_ESCR_EMASKS {
 };
 
 /*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- *   0-6: metric from P4_PEBS_METRIC enum
- *    7 : reserved
- *    8 : reserved
- *  9-11 : reserved
- *
  * Note we have UOP and PEBS bits reserved for now
- * just in case if we will need them once
+ * just in case we will need them later
  */
@@ -788,5 +780,60 @@ enum P4_PEBS_METRIC {
 	P4_PEBS_METRIC__max
 };
 
+/*
+ * Notes on the internal configuration of ESCR+CCCR tuples
+ *
+ * Since the P4 has quite a different performance-register
+ * architecture compared with the "architectural" one, and we
+ * have only 64 bits to keep a performance event's configuration,
+ * the following trick is used.
+ *
+ * 1) Since both the ESCR and CCCR registers only have their low
+ *    32 bits valid, we pack them into a single 64 bit config.
+ *    The low 32 bits of such a config correspond to the low
+ *    32 bits of the CCCR register and the high 32 bits
+ *    correspond to the low 32 bits of the ESCR register.
+ *
+ * 2) The meaning of every bit of such a config field can
+ *    be found in the Intel SDM, but it should be noted that
+ *    we "borrow" some reserved bits for our own usage and
+ *    clear them or set them to a proper value when we do
+ *    a real write to the hardware registers.
+ *
+ * 3) The format of the bits of the config is as follows, and
+ *    each field should be either 0 or set to some predefined
+ *    value:
+ *
+ *    Low 32 bits
+ *    -----------
+ *      0-6: metric from the P4_PEBS_METRIC enum
+ *     7-11: reserved
+ *       12: reserved (Enable)
+ *    13-15: reserved (ESCR select)
+ *    16-17: Active Thread
+ *       18: Compare
+ *       19: Complement
+ *    20-23: Threshold
+ *       24: Edge
+ *       25: reserved (FORCE_OVF)
+ *       26: reserved (OVF_PMI_T0)
+ *       27: reserved (OVF_PMI_T1)
+ *    28-29: reserved
+ *       30: reserved (Cascade)
+ *       31: reserved (OVF)
+ *
+ *    High 32 bits
+ *    ------------
+ *        0: reserved (T1_USR)
+ *        1: reserved (T1_OS)
+ *        2: reserved (T0_USR)
+ *        3: reserved (T0_OS)
+ *        4: Tag Enable
+ *      5-8: Tag Value
+ *     9-24: Event Mask (may use the P4_ESCR_EMASK_BIT helper)
+ *    25-30: enum P4_EVENTS
+ *       31: reserved (HT thread)
+ */
+
 #endif /* PERF_EVENT_P4_H */
 
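The layout note above is a pure bit-packing scheme. As an illustrative sketch (these particular helpers are not part of this hunk, though the real header provides p4_config_* accessors in this spirit), packing and unpacking an ESCR/CCCR pair per that layout could look like:

	/* Illustrative only: the low 32 bits of the config hold the CCCR
	 * bits and the high 32 bits hold the ESCR bits, as described above.
	 */
	static inline u64 p4_config_pack(u32 escr, u32 cccr)
	{
		return ((u64)escr << 32) | cccr;
	}

	static inline u32 p4_config_unpack_escr(u64 config)
	{
		return (u32)(config >> 32);
	}

	static inline u32 p4_config_unpack_cccr(u64 config)
	{
		return (u32)config;
	}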
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 1def60114906..6c22bf353f26 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
 		setup_IO_APIC();
 	else {
 		nr_ioapics = 0;
-		localise_nmi_watchdog();
 	}
 #endif
 }
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2b16a2ad23dc..52b5c7ed3608 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -7,6 +7,7 @@
 #define _ASM_X86_STACKTRACE_H
 
 #include <linux/uaccess.h>
+#include <linux/ptrace.h>
 
 extern int kstack_depth_to_print;
 
@@ -46,7 +47,7 @@ struct stacktrace_ops {
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp,
+		unsigned long *stack,
 		const struct stacktrace_ops *ops, void *data);
 
 #ifdef CONFIG_X86_32
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+	unsigned long bp;
+
+	if (regs)
+		return regs->bp;
+
+	if (task == current) {
+		/* Grab bp right from our regs */
+		get_bp(bp);
+		return bp;
+	}
+
+	/* bp is the last reg pushed by switch_to */
+	return *(unsigned long *)task->thread.sp;
+}
+#else
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+	return 0;
+}
+#endif
+
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *stack, unsigned long bp, char *log_lvl);
+		   unsigned long *stack, char *log_lvl);
 
 extern void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *sp, unsigned long bp, char *log_lvl);
+		   unsigned long *sp, char *log_lvl);
 
 extern unsigned int code_bytes;
 
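With bp dropped from the public signatures, callers no longer fetch a frame pointer themselves; dump_trace() and friends derive it internally via the new stack_frame() helper. A hedged before/after sketch of a caller (the function name and the my_ops structure are hypothetical, introduced only for illustration):

	/* Before this change the caller had to supply bp itself:
	 *
	 *	unsigned long bp;
	 *	get_bp(bp);
	 *	dump_trace(task, regs, stack, bp, &my_ops, data);
	 *
	 * After it, stack_frame(task, regs) is resolved inside dump_trace():
	 */
	static void show_my_trace(struct task_struct *task, struct pt_regs *regs,
				  unsigned long *stack, void *data)
	{
		dump_trace(task, regs, stack, &my_ops, data);
	}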
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 5469630b27f5..fa7b9176b76c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -10,12 +10,6 @@
 unsigned long long native_sched_clock(void);
 extern int recalibrate_cpu_khz(void);
 
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
 extern int no_timer_check;
 
 /* Accelerators for sched_clock()
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5079f24c955a..553d0b0d639b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -591,17 +591,21 @@ static atomic_t stop_machine_first;
 static int wrote_text;
 
 struct text_poke_params {
-	void *addr;
-	const void *opcode;
-	size_t len;
+	struct text_poke_param *params;
+	int nparams;
 };
 
 static int __kprobes stop_machine_text_poke(void *data)
 {
 	struct text_poke_params *tpp = data;
+	struct text_poke_param *p;
+	int i;
 
 	if (atomic_dec_and_test(&stop_machine_first)) {
-		text_poke(tpp->addr, tpp->opcode, tpp->len);
+		for (i = 0; i < tpp->nparams; i++) {
+			p = &tpp->params[i];
+			text_poke(p->addr, p->opcode, p->len);
+		}
 		smp_wmb();	/* Make sure other cpus see that this has run */
 		wrote_text = 1;
 	} else {
@@ -610,8 +614,12 @@ static int __kprobes stop_machine_text_poke(void *data)
 		smp_mb();	/* Load wrote_text before following execution */
 	}
 
-	flush_icache_range((unsigned long)tpp->addr,
-			   (unsigned long)tpp->addr + tpp->len);
+	for (i = 0; i < tpp->nparams; i++) {
+		p = &tpp->params[i];
+		flush_icache_range((unsigned long)p->addr,
+				   (unsigned long)p->addr + p->len);
+	}
+
 	return 0;
 }
 
@@ -631,10 +639,13 @@ static int __kprobes stop_machine_text_poke(void *data)
 void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
 {
 	struct text_poke_params tpp;
+	struct text_poke_param p;
 
-	tpp.addr = addr;
-	tpp.opcode = opcode;
-	tpp.len = len;
+	p.addr = addr;
+	p.opcode = opcode;
+	p.len = len;
+	tpp.params = &p;
+	tpp.nparams = 1;
 	atomic_set(&stop_machine_first, 1);
 	wrote_text = 0;
 	/* Use __stop_machine() because the caller already got online_cpus. */
@@ -642,6 +653,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
 	return addr;
 }
 
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instructions by using stop_machine() on SMP. Since
+ * stop_machine() is a heavy task, it is better to aggregate text_poke
+ * requests and do them all in one shot if possible.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+	struct text_poke_params tpp = {.params = params, .nparams = n};
+
+	atomic_set(&stop_machine_first, 1);
+	wrote_text = 0;
+	stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+}
+
 #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
 
 #ifdef CONFIG_X86_64
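As the kernel-doc above notes, the point of text_poke_smp_batch() is to pay the stop_machine() cost once for a whole set of patch sites rather than once per site. A hedged usage sketch (the caller, its array bound, and the locking boilerplate around it are hypothetical; the locking requirements themselves come from the kernel-doc):

	/* Hypothetical caller: patch n sites with a single stop_machine().
	 * Per the kernel-doc, this must run under get_online_cpus() and
	 * with text_mutex held.
	 */
	#define MY_MAX_SITES	64	/* assumed bound, not from the diff */
	static struct text_poke_param my_params[MY_MAX_SITES];

	static void patch_sites(void **addrs, const void *opcode,
				size_t len, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			my_params[i].addr   = addrs[i];
			my_params[i].opcode = opcode;
			my_params[i].len    = len;
		}
		text_poke_smp_batch(my_params, n);
	}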
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 910f20b457c4..3966b564ea47 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,7 @@
 #
 
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
-obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
-endif
-obj-$(CONFIG_HARDLOCKUP_DETECTOR)	+= hw_nmi.o
+obj-y				+= hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3f838d537392..e9e2a93783f9 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -31,7 +31,6 @@
 #include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/dmi.h>
-#include <linux/nmi.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
 
@@ -799,11 +798,7 @@ void __init setup_boot_APIC_clock(void)
 	 * PIT/HPET going.  Otherwise register lapic as a dummy
 	 * device.
 	 */
-	if (nmi_watchdog != NMI_IO_APIC)
-		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-	else
-		pr_warning("APIC timer registered as dummy,"
-			" due to nmi_watchdog=%d!\n", nmi_watchdog);
+	lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
 
 	/* Setup the lapic or request the broadcast */
 	setup_APIC_timer();
@@ -1387,7 +1382,6 @@ void __cpuinit end_local_APIC_setup(void)
 	}
 #endif
 
-	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }
 
@@ -1750,17 +1744,10 @@ int __init APIC_init_uniprocessor(void)
 		setup_IO_APIC();
 	else {
 		nr_ioapics = 0;
-		localise_nmi_watchdog();
 	}
-#else
-	localise_nmi_watchdog();
 #endif
 
 	x86_init.timers.setup_percpu_clockev();
-#ifdef CONFIG_X86_64
-	check_nmi_watchdog();
-#endif
-
 	return 0;
 }
 
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 62f6e1e55b90..a0e71cb4fa9c 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -17,16 +17,18 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(void)
 {
 	return (u64)(cpu_khz) * 1000 * 60;
 }
+#endif
 
-#ifdef ARCH_HAS_NMI_WATCHDOG
 
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 
+#ifdef arch_trigger_all_cpu_backtrace
 void arch_trigger_all_cpu_backtrace(void)
 {
 	int i;
@@ -93,16 +95,4 @@ early_initcall(register_trigger_all_cpu_backtrace);
 #endif
 
 /* STUB calls to mimic old nmi_watchdog behaviour */
-#if defined(CONFIG_X86_LOCAL_APIC)
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
-#endif
-atomic_t nmi_active = ATOMIC_INIT(0);	/* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
 int unknown_nmi_panic;
-void cpu_nmi_set_wd_enabled(void) { return; }
-void stop_apic_nmi_watchdog(void *unused) { return; }
-void setup_apic_nmi_watchdog(void *unused) { return; }
-int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7cc0a721f628..e4a040c28de1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -54,7 +54,6 @@
 #include <asm/dma.h>
 #include <asm/timer.h>
 #include <asm/i8259.h>
-#include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
 #include <asm/setup.h>
@@ -2643,24 +2642,6 @@ static void lapic_register_intr(int irq)
 		      "edge");
 }
 
-static void __init setup_nmi(void)
-{
-	/*
-	 * Dirty trick to enable the NMI watchdog ...
-	 * We put the 8259A master into AEOI mode and
-	 * unmask on all local APICs LVT0 as NMI.
-	 *
-	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
-	 * is from Maciej W. Rozycki - so we do not have to EOI from
-	 * the NMI handler or the timer interrupt.
-	 */
-	apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
-	enable_NMI_through_LVT0();
-
-	apic_printk(APIC_VERBOSE, " done.\n");
-}
-
 /*
  * This looks a bit hackish but it's about the only one way of sending
  * a few INTA cycles to 8259As and any associated glue logic.  ICR does
@@ -2766,15 +2747,6 @@ static inline void __init check_timer(void)
 	 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	legacy_pic->init(1);
-#ifdef CONFIG_X86_32
-	{
-		unsigned int ver;
-
-		ver = apic_read(APIC_LVR);
-		ver = GET_APIC_VERSION(ver);
-		timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-	}
-#endif
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2822,10 +2794,6 @@ static inline void __init check_timer(void)
 			unmask_ioapic(cfg);
 		}
 		if (timer_irq_works()) {
-			if (nmi_watchdog == NMI_IO_APIC) {
-				setup_nmi();
-				legacy_pic->unmask(0);
-			}
 			if (disable_timer_pin_1 > 0)
 				clear_IO_APIC_pin(0, pin1);
 			goto out;
@@ -2851,11 +2819,6 @@ static inline void __init check_timer(void)
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
 			timer_through_8259 = 1;
-			if (nmi_watchdog == NMI_IO_APIC) {
-				legacy_pic->mask(0);
-				setup_nmi();
-				legacy_pic->unmask(0);
-			}
 			goto out;
 		}
 		/*
@@ -2867,15 +2830,6 @@ static inline void __init check_timer(void)
 		apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
 	}
 
-	if (nmi_watchdog == NMI_IO_APIC) {
-		apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
-			    "through the IO-APIC - disabling NMI Watchdog!\n");
-		nmi_watchdog = NMI_NONE;
-	}
-#ifdef CONFIG_X86_32
-	timer_ack = 0;
-#endif
-
 	apic_printk(APIC_QUIET, KERN_INFO
 		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index c90041ccb742..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- * NMI watchdog support on APIC systems
- *
- * Started by Ingo Molnar <mingo@redhat.com>
- *
- * Fixes:
- * Mikael Pettersson	: AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson	: Power Management for local APIC NMI watchdog.
- * Mikael Pettersson	: Pentium 4 support for local APIC NMI watchdog.
- * Pavel Machek and
- * Mikael Pettersson	: PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- *     be enabled
- *  0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0);	/* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
-	return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
-	return atomic_read(&mce_entry) > 0;
-#endif
-	return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
-	return per_cpu(irq_stat, cpu).apic_timer_irqs +
-		per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
-	local_irq_enable_in_hardirq();
-	/*
-	 * Intentionally don't use cpu_relax here. This is
-	 * to make sure that the performance counter really ticks,
-	 * even if there is a simulator or similar that catches the
-	 * pause instruction. On a real HT machine this is fine because
-	 * all other CPUs are busy with "useless" delay loops and don't
-	 * care if they get somewhat less cycles.
-	 */
-	while (endflag == 0)
-		mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
-	printk(KERN_CONT "\n");
-
-	printk(KERN_WARNING
-		"WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
-			cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
-	printk(KERN_WARNING
-		"Please report this to bugzilla.kernel.org,\n");
-	printk(KERN_WARNING
-		"and attach the output of the 'dmesg' command.\n");
-
-	per_cpu(wd_enabled, cpu) = 0;
-	atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
-	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
-	unsigned int *prev_nmi_count;
-	int cpu;
-
-	if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
-		return 0;
-
-	prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
-	if (!prev_nmi_count)
-		goto error;
-
-	printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
-	for_each_possible_cpu(cpu)
-		prev_nmi_count[cpu] = get_nmi_count(cpu);
-	local_irq_enable();
-	mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
-	for_each_online_cpu(cpu) {
-		if (!per_cpu(wd_enabled, cpu))
-			continue;
-		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
-			report_broken_nmi(cpu, prev_nmi_count);
-	}
-	endflag = 1;
-	if (!atomic_read(&nmi_active)) {
-		kfree(prev_nmi_count);
-		atomic_set(&nmi_active, -1);
-		goto error;
-	}
-	printk("OK.\n");
-
-	/*
-	 * now that we know it works we can reduce NMI frequency to
-	 * something more reasonable; makes a difference in some configs
-	 */
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		nmi_hz = lapic_adjust_nmi_hz(1);
-
-	kfree(prev_nmi_count);
-	return 0;
-error:
-	if (nmi_watchdog == NMI_IO_APIC) {
-		if (!timer_through_8259)
-			legacy_pic->mask(0);
-		on_each_cpu(__acpi_nmi_disable, NULL, 1);
-	}
-
-#ifdef CONFIG_X86_32
-	timer_ack = 0;
-#endif
-	return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
-	unsigned int nmi;
-
-	if (!strncmp(str, "panic", 5)) {
-		panic_on_timeout = 1;
-		str = strchr(str, ',');
-		if (!str)
-			return 1;
-		++str;
-	}
-
-	if (!strncmp(str, "lapic", 5))
-		nmi_watchdog = NMI_LOCAL_APIC;
-	else if (!strncmp(str, "ioapic", 6))
-		nmi_watchdog = NMI_IO_APIC;
-	else {
-		get_option(&str, &nmi);
-		if (nmi >= NMI_INVALID)
-			return 0;
-		nmi_watchdog = nmi;
-	}
-
-	return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
-	/* only CPU0 goes here, other CPUs should be offline */
-	nmi_pm_active = atomic_read(&nmi_active);
-	stop_apic_nmi_watchdog(NULL);
-	BUG_ON(atomic_read(&nmi_active) != 0);
-	return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
-	/* only CPU0 goes here, other CPUs should be offline */
-	if (nmi_pm_active > 0) {
-		setup_apic_nmi_watchdog(NULL);
-		touch_nmi_watchdog();
-	}
-	return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
-	.name		= "lapic_nmi",
-	.resume		= lapic_nmi_resume,
-	.suspend	= lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
-	.id	= 0,
-	.cls	= &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
-	int error;
-
-	/*
-	 * should really be a BUG_ON but b/c this is an
-	 * init call, it just doesn't work.  -dcz
-	 */
-	if (nmi_watchdog != NMI_LOCAL_APIC)
-		return 0;
-
-	if (atomic_read(&nmi_active) < 0)
-		return 0;
-
-	error = sysdev_class_register(&nmi_sysclass);
-	if (!error)
-		error = sysdev_register(&device_lapic_nmi);
-	return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif	/* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
-	apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
-	__get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
-	if (__get_cpu_var(wd_enabled))
-		return;
-
-	/* cheap hack to support suspend/resume */
-	/* if cpu0 is not active neither should the other cpus */
-	if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
-		return;
-
-	switch (nmi_watchdog) {
-	case NMI_LOCAL_APIC:
-		if (lapic_watchdog_init(nmi_hz) < 0) {
-			__get_cpu_var(wd_enabled) = 0;
-			return;
-		}
-		/* FALL THROUGH */
-	case NMI_IO_APIC:
-		__get_cpu_var(wd_enabled) = 1;
-		atomic_inc(&nmi_active);
-	}
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
-	/* only support LOCAL and IO APICs for now */
-	if (!nmi_watchdog_active())
-		return;
-	if (__get_cpu_var(wd_enabled) == 0)
-		return;
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		lapic_watchdog_stop();
-	else
-		__acpi_nmi_disable(NULL);
-	__get_cpu_var(wd_enabled) = 0;
-	atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(long, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
-	if (nmi_watchdog_active()) {
-		unsigned cpu;
-
-		/*
-		 * Tell other CPUs to reset their alert counters. We cannot
-		 * do it ourselves because the alert count increase is not
-		 * atomic.
-		 */
-		for_each_present_cpu(cpu) {
-			if (per_cpu(nmi_touch, cpu) != 1)
-				per_cpu(nmi_touch, cpu) = 1;
-		}
-	}
-
-	/*
-	 * Tickle the softlockup detector too:
-	 */
-	touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
-	/*
-	 * Since current_thread_info()-> is always on the stack, and we
-	 * always switch the stack NMI-atomically, it's safe to use
-	 * smp_processor_id().
-	 */
-	unsigned int sum;
-	int touched = 0;
-	int cpu = smp_processor_id();
-	int rc = 0;
-
-	sum = get_timer_irqs(cpu);
-
-	if (__get_cpu_var(nmi_touch)) {
-		__get_cpu_var(nmi_touch) = 0;
-		touched = 1;
-	}
-
-	/* We can be called before check_nmi_watchdog, hence NULL check. */
-	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
-
-		raw_spin_lock(&lock);
-		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
-		show_regs(regs);
-		dump_stack();
-		raw_spin_unlock(&lock);
-		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-
-		rc = 1;
-	}
-
-	/* Could check oops_in_progress here too, but it's safer not to */
-	if (mce_in_progress())
-		touched = 1;
-
-	/* if the none of the timers isn't firing, this cpu isn't doing much */
-	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
-		/*
-		 * Ayiee, looks like this CPU is stuck ...
-		 * wait a few IRQs (5 seconds) before doing the oops ...
-		 */
-		__this_cpu_inc(alert_counter);
-		if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
-			/*
-			 * die_nmi will return ONLY if NOTIFY_STOP happens..
-			 */
-			die_nmi("BUG: NMI Watchdog detected LOCKUP",
-				regs, panic_on_timeout);
-	} else {
-		__get_cpu_var(last_irq_sum) = sum;
-		__this_cpu_write(alert_counter, 0);
-	}
-
-	/* see if the nmi watchdog went off */
-	if (!__get_cpu_var(wd_enabled))
-		return rc;
-	switch (nmi_watchdog) {
-	case NMI_LOCAL_APIC:
-		rc |= lapic_wd_event(nmi_hz);
-		break;
-	case NMI_IO_APIC:
-		/*
-		 * don't know how to accurately check for this.
-		 * just assume it was a watchdog timer interrupt
-		 * This matches the old behaviour.
-		 */
-		rc = 1;
-		break;
-	}
-	return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
-	__get_cpu_var(wd_enabled) = 1;
-	atomic_inc(&nmi_active);
-	__acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
-	on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
-	touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
-	on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
-	unknown_nmi_panic = 1;
-	return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
-	unsigned char reason = get_nmi_reason();
-	char buf[64];
-
-	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-	die_nmi(buf, regs, 1); /* Always panic here */
-	return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write,
-			void __user *buffer, size_t *length, loff_t *ppos)
-{
-	int old_state;
-
-	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
-	old_state = nmi_watchdog_enabled;
-	proc_dointvec(table, write, buffer, length, ppos);
-	if (!!old_state == !!nmi_watchdog_enabled)
-		return 0;
-
-	if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
-		printk(KERN_WARNING
-			"NMI watchdog is permanently disabled\n");
-		return -EIO;
-	}
-
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		if (nmi_watchdog_enabled)
-			enable_lapic_nmi_watchdog();
-		else
-			disable_lapic_nmi_watchdog();
-	} else if (nmi_watchdog == NMI_IO_APIC) {
-		if (nmi_watchdog_enabled)
-			enable_ioapic_nmi_watchdog();
-		else
-			disable_ioapic_nmi_watchdog();
-	} else {
-		printk(KERN_WARNING
-			"NMI watchdog doesn't know what hardware to touch\n");
-		return -EIO;
-	}
-	return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
-	if (unknown_nmi_panic)
-		return unknown_nmi_panic_callback(regs, cpu);
-#endif
-	return 0;
-}
-
-void arch_trigger_all_cpu_backtrace(void)
-{
-	int i;
-
-	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-
-	printk(KERN_INFO "sending NMI to all CPUs:\n");
-	apic->send_IPI_all(NMI_VECTOR);
-
-	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
-	for (i = 0; i < 10 * 1000; i++) {
-		if (cpumask_empty(to_cpumask(backtrace_mask)))
-			break;
-		mdelay(1);
-	}
-}
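The 567 lines deleted above have no in-tree successor in this directory: the generic perf-based hard-lockup detector in kernel/watchdog.c takes over, programming a sampling hardware perf event instead of hand-managing LAPIC performance counters. A rough sketch of that replacement mechanism, assuming the perf_event_create_kernel_counter() signature of this kernel series (attribute values are illustrative, not copied from watchdog.c):

	static struct perf_event_attr wd_hw_attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(struct perf_event_attr),
		.pinned		= 1,
		.disabled	= 1,
	};

	/* One pinned, cycle-counting event per CPU; its NMI overflow
	 * handler checks whether timer interrupts are still advancing.
	 */
	static struct perf_event *create_hardlockup_event(int cpu)
	{
		wd_hw_attr.sample_period = hw_nmi_get_sample_period();
		return perf_event_create_kernel_counter(&wd_hw_attr, cpu,
				NULL, watchdog_overflow_callback);
	}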
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b68bda30938..1d59834396bd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -894,7 +894,6 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
-	init_hw_perf_events();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 6d75b9145b13..817d2b195e8e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void)
 {
 	int i;
 
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		disable_lapic_nmi_watchdog();
-
 	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
@@ -355,9 +352,6 @@ perfctr_fail:
 	for (i--; i >= 0; i--)
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		enable_lapic_nmi_watchdog();
-
 	return false;
 }
 
@@ -369,9 +363,6 @@ static void release_pmc_hardware(void)
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
-
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		enable_lapic_nmi_watchdog();
 }
 
 #else
@@ -451,7 +442,7 @@ static int x86_setup_perfctr(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	u64 config;
 
-	if (!hwc->sample_period) {
+	if (!is_sampling_event(event)) {
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
 		local64_set(&hwc->period_left, hwc->sample_period);
@@ -1362,7 +1353,7 @@ static void __init pmu_check_apic(void)
 	pr_info("no hardware sampling interrupt available.\n");
 }
 
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
 {
 	struct event_constraint *c;
 	int err;
@@ -1377,11 +1368,11 @@ void __init init_hw_perf_events(void)
 		err = amd_pmu_init();
 		break;
 	default:
-		return;
+		return 0;
 	}
 	if (err != 0) {
 		pr_cont("no PMU driver, software events only.\n");
-		return;
+		return 0;
 	}
 
 	pmu_check_apic();
@@ -1389,7 +1380,7 @@ void __init init_hw_perf_events(void)
 	/* sanity check that the hardware exists or is emulated */
 	if (!check_hw_exists()) {
 		pr_cont("Broken PMU hardware detected, software events only.\n");
-		return;
+		return 0;
 	}
 
 	pr_cont("%s PMU driver.\n", x86_pmu.name);
@@ -1440,7 +1431,10 @@ void __init init_hw_perf_events(void)
 
 	perf_pmu_register(&pmu);
 	perf_cpu_notifier(x86_pmu_notifier);
+
+	return 0;
 }
+early_initcall(init_hw_perf_events);
 
 static inline void x86_pmu_read(struct perf_event *event)
 {
@@ -1686,7 +1680,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	perf_callchain_store(entry, regs->ip);
 
-	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
+	dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d9f4ff8fcd69..14d45928c282 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -22,26 +22,6 @@
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_event.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr;
27 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
28 unsigned int evntsel_msr; /* the MSR to select the events to handle */
29};
30
31/* Interface defining a CPU specific perfctr watchdog */
32struct wd_ops {
33 int (*reserve)(void);
34 void (*unreserve)(void);
35 int (*setup)(unsigned nmi_hz);
36 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
37 void (*stop)(void);
38 unsigned perfctr;
39 unsigned evntsel;
40 u64 checkbit;
41};
42
43static const struct wd_ops *wd_ops;
44
45/* 25/*
46 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 26 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
47 * offset from MSR_P4_BSU_ESCR0. 27 * offset from MSR_P4_BSU_ESCR0.
@@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops;
60static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); 40static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
61static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); 41static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
62 42
63static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
64
65/* converts an msr to an appropriate reservation bit */ 43/* converts an msr to an appropriate reservation bit */
66static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) 44static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
67{ 45{
@@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr)
172 clear_bit(counter, evntsel_nmi_owner); 150 clear_bit(counter, evntsel_nmi_owner);
173} 151}
174EXPORT_SYMBOL(release_evntsel_nmi); 152EXPORT_SYMBOL(release_evntsel_nmi);
175
176void disable_lapic_nmi_watchdog(void)
177{
178 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
179
180 if (atomic_read(&nmi_active) <= 0)
181 return;
182
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184
185 if (wd_ops)
186 wd_ops->unreserve();
187
188 BUG_ON(atomic_read(&nmi_active) != 0);
189}
190
191void enable_lapic_nmi_watchdog(void)
192{
193 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
194
195 /* are we already enabled */
196 if (atomic_read(&nmi_active) != 0)
197 return;
198
199 /* are we lapic aware */
200 if (!wd_ops)
201 return;
202 if (!wd_ops->reserve()) {
203 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
204 return;
205 }
206
207 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
208 touch_nmi_watchdog();
209}
210
211/*
212 * Activate the NMI watchdog via the local APIC.
213 */
214
215static unsigned int adjust_for_32bit_ctr(unsigned int hz)
216{
217 u64 counter_val;
218 unsigned int retval = hz;
219
220 /*
221 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
222 * are writable, with higher bits sign extending from bit 31.
223 * So, we can only program the counter with 31-bit values, and
224 * the 32nd bit should be 1, so that bits 33 and up sign-extend to 1.
225 * Find the appropriate nmi_hz for that constraint.
226 */
227 counter_val = (u64)cpu_khz * 1000;
228 do_div(counter_val, retval);
229 if (counter_val > 0x7fffffffULL) {
230 u64 count = (u64)cpu_khz * 1000;
231 do_div(count, 0x7fffffffUL);
232 retval = count + 1;
233 }
234 return retval;
235}
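A quick worked example of the clamp above, as standalone C (the cpu_khz value is made up for illustration): at 3 GHz, one NMI per second would need a count of 3e9, which exceeds the 31-bit limit, so the rate is raised to 2 Hz.

#include <stdio.h>
#include <stdint.h>

/* Standalone restatement of adjust_for_32bit_ctr(); plain division
 * stands in for do_div(), and cpu_khz is an example value rather
 * than one read from hardware. */
static unsigned int adjust_for_32bit_ctr_demo(uint64_t cpu_khz, unsigned int hz)
{
	uint64_t counter_val = cpu_khz * 1000 / hz;

	if (counter_val > 0x7fffffffULL)
		hz = cpu_khz * 1000 / 0x7fffffffULL + 1;
	return hz;
}

int main(void)
{
	printf("%u\n", adjust_for_32bit_ctr_demo(3000000, 1)); /* prints 2 */
	return 0;
}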
236
237static void write_watchdog_counter(unsigned int perfctr_msr,
238 const char *descr, unsigned nmi_hz)
239{
240 u64 count = (u64)cpu_khz * 1000;
241
242 do_div(count, nmi_hz);
243 if (descr)
244 pr_debug("setting %s to -0x%08Lx\n", descr, count);
245 wrmsrl(perfctr_msr, 0 - count);
246}
247
248static void write_watchdog_counter32(unsigned int perfctr_msr,
249 const char *descr, unsigned nmi_hz)
250{
251 u64 count = (u64)cpu_khz * 1000;
252
253 do_div(count, nmi_hz);
254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsr(perfctr_msr, (u32)(-count), 0);
257}
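The negative value written above is what makes the perfctr fire periodically: the counter counts up and raises its interrupt on overflow, so arming it with -(cpu_khz * 1000 / nmi_hz) yields one NMI every 1/nmi_hz seconds. A tiny standalone illustration (made-up clock speed):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 2 GHz clock, nmi_hz = 1000: arm the counter with
	 * -(2e9 / 1000) = -2000000, so it overflows, firing the NMI,
	 * after two million cycles, i.e. once per millisecond. */
	uint64_t cpu_khz = 2000000, nmi_hz = 1000;
	uint64_t count = cpu_khz * 1000 / nmi_hz;

	printf("wrmsrl value: 0x%016llx\n", (unsigned long long)(0 - count));
	return 0;
}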
258
259/*
260 * AMD K7/K8/Family10h/Family11h support.
261 * AMD keeps this interface nicely stable so there is not much variety
262 */
263#define K7_EVNTSEL_ENABLE (1 << 22)
264#define K7_EVNTSEL_INT (1 << 20)
265#define K7_EVNTSEL_OS (1 << 17)
266#define K7_EVNTSEL_USR (1 << 16)
267#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
268#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
269
270static int setup_k7_watchdog(unsigned nmi_hz)
271{
272 unsigned int perfctr_msr, evntsel_msr;
273 unsigned int evntsel;
274 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
275
276 perfctr_msr = wd_ops->perfctr;
277 evntsel_msr = wd_ops->evntsel;
278
279 wrmsrl(perfctr_msr, 0UL);
280
281 evntsel = K7_EVNTSEL_INT
282 | K7_EVNTSEL_OS
283 | K7_EVNTSEL_USR
284 | K7_NMI_EVENT;
285
286 /* setup the timer */
287 wrmsr(evntsel_msr, evntsel, 0);
288 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
289
290 /* initialize the wd struct before enabling */
291 wd->perfctr_msr = perfctr_msr;
292 wd->evntsel_msr = evntsel_msr;
293 wd->cccr_msr = 0; /* unused */
294
295 /* ok, everything is initialized, announce that we're set */
296 cpu_nmi_set_wd_enabled();
297
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301
302 return 1;
303}
304
305static void single_msr_stop_watchdog(void)
306{
307 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
308
309 wrmsr(wd->evntsel_msr, 0, 0);
310}
311
312static int single_msr_reserve(void)
313{
314 if (!reserve_perfctr_nmi(wd_ops->perfctr))
315 return 0;
316
317 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
318 release_perfctr_nmi(wd_ops->perfctr);
319 return 0;
320 }
321 return 1;
322}
323
324static void single_msr_unreserve(void)
325{
326 release_evntsel_nmi(wd_ops->evntsel);
327 release_perfctr_nmi(wd_ops->perfctr);
328}
329
330static void __kprobes
331single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
332{
333 /* start the cycle over again */
334 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
335}
336
337static const struct wd_ops k7_wd_ops = {
338 .reserve = single_msr_reserve,
339 .unreserve = single_msr_unreserve,
340 .setup = setup_k7_watchdog,
341 .rearm = single_msr_rearm,
342 .stop = single_msr_stop_watchdog,
343 .perfctr = MSR_K7_PERFCTR0,
344 .evntsel = MSR_K7_EVNTSEL0,
345 .checkbit = 1ULL << 47,
346};
347
348/*
349 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
350 */
351#define P6_EVNTSEL0_ENABLE (1 << 22)
352#define P6_EVNTSEL_INT (1 << 20)
353#define P6_EVNTSEL_OS (1 << 17)
354#define P6_EVNTSEL_USR (1 << 16)
355#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
356#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
357
358static int setup_p6_watchdog(unsigned nmi_hz)
359{
360 unsigned int perfctr_msr, evntsel_msr;
361 unsigned int evntsel;
362 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
363
364 perfctr_msr = wd_ops->perfctr;
365 evntsel_msr = wd_ops->evntsel;
366
367 /* KVM doesn't implement this MSR */
368 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
369 return 0;
370
371 evntsel = P6_EVNTSEL_INT
372 | P6_EVNTSEL_OS
373 | P6_EVNTSEL_USR
374 | P6_NMI_EVENT;
375
376 /* setup the timer */
377 wrmsr(evntsel_msr, evntsel, 0);
378 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
379 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
380
381 /* initialize the wd struct before enabling */
382 wd->perfctr_msr = perfctr_msr;
383 wd->evntsel_msr = evntsel_msr;
384 wd->cccr_msr = 0; /* unused */
385
386 /* ok, everything is initialized, announce that we're set */
387 cpu_nmi_set_wd_enabled();
388
389 apic_write(APIC_LVTPC, APIC_DM_NMI);
390 evntsel |= P6_EVNTSEL0_ENABLE;
391 wrmsr(evntsel_msr, evntsel, 0);
392
393 return 1;
394}
395
396static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
397{
398 /*
399 * P6-based Pentium M needs to re-unmask
400 * the apic vector, but it doesn't hurt
401 * other P6 variants.
402 * ArchPerfmon/Core Duo also needs this.
403 */
404 apic_write(APIC_LVTPC, APIC_DM_NMI);
405
406 /* P6/ARCH_PERFMON has 32 bit counter write */
407 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
408}
409
410static const struct wd_ops p6_wd_ops = {
411 .reserve = single_msr_reserve,
412 .unreserve = single_msr_unreserve,
413 .setup = setup_p6_watchdog,
414 .rearm = p6_rearm,
415 .stop = single_msr_stop_watchdog,
416 .perfctr = MSR_P6_PERFCTR0,
417 .evntsel = MSR_P6_EVNTSEL0,
418 .checkbit = 1ULL << 39,
419};
420
421/*
422 * Intel P4 performance counters.
423 * By far the most complicated of all.
424 */
425#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
426#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
427#define P4_ESCR_OS (1 << 3)
428#define P4_ESCR_USR (1 << 2)
429#define P4_CCCR_OVF_PMI0 (1 << 26)
430#define P4_CCCR_OVF_PMI1 (1 << 27)
431#define P4_CCCR_THRESHOLD(N) ((N) << 20)
432#define P4_CCCR_COMPLEMENT (1 << 19)
433#define P4_CCCR_COMPARE (1 << 18)
434#define P4_CCCR_REQUIRED (3 << 16)
435#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
436#define P4_CCCR_ENABLE (1 << 12)
437#define P4_CCCR_OVF (1 << 31)
438
439#define P4_CONTROLS 18
440static unsigned int p4_controls[18] = {
441 MSR_P4_BPU_CCCR0,
442 MSR_P4_BPU_CCCR1,
443 MSR_P4_BPU_CCCR2,
444 MSR_P4_BPU_CCCR3,
445 MSR_P4_MS_CCCR0,
446 MSR_P4_MS_CCCR1,
447 MSR_P4_MS_CCCR2,
448 MSR_P4_MS_CCCR3,
449 MSR_P4_FLAME_CCCR0,
450 MSR_P4_FLAME_CCCR1,
451 MSR_P4_FLAME_CCCR2,
452 MSR_P4_FLAME_CCCR3,
453 MSR_P4_IQ_CCCR0,
454 MSR_P4_IQ_CCCR1,
455 MSR_P4_IQ_CCCR2,
456 MSR_P4_IQ_CCCR3,
457 MSR_P4_IQ_CCCR4,
458 MSR_P4_IQ_CCCR5,
459};
460/*
461 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
462 * CRU_ESCR0 (with any non-null event selector) through a complemented
463 * max threshold. [IA32-Vol3, Section 14.9.9]
464 */
465static int setup_p4_watchdog(unsigned nmi_hz)
466{
467 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
468 unsigned int evntsel, cccr_val;
469 unsigned int misc_enable, dummy;
470 unsigned int ht_num;
471 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472
473 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
474 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
475 return 0;
476
477#ifdef CONFIG_SMP
478 /* detect which hyperthread we are on */
479 if (smp_num_siblings == 2) {
480 unsigned int ebx, apicid;
481
482 ebx = cpuid_ebx(1);
483 apicid = (ebx >> 24) & 0xff;
484 ht_num = apicid & 1;
485 } else
486#endif
487 ht_num = 0;
488
489 /*
490 * performance counters are shared resources, so
491 * assign each hyperthread its own set
492 * (re-use the ESCR0 register, seems safe
493 * and keeps the cccr_val the same)
494 */
495 if (!ht_num) {
496 /* logical cpu 0 */
497 perfctr_msr = MSR_P4_IQ_PERFCTR0;
498 evntsel_msr = MSR_P4_CRU_ESCR0;
499 cccr_msr = MSR_P4_IQ_CCCR0;
500 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
501
502 /*
503 * If we're on the kdump kernel or other situation, we may
504 * still have other performance counter registers set to
505 * interrupt and they'll keep interrupting forever because
506 * of the P4_CCCR_OVF quirk. So we need to ACK all the
507 * pending interrupts and disable all the registers here,
508 * before reenabling the NMI delivery. Refer to p4_rearm()
509 * about the P4_CCCR_OVF quirk.
510 */
511 if (reset_devices) {
512 unsigned int low, high;
513 int i;
514
515 for (i = 0; i < P4_CONTROLS; i++) {
516 rdmsr(p4_controls[i], low, high);
517 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
518 wrmsr(p4_controls[i], low, high);
519 }
520 }
521 } else {
522 /* logical cpu 1 */
523 perfctr_msr = MSR_P4_IQ_PERFCTR1;
524 evntsel_msr = MSR_P4_CRU_ESCR0;
525 cccr_msr = MSR_P4_IQ_CCCR1;
526
527 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
528 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
529 cccr_val = P4_CCCR_OVF_PMI0;
530 else
531 cccr_val = P4_CCCR_OVF_PMI1;
532 cccr_val |= P4_CCCR_ESCR_SELECT(4);
533 }
534
535 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
536 | P4_ESCR_OS
537 | P4_ESCR_USR;
538
539 cccr_val |= P4_CCCR_THRESHOLD(15)
540 | P4_CCCR_COMPLEMENT
541 | P4_CCCR_COMPARE
542 | P4_CCCR_REQUIRED;
543
544 wrmsr(evntsel_msr, evntsel, 0);
545 wrmsr(cccr_msr, cccr_val, 0);
546 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
547
548 wd->perfctr_msr = perfctr_msr;
549 wd->evntsel_msr = evntsel_msr;
550 wd->cccr_msr = cccr_msr;
551
552 /* ok, everything is initialized, announce that we're set */
553 cpu_nmi_set_wd_enabled();
554
555 apic_write(APIC_LVTPC, APIC_DM_NMI);
556 cccr_val |= P4_CCCR_ENABLE;
557 wrmsr(cccr_msr, cccr_val, 0);
558 return 1;
559}
560
561static void stop_p4_watchdog(void)
562{
563 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
564 wrmsr(wd->cccr_msr, 0, 0);
565 wrmsr(wd->evntsel_msr, 0, 0);
566}
567
568static int p4_reserve(void)
569{
570 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
571 return 0;
572#ifdef CONFIG_SMP
573 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
574 goto fail1;
575#endif
576 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
577 goto fail2;
578 /* RED-PEN why is ESCR1 not reserved here? */
579 return 1;
580 fail2:
581#ifdef CONFIG_SMP
582 if (smp_num_siblings > 1)
583 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
584 fail1:
585#endif
586 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
587 return 0;
588}
589
590static void p4_unreserve(void)
591{
592#ifdef CONFIG_SMP
593 if (smp_num_siblings > 1)
594 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
595#endif
596 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
597 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
598}
599
600static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
601{
602 unsigned dummy;
603 /*
604 * P4 quirks:
605 * - An overflowed perfctr will assert its interrupt
606 * until the OVF flag in its CCCR is cleared.
607 * - LVTPC is masked on interrupt and must be
608 * unmasked by the LVTPC handler.
609 */
610 rdmsrl(wd->cccr_msr, dummy);
611 dummy &= ~P4_CCCR_OVF;
612 wrmsrl(wd->cccr_msr, dummy);
613 apic_write(APIC_LVTPC, APIC_DM_NMI);
614 /* start the cycle over again */
615 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
616}
617
618static const struct wd_ops p4_wd_ops = {
619 .reserve = p4_reserve,
620 .unreserve = p4_unreserve,
621 .setup = setup_p4_watchdog,
622 .rearm = p4_rearm,
623 .stop = stop_p4_watchdog,
624 /* RED-PEN this is wrong for the other sibling */
625 .perfctr = MSR_P4_BPU_PERFCTR0,
626 .evntsel = MSR_P4_BSU_ESCR0,
627 .checkbit = 1ULL << 39,
628};
629
630/*
631 * Watchdog using the Intel architected PerfMon.
632 * Used for Core2 and hopefully all future Intel CPUs.
633 */
634#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
635#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
636
637static struct wd_ops intel_arch_wd_ops;
638
639static int setup_intel_arch_watchdog(unsigned nmi_hz)
640{
641 unsigned int ebx;
642 union cpuid10_eax eax;
643 unsigned int unused;
644 unsigned int perfctr_msr, evntsel_msr;
645 unsigned int evntsel;
646 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
647
648 /*
649 * Check whether the Architectural PerfMon supports
650 * Unhalted Core Cycles Event.
651 * NOTE: a corresponding ebx bit of 0 indicates the event is present.
652 */
653 cpuid(10, &(eax.full), &ebx, &unused, &unused);
654 if ((eax.split.mask_length <
655 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
656 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
657 return 0;
658
659 perfctr_msr = wd_ops->perfctr;
660 evntsel_msr = wd_ops->evntsel;
661
662 wrmsrl(perfctr_msr, 0UL);
663
664 evntsel = ARCH_PERFMON_EVENTSEL_INT
665 | ARCH_PERFMON_EVENTSEL_OS
666 | ARCH_PERFMON_EVENTSEL_USR
667 | ARCH_PERFMON_NMI_EVENT_SEL
668 | ARCH_PERFMON_NMI_EVENT_UMASK;
669
670 /* setup the timer */
671 wrmsr(evntsel_msr, evntsel, 0);
672 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
673 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
674
675 wd->perfctr_msr = perfctr_msr;
676 wd->evntsel_msr = evntsel_msr;
677 wd->cccr_msr = 0; /* unused */
678
679 /* ok, everything is initialized, announce that we're set */
680 cpu_nmi_set_wd_enabled();
681
682 apic_write(APIC_LVTPC, APIC_DM_NMI);
683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
684 wrmsr(evntsel_msr, evntsel, 0);
685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
686 return 1;
687}
688
689static struct wd_ops intel_arch_wd_ops __read_mostly = {
690 .reserve = single_msr_reserve,
691 .unreserve = single_msr_unreserve,
692 .setup = setup_intel_arch_watchdog,
693 .rearm = p6_rearm,
694 .stop = single_msr_stop_watchdog,
695 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
696 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
697};
698
699static void probe_nmi_watchdog(void)
700{
701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 == 6 ||
704 (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
705 wd_ops = &k7_wd_ops;
706 return;
707 case X86_VENDOR_INTEL:
708 /* Work around CPUs where perfctr1 doesn't have a working enable
709 * bit, as described in the following errata:
710 * AE49 Core Duo and Intel Core Solo 65 nm
711 * AN49 Intel Pentium Dual-Core
712 * AF49 Dual-Core Intel Xeon Processor LV
713 */
714 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
715 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
716 boot_cpu_data.x86_mask == 4))) {
717 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
718 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
719 }
720 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
721 wd_ops = &intel_arch_wd_ops;
722 break;
723 }
724 switch (boot_cpu_data.x86) {
725 case 6:
726 if (boot_cpu_data.x86_model > 13)
727 return;
728
729 wd_ops = &p6_wd_ops;
730 break;
731 case 15:
732 wd_ops = &p4_wd_ops;
733 break;
734 default:
735 return;
736 }
737 break;
738 }
739}
740
741/* Interface to nmi.c */
742
743int lapic_watchdog_init(unsigned nmi_hz)
744{
745 if (!wd_ops) {
746 probe_nmi_watchdog();
747 if (!wd_ops) {
748 printk(KERN_INFO "NMI watchdog: CPU not supported\n");
749 return -1;
750 }
751
752 if (!wd_ops->reserve()) {
753 printk(KERN_ERR
754 "NMI watchdog: cannot reserve perfctrs\n");
755 return -1;
756 }
757 }
758
759 if (!(wd_ops->setup(nmi_hz))) {
760 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
761 raw_smp_processor_id());
762 return -1;
763 }
764
765 return 0;
766}
767
768void lapic_watchdog_stop(void)
769{
770 if (wd_ops)
771 wd_ops->stop();
772}
773
774unsigned lapic_adjust_nmi_hz(unsigned hz)
775{
776 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
777 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
778 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
779 hz = adjust_for_32bit_ctr(hz);
780 return hz;
781}
782
783int __kprobes lapic_wd_event(unsigned nmi_hz)
784{
785 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
786 u64 ctr;
787
788 rdmsrl(wd->perfctr_msr, ctr);
789 if (ctr & wd_ops->checkbit) /* perfctr still running? */
790 return 0;
791
792 wd_ops->rearm(wd, nmi_hz);
793 return 1;
794}
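The checkbit test above works because the counter is armed with a negative value: the top implemented bit (bit 47 on K7, bit 39 on P6/P4, per the wd_ops tables above) stays set while the count is still climbing toward zero, and clears only once the counter wraps. A standalone illustration with a 40-bit counter:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 40-bit counter armed with -3 for illustration; checkbit is
	 * 1 << 39 as in p6_wd_ops. The bit reads 1 while counting
	 * -3, -2, -1 and reads 0 once the counter wraps to zero. */
	const uint64_t mask = (1ULL << 40) - 1, checkbit = 1ULL << 39;
	uint64_t ctr = (0 - 3ULL) & mask;
	int tick;

	for (tick = 0; tick < 4; tick++) {
		printf("ctr=%010llx running=%d\n",
		       (unsigned long long)ctr, !!(ctr & checkbit));
		ctr = (ctr + 1) & mask;
	}
	return 0;
}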
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..8474c998cbd4 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
175 175
176void 176void
177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
178 unsigned long *stack, unsigned long bp, char *log_lvl) 178 unsigned long *stack, char *log_lvl)
179{ 179{
180 printk("%sCall Trace:\n", log_lvl); 180 printk("%sCall Trace:\n", log_lvl);
181 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 181 dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
182} 182}
183 183
184void show_trace(struct task_struct *task, struct pt_regs *regs, 184void show_trace(struct task_struct *task, struct pt_regs *regs,
185 unsigned long *stack, unsigned long bp) 185 unsigned long *stack)
186{ 186{
187 show_trace_log_lvl(task, regs, stack, bp, ""); 187 show_trace_log_lvl(task, regs, stack, "");
188} 188}
189 189
190void show_stack(struct task_struct *task, unsigned long *sp) 190void show_stack(struct task_struct *task, unsigned long *sp)
191{ 191{
192 show_stack_log_lvl(task, NULL, sp, 0, ""); 192 show_stack_log_lvl(task, NULL, sp, "");
193} 193}
194 194
195/* 195/*
@@ -210,7 +210,7 @@ void dump_stack(void)
210 init_utsname()->release, 210 init_utsname()->release,
211 (int)strcspn(init_utsname()->version, " "), 211 (int)strcspn(init_utsname()->version, " "),
212 init_utsname()->version); 212 init_utsname()->version);
213 show_trace(NULL, NULL, &stack, bp); 213 show_trace(NULL, NULL, &stack);
214} 214}
215EXPORT_SYMBOL(dump_stack); 215EXPORT_SYMBOL(dump_stack);
216 216
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 1bc7f75a5bda..74cc1eda384b 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,11 +17,12 @@
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19 19
20void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task,
21 unsigned long *stack, unsigned long bp, 21 struct pt_regs *regs, unsigned long *stack,
22 const struct stacktrace_ops *ops, void *data) 22 const struct stacktrace_ops *ops, void *data)
23{ 23{
24 int graph = 0; 24 int graph = 0;
25 unsigned long bp;
25 26
26 if (!task) 27 if (!task)
27 task = current; 28 task = current;
@@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
34 stack = (unsigned long *)task->thread.sp; 35 stack = (unsigned long *)task->thread.sp;
35 } 36 }
36 37
37#ifdef CONFIG_FRAME_POINTER 38 bp = stack_frame(task, regs);
38 if (!bp) {
39 if (task == current) {
40 /* Grab bp right from our regs */
41 get_bp(bp);
42 } else {
43 /* bp is the last reg pushed by switch_to */
44 bp = *(unsigned long *) task->thread.sp;
45 }
46 }
47#endif
48
49 for (;;) { 39 for (;;) {
50 struct thread_info *context; 40 struct thread_info *context;
51 41
@@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace);
65 55
66void 56void
67show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 57show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
68 unsigned long *sp, unsigned long bp, char *log_lvl) 58 unsigned long *sp, char *log_lvl)
69{ 59{
70 unsigned long *stack; 60 unsigned long *stack;
71 int i; 61 int i;
@@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
87 touch_nmi_watchdog(); 77 touch_nmi_watchdog();
88 } 78 }
89 printk(KERN_CONT "\n"); 79 printk(KERN_CONT "\n");
90 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 80 show_trace_log_lvl(task, regs, sp, log_lvl);
91} 81}
92 82
93 83
@@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs)
112 u8 *ip; 102 u8 *ip;
113 103
114 printk(KERN_EMERG "Stack:\n"); 104 printk(KERN_EMERG "Stack:\n");
115 show_stack_log_lvl(NULL, regs, &regs->sp, 105 show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
116 0, KERN_EMERG);
117 106
118 printk(KERN_EMERG "Code: "); 107 printk(KERN_EMERG "Code: ");
119 108
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6a340485249a..64101335de19 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
140 */ 140 */
141 141
142void dump_trace(struct task_struct *task, struct pt_regs *regs, 142void dump_trace(struct task_struct *task,
143 unsigned long *stack, unsigned long bp, 143 struct pt_regs *regs, unsigned long *stack,
144 const struct stacktrace_ops *ops, void *data) 144 const struct stacktrace_ops *ops, void *data)
145{ 145{
146 const unsigned cpu = get_cpu(); 146 const unsigned cpu = get_cpu();
@@ -149,6 +149,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long bp;
152 153
153 if (!task) 154 if (!task)
154 task = current; 155 task = current;
@@ -160,18 +161,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
160 stack = (unsigned long *)task->thread.sp; 161 stack = (unsigned long *)task->thread.sp;
161 } 162 }
162 163
163#ifdef CONFIG_FRAME_POINTER 164 bp = stack_frame(task, regs);
164 if (!bp) {
165 if (task == current) {
166 /* Grab bp right from our regs */
167 get_bp(bp);
168 } else {
169 /* bp is the last reg pushed by switch_to */
170 bp = *(unsigned long *) task->thread.sp;
171 }
172 }
173#endif
174
175 /* 165 /*
176 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
177 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
235 225
236void 226void
237show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
238 unsigned long *sp, unsigned long bp, char *log_lvl) 228 unsigned long *sp, char *log_lvl)
239{ 229{
240 unsigned long *irq_stack_end; 230 unsigned long *irq_stack_end;
241 unsigned long *irq_stack; 231 unsigned long *irq_stack;
@@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
279 preempt_enable(); 269 preempt_enable();
280 270
281 printk(KERN_CONT "\n"); 271 printk(KERN_CONT "\n");
282 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, log_lvl);
283} 273}
284 274
285void show_registers(struct pt_regs *regs) 275void show_registers(struct pt_regs *regs)
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
308 298
309 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
310 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
311 regs->bp, KERN_EMERG); 301 KERN_EMERG);
312 302
313 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
314 304
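The bp parameter removed throughout these hunks is replaced by the new stack_frame() helper. Judging only from the open-coded logic deleted above, it presumably reduces to something like this sketch (the real definition lives in <asm/stacktrace.h> and is not shown in this diff):

/* Hedged reconstruction of stack_frame() from the deleted
 * CONFIG_FRAME_POINTER blocks above. */
static inline unsigned long stack_frame(struct task_struct *task,
					struct pt_regs *regs)
{
#ifdef CONFIG_FRAME_POINTER
	unsigned long bp;

	if (regs)
		return regs->bp;	/* callers used to pass regs->bp */
	if (task == current) {
		get_bp(bp);		/* grab bp right from our regs */
		return bp;
	}
	/* bp is the last register pushed by switch_to */
	return *(unsigned long *)task->thread.sp;
#else
	return 0;
#endif
}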
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1cbd54c0df99..5940282bd2f9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1184{ 1184{
1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1186 1186
1187 /* This is possible if op is under delayed unoptimizing */
1188 if (kprobe_disabled(&op->kp))
1189 return;
1190
1187 preempt_disable(); 1191 preempt_disable();
1188 if (kprobe_running()) { 1192 if (kprobe_running()) {
1189 kprobes_inc_nmissed_count(&op->kp); 1193 kprobes_inc_nmissed_count(&op->kp);
@@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1401 return 0; 1405 return 0;
1402} 1406}
1403 1407
1404/* Replace a breakpoint (int3) with a relative jump. */ 1408#define MAX_OPTIMIZE_PROBES 256
1405int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) 1409static struct text_poke_param *jump_poke_params;
1410static struct jump_poke_buffer {
1411 u8 buf[RELATIVEJUMP_SIZE];
1412} *jump_poke_bufs;
1413
1414static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1415 u8 *insn_buf,
1416 struct optimized_kprobe *op)
1406{ 1417{
1407 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1408 s32 rel = (s32)((long)op->optinsn.insn - 1418 s32 rel = (s32)((long)op->optinsn.insn -
1409 ((long)op->kp.addr + RELATIVEJUMP_SIZE)); 1419 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1410 1420
@@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1412 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, 1422 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1413 RELATIVE_ADDR_SIZE); 1423 RELATIVE_ADDR_SIZE);
1414 1424
1415 jmp_code[0] = RELATIVEJUMP_OPCODE; 1425 insn_buf[0] = RELATIVEJUMP_OPCODE;
1416 *(s32 *)(&jmp_code[1]) = rel; 1426 *(s32 *)(&insn_buf[1]) = rel;
1427
1428 tprm->addr = op->kp.addr;
1429 tprm->opcode = insn_buf;
1430 tprm->len = RELATIVEJUMP_SIZE;
1431}
1432
1433/*
1434 * Replace breakpoints (int3) with relative jumps.
1435 * Caller must hold kprobe_mutex and text_mutex.
1436 */
1437void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1438{
1439 struct optimized_kprobe *op, *tmp;
1440 int c = 0;
1441
1442 list_for_each_entry_safe(op, tmp, oplist, list) {
1443 WARN_ON(kprobe_disabled(&op->kp));
1444 /* Setup param */
1445 setup_optimize_kprobe(&jump_poke_params[c],
1446 jump_poke_bufs[c].buf, op);
1447 list_del_init(&op->list);
1448 if (++c >= MAX_OPTIMIZE_PROBES)
1449 break;
1450 }
1417 1451
1418 /* 1452 /*
1419 * text_poke_smp doesn't support NMI/MCE code modifying. 1453 * text_poke_smp doesn't support NMI/MCE code modifying.
1420 * However, since kprobes itself also doesn't support NMI/MCE 1454 * However, since kprobes itself also doesn't support NMI/MCE
1421 * code probing, it's not a problem. 1455 * code probing, it's not a problem.
1422 */ 1456 */
1423 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); 1457 text_poke_smp_batch(jump_poke_params, c);
1424 return 0; 1458}
1459
1460static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1461 u8 *insn_buf,
1462 struct optimized_kprobe *op)
1463{
1464 /* Set int3 to first byte for kprobes */
1465 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1466 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1467
1468 tprm->addr = op->kp.addr;
1469 tprm->opcode = insn_buf;
1470 tprm->len = RELATIVEJUMP_SIZE;
1471}
1472
1473/*
1474 * Recover original instructions and breakpoints from relative jumps.
1475 * Caller must hold kprobe_mutex.
1476 */
1477extern void arch_unoptimize_kprobes(struct list_head *oplist,
1478 struct list_head *done_list)
1479{
1480 struct optimized_kprobe *op, *tmp;
1481 int c = 0;
1482
1483 list_for_each_entry_safe(op, tmp, oplist, list) {
1484 /* Setup param */
1485 setup_unoptimize_kprobe(&jump_poke_params[c],
1486 jump_poke_bufs[c].buf, op);
1487 list_move(&op->list, done_list);
1488 if (++c >= MAX_OPTIMIZE_PROBES)
1489 break;
1490 }
1491
1492 /*
1493 * text_poke_smp doesn't support NMI/MCE code modifying.
1494 * However, since kprobes itself also doesn't support NMI/MCE
1495 * code probing, it's not a problem.
1496 */
1497 text_poke_smp_batch(jump_poke_params, c);
1425} 1498}
1426 1499
1427/* Replace a relative jump with a breakpoint (int3). */ 1500/* Replace a relative jump with a breakpoint (int3). */
@@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p,
1453 } 1526 }
1454 return 0; 1527 return 0;
1455} 1528}
1529
1530static int __kprobes init_poke_params(void)
1531{
1532 /* Allocate code buffer and parameter array */
1533 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1534 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1535 if (!jump_poke_bufs)
1536 return -ENOMEM;
1537
1538 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1539 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1540 if (!jump_poke_params) {
1541 kfree(jump_poke_bufs);
1542 jump_poke_bufs = NULL;
1543 return -ENOMEM;
1544 }
1545
1546 return 0;
1547}
1548#else /* !CONFIG_OPTPROBES */
1549static int __kprobes init_poke_params(void)
1550{
1551 return 0;
1552}
1456#endif 1553#endif
1457 1554
1458int __init arch_init_kprobes(void) 1555int __init arch_init_kprobes(void)
1459{ 1556{
1460 return 0; 1557 return init_poke_params();
1461} 1558}
1462 1559
1463int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1560int __kprobes arch_trampoline_kprobe(struct kprobe *p)
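The point of the batch interface above is to amortize synchronization: text_poke_smp() pays a full stop_machine-style rendezvous per call, so pushing up to MAX_OPTIMIZE_PROBES (256) probes through one text_poke_smp_batch() replaces 256 rendezvous with one. A hedged sketch of how a caller is meant to drive it (the real caller is the optimizer in kernel/kprobes.c; the locking shown is assumed):

static void optimize_pending_sketch(struct list_head *pending)
{
	mutex_lock(&kprobe_mutex);
	get_online_cpus();		/* keep CPUs from coming and going */
	mutex_lock(&text_mutex);
	arch_optimize_kprobes(pending);	/* one batched poke for the list */
	mutex_unlock(&text_mutex);
	put_online_cpus();
	mutex_unlock(&kprobe_mutex);
}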
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86aa..96ed1aac543a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -91,8 +91,7 @@ void exit_thread(void)
91void show_regs(struct pt_regs *regs) 91void show_regs(struct pt_regs *regs)
92{ 92{
93 show_registers(regs); 93 show_registers(regs);
94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
95 regs->bp);
96} 95}
97 96
98void show_regs_common(void) 97void show_regs_common(void)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 083e99d1b7df..68f61ac632e1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void)
281 */ 281 */
282 smp_store_cpu_info(cpuid); 282 smp_store_cpu_info(cpuid);
283 283
284 /*
285 * This must be done before setting cpu_online_mask
286 * or calling notify_cpu_starting.
287 */
288 set_cpu_sibling_map(raw_smp_processor_id());
289 wmb();
290
284 notify_cpu_starting(cpuid); 291 notify_cpu_starting(cpuid);
285 292
286 /* 293 /*
@@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
316 */ 323 */
317 check_tsc_sync_target(); 324 check_tsc_sync_target();
318 325
319 if (nmi_watchdog == NMI_IO_APIC) {
320 legacy_pic->mask(0);
321 enable_NMI_through_LVT0();
322 legacy_pic->unmask(0);
323 }
324
325 /* This must be done before setting cpu_online_mask */
326 set_cpu_sibling_map(raw_smp_processor_id());
327 wmb();
328
329 /* 326 /*
330 * We need to hold call_lock, so there is no inconsistency 327 * We need to hold call_lock, so there is no inconsistency
331 * between the time smp_call_function() determines number of 328 * between the time smp_call_function() determines number of
@@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
1061 printk(KERN_INFO "SMP mode deactivated.\n"); 1058 printk(KERN_INFO "SMP mode deactivated.\n");
1062 smpboot_clear_io_apic(); 1059 smpboot_clear_io_apic();
1063 1060
1064 localise_nmi_watchdog();
1065
1066 connect_bsp_APIC(); 1061 connect_bsp_APIC();
1067 setup_local_APIC(); 1062 setup_local_APIC();
1068 end_local_APIC_setup(); 1063 end_local_APIC_setup();
@@ -1196,7 +1191,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1196#ifdef CONFIG_X86_IO_APIC 1191#ifdef CONFIG_X86_IO_APIC
1197 setup_ioapic_dest(); 1192 setup_ioapic_dest();
1198#endif 1193#endif
1199 check_nmi_watchdog();
1200 mtrr_aps_init(); 1194 mtrr_aps_init();
1201} 1195}
1202 1196
@@ -1341,8 +1335,6 @@ int native_cpu_disable(void)
1341 if (cpu == 0) 1335 if (cpu == 0)
1342 return -EBUSY; 1336 return -EBUSY;
1343 1337
1344 if (nmi_watchdog == NMI_LOCAL_APIC)
1345 stop_apic_nmi_watchdog(NULL);
1346 clear_local_APIC(); 1338 clear_local_APIC();
1347 1339
1348 cpu_disable_common(); 1340 cpu_disable_common();
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b53c525368a7..938c8e10a19a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
73 */ 73 */
74void save_stack_trace(struct stack_trace *trace) 74void save_stack_trace(struct stack_trace *trace)
75{ 75{
76 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); 76 dump_trace(current, NULL, NULL, &save_stack_ops, trace);
77 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
78 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
79} 79}
80EXPORT_SYMBOL_GPL(save_stack_trace); 80EXPORT_SYMBOL_GPL(save_stack_trace);
81 81
82void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) 82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 83{
84 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); 84 dump_trace(current, regs, NULL, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 85 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 86 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 87}
88 88
89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
90{ 90{
91 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 91 dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
92 if (trace->nr_entries < trace->max_entries) 92 if (trace->nr_entries < trace->max_entries)
93 trace->entries[trace->nr_entries++] = ULONG_MAX; 93 trace->entries[trace->nr_entries++] = ULONG_MAX;
94} 94}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..25a28a245937 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,10 +22,6 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 26volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
31#endif 27#endif
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
63 /* Keep nmi watchdog up to date */ 59 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs); 60 inc_irq_stat(irq0_irqs);
65 61
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 raw_spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event); 62 global_clock_event->event_handler(global_clock_event);
81 63
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ 64 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index cb838ca42c96..f02c179c2552 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -398,15 +398,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
398 == NOTIFY_STOP) 398 == NOTIFY_STOP)
399 return; 399 return;
400 400
401#ifndef CONFIG_LOCKUP_DETECTOR
402 /*
403 * Ok, so this is none of the documented NMI sources,
404 * so it must be the NMI watchdog.
405 */
406 if (nmi_watchdog_tick(regs, reason))
407 return;
408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
410 unknown_nmi_error(reason, regs); 401 unknown_nmi_error(reason, regs);
411#else 402#else
412 unknown_nmi_error(reason, regs); 403 unknown_nmi_error(reason, regs);
@@ -446,14 +437,12 @@ do_nmi(struct pt_regs *regs, long error_code)
446 437
447void stop_nmi(void) 438void stop_nmi(void)
448{ 439{
449 acpi_nmi_disable();
450 ignore_nmis++; 440 ignore_nmis++;
451} 441}
452 442
453void restart_nmi(void) 443void restart_nmi(void)
454{ 444{
455 ignore_nmis--; 445 ignore_nmis--;
456 acpi_nmi_enable();
457} 446}
458 447
459/* May run on IST stack. */ 448/* May run on IST stack. */
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
185 e->trace.entries = e->trace_entries; 185 e->trace.entries = e->trace_entries;
186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries); 186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
187 e->trace.skip = 0; 187 e->trace.skip = 0;
188 save_stack_trace_bp(&e->trace, regs->bp); 188 save_stack_trace_regs(&e->trace, regs);
189 189
190 /* Round address down to nearest 16 bytes */ 190 /* Round address down to nearest 16 bytes */
191 shadow_copy = kmemcheck_shadow_lookup(address 191 shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a36..72cbec14d783 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
126 if (!user_mode_vm(regs)) { 126 if (!user_mode_vm(regs)) {
127 unsigned long stack = kernel_stack_pointer(regs); 127 unsigned long stack = kernel_stack_pointer(regs);
128 if (depth) 128 if (depth)
129 dump_trace(NULL, regs, (unsigned long *)stack, 0, 129 dump_trace(NULL, regs, (unsigned long *)stack,
130 &backtrace_ops, &depth); 130 &backtrace_ops, &depth);
131 return; 131 return;
132 } 132 }
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index e3ecb71b5790..0636dd93cef8 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -58,9 +58,6 @@ static void timer_stop(void)
58 58
59int __init op_nmi_timer_init(struct oprofile_operations *ops) 59int __init op_nmi_timer_init(struct oprofile_operations *ops)
60{ 60{
61 if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
62 return -ENODEV;
63
64 ops->start = timer_start; 61 ops->start = timer_start;
65 ops->stop = timer_stop; 62 ops->stop = timer_stop;
66 ops->cpu_type = "timer"; 63 ops->cpu_type = "timer";
diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c
index 660a2728908d..0cac7ec0d2ec 100644
--- a/drivers/acpi/acpica/nsinit.c
+++ b/drivers/acpi/acpica/nsinit.c
@@ -577,9 +577,7 @@ acpi_ns_init_one_device(acpi_handle obj_handle,
577 * as possible (without an NMI being received in the middle of 577 * as possible (without an NMI being received in the middle of
578 * this) - so disable NMIs and initialize the device: 578 * this) - so disable NMIs and initialize the device:
579 */ 579 */
580 acpi_nmi_disable();
581 status = acpi_ns_evaluate(info); 580 status = acpi_ns_evaluate(info);
582 acpi_nmi_enable();
583 581
584 if (ACPI_SUCCESS(status)) { 582 if (ACPI_SUCCESS(status)) {
585 walk_info->num_INI++; 583 walk_info->num_INI++;
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 3d77116e4634..c19f4a20794a 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -649,12 +649,7 @@ static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
649 * If nmi_watchdog is turned off then we can turn on 649 * If nmi_watchdog is turned off then we can turn on
650 * our nmi decoding capability. 650 * our nmi decoding capability.
651 */ 651 */
652 if (!nmi_watchdog_active()) 652 hpwdt_nmi_decoding = 1;
653 hpwdt_nmi_decoding = 1;
654 else
655 dev_warn(&dev->dev, "NMI decoding is disabled. To enable this "
656 "functionality you must reboot with nmi_watchdog=0 "
657 "and load the hpwdt driver with priority=1.\n");
658} 653}
659#else 654#else
660static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev) 655static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 8beabb958f61..725bf6bd39f7 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -154,12 +154,14 @@ enum {
154 TRACE_EVENT_FL_ENABLED_BIT, 154 TRACE_EVENT_FL_ENABLED_BIT,
155 TRACE_EVENT_FL_FILTERED_BIT, 155 TRACE_EVENT_FL_FILTERED_BIT,
156 TRACE_EVENT_FL_RECORDED_CMD_BIT, 156 TRACE_EVENT_FL_RECORDED_CMD_BIT,
157 TRACE_EVENT_FL_CAP_ANY_BIT,
157}; 158};
158 159
159enum { 160enum {
160 TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), 161 TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT),
161 TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), 162 TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
162 TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), 163 TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
164 TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
163}; 165};
164 166
165struct ftrace_event_call { 167struct ftrace_event_call {
@@ -196,6 +198,14 @@ struct ftrace_event_call {
196#endif 198#endif
197}; 199};
198 200
201#define __TRACE_EVENT_FLAGS(name, value) \
202 static int __init trace_init_flags_##name(void) \
203 { \
204 event_##name.flags = value; \
205 return 0; \
206 } \
207 early_initcall(trace_init_flags_##name);
208
199#define PERF_MAX_TRACE_SIZE 2048 209#define PERF_MAX_TRACE_SIZE 2048
200 210
201#define MAX_FILTER_PRED 32 211#define MAX_FILTER_PRED 32
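Written out by hand, __TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY), the invocation the syscall tracepoint headers pick up later in this diff, expands to:

static int __init trace_init_flags_sys_enter(void)
{
	event_sys_enter.flags = TRACE_EVENT_FL_CAP_ANY;
	return 0;
}
early_initcall(trace_init_flags_sys_enter);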
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e7d1b2e0070d..b78edb58ee66 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -275,7 +275,9 @@ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
275extern int arch_check_optimized_kprobe(struct optimized_kprobe *op); 275extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
276extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op); 276extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
277extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op); 277extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
278extern int arch_optimize_kprobe(struct optimized_kprobe *op); 278extern void arch_optimize_kprobes(struct list_head *oplist);
279extern void arch_unoptimize_kprobes(struct list_head *oplist,
280 struct list_head *done_list);
279extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); 281extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
280extern kprobe_opcode_t *get_optinsn_slot(void); 282extern kprobe_opcode_t *get_optinsn_slot(void);
281extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); 283extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 06aab5eee134..1c451e6ecc17 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -16,10 +16,7 @@
16 */ 16 */
17#ifdef ARCH_HAS_NMI_WATCHDOG 17#ifdef ARCH_HAS_NMI_WATCHDOG
18#include <asm/nmi.h> 18#include <asm/nmi.h>
19extern void touch_nmi_watchdog(void); 19#endif
20extern void acpi_nmi_disable(void);
21extern void acpi_nmi_enable(void);
22#else
23#ifndef CONFIG_HARDLOCKUP_DETECTOR 20#ifndef CONFIG_HARDLOCKUP_DETECTOR
24static inline void touch_nmi_watchdog(void) 21static inline void touch_nmi_watchdog(void)
25{ 22{
@@ -28,9 +25,6 @@ static inline void touch_nmi_watchdog(void)
28#else 25#else
29extern void touch_nmi_watchdog(void); 26extern void touch_nmi_watchdog(void);
30#endif 27#endif
31static inline void acpi_nmi_disable(void) { }
32static inline void acpi_nmi_enable(void) { }
33#endif
34 28
35/* 29/*
36 * Create trigger_all_cpu_backtrace() out of the arch-provided 30 * Create trigger_all_cpu_backtrace() out of the arch-provided
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index de2c41758e29..adf6d9931643 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -758,6 +758,8 @@ struct perf_event {
758 u64 shadow_ctx_time; 758 u64 shadow_ctx_time;
759 759
760 struct perf_event_attr attr; 760 struct perf_event_attr attr;
761 u16 header_size;
762 u16 read_size;
761 struct hw_perf_event hw; 763 struct hw_perf_event hw;
762 764
763 struct perf_event_context *ctx; 765 struct perf_event_context *ctx;
@@ -969,6 +971,11 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
969 struct perf_sample_data *data, 971 struct perf_sample_data *data,
970 struct pt_regs *regs); 972 struct pt_regs *regs);
971 973
974static inline bool is_sampling_event(struct perf_event *event)
975{
976 return event->attr.sample_period != 0;
977}
978
972/* 979/*
973 * Return 1 for a software event, 0 for a hardware event 980 * Return 1 for a software event, 0 for a hardware event
974 */ 981 */
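is_sampling_event() centralizes the "does this event have a sample period" test; the x86 perf_event.c hunk earlier in this diff swaps a raw !hwc->sample_period check for it. A minimal sketch of that pattern (the helper name here is illustrative):

/* Counting events (sample_period == 0) get the PMU's max period so
 * they effectively never interrupt; sketch of the pattern the x86
 * hunk switches to. */
static void setup_period_sketch(struct perf_event *event, u64 max_period)
{
	struct hw_perf_event *hwc = &event->hw;

	if (!is_sampling_event(event)) {
		hwc->sample_period = max_period;
		hwc->last_period   = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}
}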
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c79e921a68b..d2e63d1e725c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -316,6 +316,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
316 size_t *lenp, loff_t *ppos); 316 size_t *lenp, loff_t *ppos);
317extern unsigned int softlockup_panic; 317extern unsigned int softlockup_panic;
318extern int softlockup_thresh; 318extern int softlockup_thresh;
319void lockup_detector_init(void);
319#else 320#else
320static inline void touch_softlockup_watchdog(void) 321static inline void touch_softlockup_watchdog(void)
321{ 322{
@@ -326,6 +327,9 @@ static inline void touch_softlockup_watchdog_sync(void)
326static inline void touch_all_softlockup_watchdogs(void) 327static inline void touch_all_softlockup_watchdogs(void)
327{ 328{
328} 329}
330static inline void lockup_detector_init(void)
331{
332}
329#endif 333#endif
330 334
331#ifdef CONFIG_DETECT_HUNG_TASK 335#ifdef CONFIG_DETECT_HUNG_TASK
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 51efbef38fb0..25310f1d7f37 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -2,6 +2,7 @@
2#define __LINUX_STACKTRACE_H 2#define __LINUX_STACKTRACE_H
3 3
4struct task_struct; 4struct task_struct;
5struct pt_regs;
5 6
6#ifdef CONFIG_STACKTRACE 7#ifdef CONFIG_STACKTRACE
7struct task_struct; 8struct task_struct;
@@ -13,7 +14,8 @@ struct stack_trace {
13}; 14};
14 15
15extern void save_stack_trace(struct stack_trace *trace); 16extern void save_stack_trace(struct stack_trace *trace);
16extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp); 17extern void save_stack_trace_regs(struct stack_trace *trace,
18 struct pt_regs *regs);
17extern void save_stack_trace_tsk(struct task_struct *tsk, 19extern void save_stack_trace_tsk(struct task_struct *tsk,
18 struct stack_trace *trace); 20 struct stack_trace *trace);
19 21
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cacc27a0e285..18cd0684fc4e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -127,8 +127,6 @@ extern struct trace_event_functions exit_syscall_print_funcs;
127#define SYSCALL_TRACE_ENTER_EVENT(sname) \ 127#define SYSCALL_TRACE_ENTER_EVENT(sname) \
128 static struct syscall_metadata \ 128 static struct syscall_metadata \
129 __attribute__((__aligned__(4))) __syscall_meta_##sname; \ 129 __attribute__((__aligned__(4))) __syscall_meta_##sname; \
130 static struct ftrace_event_call \
131 __attribute__((__aligned__(4))) event_enter_##sname; \
132 static struct ftrace_event_call __used \ 130 static struct ftrace_event_call __used \
133 __attribute__((__aligned__(4))) \ 131 __attribute__((__aligned__(4))) \
134 __attribute__((section("_ftrace_events"))) \ 132 __attribute__((section("_ftrace_events"))) \
@@ -137,13 +135,12 @@ extern struct trace_event_functions exit_syscall_print_funcs;
137 .class = &event_class_syscall_enter, \ 135 .class = &event_class_syscall_enter, \
138 .event.funcs = &enter_syscall_print_funcs, \ 136 .event.funcs = &enter_syscall_print_funcs, \
139 .data = (void *)&__syscall_meta_##sname,\ 137 .data = (void *)&__syscall_meta_##sname,\
140 } 138 }; \
139 __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
141 140
142#define SYSCALL_TRACE_EXIT_EVENT(sname) \ 141#define SYSCALL_TRACE_EXIT_EVENT(sname) \
143 static struct syscall_metadata \ 142 static struct syscall_metadata \
144 __attribute__((__aligned__(4))) __syscall_meta_##sname; \ 143 __attribute__((__aligned__(4))) __syscall_meta_##sname; \
145 static struct ftrace_event_call \
146 __attribute__((__aligned__(4))) event_exit_##sname; \
147 static struct ftrace_event_call __used \ 144 static struct ftrace_event_call __used \
148 __attribute__((__aligned__(4))) \ 145 __attribute__((__aligned__(4))) \
149 __attribute__((section("_ftrace_events"))) \ 146 __attribute__((section("_ftrace_events"))) \
@@ -152,7 +149,8 @@ extern struct trace_event_functions exit_syscall_print_funcs;
152 .class = &event_class_syscall_exit, \ 149 .class = &event_class_syscall_exit, \
153 .event.funcs = &exit_syscall_print_funcs, \ 150 .event.funcs = &exit_syscall_print_funcs, \
154 .data = (void *)&__syscall_meta_##sname,\ 151 .data = (void *)&__syscall_meta_##sname,\
155 } 152 }; \
153 __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
156 154
157#define SYSCALL_METADATA(sname, nb) \ 155#define SYSCALL_METADATA(sname, nb) \
158 SYSCALL_TRACE_ENTER_EVENT(sname); \ 156 SYSCALL_TRACE_ENTER_EVENT(sname); \
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index a4a90b6726ce..5a6074fcd81d 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -234,6 +234,8 @@ do_trace: \
234 PARAMS(void *__data, proto), \ 234 PARAMS(void *__data, proto), \
235 PARAMS(__data, args)) 235 PARAMS(__data, args))
236 236
237#define TRACE_EVENT_FLAGS(event, flag)
238
237#endif /* DECLARE_TRACE */ 239#endif /* DECLARE_TRACE */
238 240
239#ifndef TRACE_EVENT 241#ifndef TRACE_EVENT
@@ -354,4 +356,6 @@ do_trace: \
354 assign, print, reg, unreg) \ 356 assign, print, reg, unreg) \
355 DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) 357 DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
356 358
359#define TRACE_EVENT_FLAGS(event, flag)
360
357#endif /* ifdef TRACE_EVENT (see note above) */ 361#endif /* ifdef TRACE_EVENT (see note above) */
diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h
index fb726ac7caee..5a4c04a75b3d 100644
--- a/include/trace/events/syscalls.h
+++ b/include/trace/events/syscalls.h
@@ -40,6 +40,8 @@ TRACE_EVENT_FN(sys_enter,
40 syscall_regfunc, syscall_unregfunc 40 syscall_regfunc, syscall_unregfunc
41); 41);
42 42
43TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY)
44
43TRACE_EVENT_FN(sys_exit, 45TRACE_EVENT_FN(sys_exit,
44 46
45 TP_PROTO(struct pt_regs *regs, long ret), 47 TP_PROTO(struct pt_regs *regs, long ret),
@@ -62,6 +64,8 @@ TRACE_EVENT_FN(sys_exit,
62 syscall_regfunc, syscall_unregfunc 64 syscall_regfunc, syscall_unregfunc
63); 65);
64 66
67TRACE_EVENT_FLAGS(sys_exit, TRACE_EVENT_FL_CAP_ANY)
68
65#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ 69#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
66 70
67#endif /* _TRACE_EVENTS_SYSCALLS_H */ 71#endif /* _TRACE_EVENTS_SYSCALLS_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a9377c0083ad..e718a917d897 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -82,6 +82,10 @@
82 TRACE_EVENT(name, PARAMS(proto), PARAMS(args), \ 82 TRACE_EVENT(name, PARAMS(proto), PARAMS(args), \
83 PARAMS(tstruct), PARAMS(assign), PARAMS(print)) \ 83 PARAMS(tstruct), PARAMS(assign), PARAMS(print)) \
84 84
85#undef TRACE_EVENT_FLAGS
86#define TRACE_EVENT_FLAGS(name, value) \
87 __TRACE_EVENT_FLAGS(name, value)
88
85#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 89#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
86 90
87 91
@@ -129,6 +133,9 @@
129#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ 133#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
130 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) 134 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
131 135
136#undef TRACE_EVENT_FLAGS
137#define TRACE_EVENT_FLAGS(event, flag)
138
132#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 139#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
133 140
134/* 141/*
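With the two definitions above, a trace header is processed in multiple passes: the first pass maps TRACE_EVENT_FLAGS onto __TRACE_EVENT_FLAGS to emit the flag setup, and the later pass turns it into a no-op so the remaining stages are unaffected. A hypothetical event header would tag itself like this (my_subsys_wakeup is illustrative, not from the patch):

TRACE_EVENT(my_subsys_wakeup,
	TP_PROTO(int cpu),
	TP_ARGS(cpu),
	TP_STRUCT__entry(__field(int, cpu)),
	TP_fast_assign(__entry->cpu = cpu;),
	TP_printk("cpu=%d", __entry->cpu)
);

/* Let unprivileged per-task perf sessions use this tracepoint */
TRACE_EVENT_FLAGS(my_subsys_wakeup, TRACE_EVENT_FL_CAP_ANY)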
diff --git a/init/main.c b/init/main.c
index 8646401f7a0e..261ad7b3fe0b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -882,6 +882,7 @@ static int __init kernel_init(void * unused)
882 smp_prepare_cpus(setup_max_cpus); 882 smp_prepare_cpus(setup_max_cpus);
883 883
884 do_pre_smp_initcalls(); 884 do_pre_smp_initcalls();
885 lockup_detector_init();
885 886
886 smp_init(); 887 smp_init();
887 sched_init_smp(); 888 sched_init_smp();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106f..7663e5df0e6f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
354 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
355} 355}
356 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
357/* 364/*
358 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
359 */ 366 */
360static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
361{ 368{
362 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
363 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
364} 371}
365 372
366#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
384 } 391 }
385} 392}
386 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
387/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
388static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
389{ 407{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
397 return 0; 415 return 0;
398} 416}
399 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423	/* If kprobe is not an aggr/opt probe, just return whether it is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
400/* 445/*
401 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
402 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
422 467
423/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
424static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
425 471
426static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
427static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
428#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
429 476
430/* Kprobe jump optimizer */ 477/*
431static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
432{ 482{
433 struct optimized_kprobe *op, *tmp; 483	/* Optimization is never done when disarmed */
434 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
435 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
436 mutex_lock(&module_mutex); 486 return;
437 mutex_lock(&kprobe_mutex);
438 if (kprobes_all_disarmed || !kprobes_allow_optimization)
439 goto end;
440
441 /*
442 * Wait for quiesence period to ensure all running interrupts
443 * are done. Because optprobe may modify multiple instructions
444 * there is a chance that Nth instruction is interrupted. In that
445 * case, running interrupt can return to 2nd-Nth byte of jump
446 * instruction. This wait is for avoiding it.
447 */
448 synchronize_sched();
449 487
450 /* 488 /*
451 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
459 */ 497 */
460 get_online_cpus(); 498 get_online_cpus();
461 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
462 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
463 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
464 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
465 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
466 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if needed) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513	/* Unoptimization must always be done (it cannot be skipped) */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
467 } 535 }
468 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
469 put_online_cpus(); 537 put_online_cpus();
470end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570	 * kprobes before waiting for the quiescence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575	 * Step 2: Wait for the quiescence period to ensure all running
576	 * interrupts are done. Because an optprobe may modify multiple
577	 * instructions, there is a chance that the Nth instruction is
578	 * interrupted. In that case, a running interrupt can return into
579	 * the 2nd-Nth byte of the jump instruction. This wait avoids that.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiesence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list);
588
471 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
472 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for optimization and unoptimization to complete */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
473} 605}
474 606
475/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
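Summarizing the five steps above as data flow between the three lists (a sketch, using only names from the patch):

/*
 * One kprobe_optimizer() invocation:
 *
 *   optimize_kprobe()   --> optimizing_list   --(step 3)--> jump patched in
 *   unoptimize_kprobe() --> unoptimizing_list --(step 1)--> free_list
 *   free_list --(step 2: synchronize_sched())--(step 4)--> kfree
 *
 * Step 5 re-arms the delayed work if either list was refilled meanwhile;
 * otherwise it wakes up wait_for_kprobe_optimizer() callers.
 */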
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
495 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
496 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
497 return; 629 return;
498
499 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
500 list_add(&op->list, &optimizing_list); 631
501 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
502 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
503} 649}
504 650
505/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
506static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
507{ 653{
508 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
509 655
510 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
511 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
512 if (!list_empty(&op->list)) 658
513 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664			 * Only if this kprobe is being unoptimized and the caller
665			 * forced it, unoptimize it forcibly. (No need to unoptimize
666			 * an already-unoptimized kprobe again :)
667 */
514 list_del_init(&op->list); 668 list_del_init(&op->list);
515 else 669 force_unoptimize_kprobe(op);
516 /* Replace jump with break */ 670 }
517 arch_unoptimize_kprobe(op); 671 return;
518 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
519 } 687 }
520} 688}
521 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697	 * An unused kprobe MUST be in the middle of delayed unoptimizing
698	 * (meaning there is still a relative jump in place) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
522/* Remove optimized instructions */ 711/* Remove optimized instructions */
523static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
524{ 713{
525 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
526 715
527 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
528 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
529 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
530 list_del_init(&op->list); 719 list_del_init(&op->list);
531 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
532 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
533 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
534 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
535} 724}
536 725
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
543 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
544} 733}
545 734
546/* Free optimized instructions and optimized_kprobe */
547static __kprobes void free_aggr_kprobe(struct kprobe *p)
548{
549 struct optimized_kprobe *op;
550
551 op = container_of(p, struct optimized_kprobe, kp);
552 arch_remove_optimized_kprobe(op);
553 kfree(op);
554}
555
556/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
557static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
558{ 737{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
587 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
588 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
589 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
590 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
591 return; 771 return;
592 } 772 }
593 773
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
631 return; 811 return;
632 812
633 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
634 printk(KERN_INFO "Kprobes globally unoptimized\n");
635 get_online_cpus(); /* For avoiding text_mutex deadlock */
636 mutex_lock(&text_mutex);
637 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
638 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
639 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
640 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
641 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
642 } 819 }
643 } 820 }
644 821 /* Wait for unoptimizing completion */
645 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
646 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
647 /* Allow all currently running kprobes to complete */
648 synchronize_sched();
649} 824}
650 825
651int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
669} 844}
670#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
671 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
672static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
673{ 849{
674 struct kprobe *old_p; 850 struct kprobe *_p;
675 851
676 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
677 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
678 if (unlikely(old_p)) 854 if (unlikely(_p))
679 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
680 857
681 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
682 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
683} 860}
684 861
685static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
686{ 864{
687 struct kprobe *old_p; 865 struct kprobe *_p;
688 866
689 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
690 arch_disarm_kprobe(p);
691 868
692 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
693 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
694 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
695 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876	/* TODO: reoptimize others after unoptimizing this probe */
696} 877}
697 878
698#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
699 880
700#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
701#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
702#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
703#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
704#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
705#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
706#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes; they cannot be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
707 897
708static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
709{ 899{
900 arch_remove_kprobe(p);
710 kfree(p); 901 kfree(p);
711} 902}
712 903
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
732/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
733static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
734{ 925{
735 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
736 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
737 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
738 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
739 put_online_cpus();
740} 930}
741 931
742/* 932/*
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
942 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
943 1133
944 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
945 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
946 1136
947 if (p->break_handler) { 1137 if (p->break_handler) {
948 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
993 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
994 * the intricacies 1184 * the intricacies
995 */ 1185 */
996static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
997 struct kprobe *p) 1187 struct kprobe *p)
998{ 1188{
999 int ret = 0; 1189 int ret = 0;
1000 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
1001 1191
1002 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1003 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1004 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1005 if (!ap) 1195 if (!ap)
1006 return -ENOMEM; 1196 return -ENOMEM;
1007 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1008 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1009 1201
1010 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1011 /* 1203 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1039 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1040} 1232}
1041 1233
1042/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1043static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1044{
1045 struct kprobe *kp;
1046
1047 list_for_each_entry_rcu(kp, &p->list, list) {
1048 if (!kprobe_disabled(kp))
1049 /*
1050 * There is an active probe on the list.
1051 * We can't disable aggr_kprobe.
1052 */
1053 return 0;
1054 }
1055 p->flags |= KPROBE_FLAG_DISABLED;
1056 return 1;
1057}
1058
1059static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1060{ 1235{
1061 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1098/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1099static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1100{ 1275{
1101 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1102 1277
1103 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1104 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1105 return NULL; 1280 return NULL;
1106 1281
1107 if (p != old_p) { 1282 if (p != ap) {
1108 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1109 if (list_p == p) 1284 if (list_p == p)
1110 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1111 goto valid; 1286 goto valid;
1112 return NULL; 1287 return NULL;
1113 } 1288 }
1114valid: 1289valid:
1115 return old_p; 1290 return ap;
1116} 1291}
1117 1292
1118/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1119static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1120{ 1295{
1121 int ret = 0; 1296 int ret = 0;
1122 struct kprobe *old_p;
1123 1297
1124 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1125 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1126 if (old_p)
1127 ret = -EINVAL; 1300 ret = -EINVAL;
1128 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1129 return ret; 1303 return ret;
1130} 1304}
1131 1305
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
1229} 1403}
1230EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1231 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
1422/* Disable one kprobe: must be called with kprobe_mutex held */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
1427 /* Get an original kprobe for return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1232/* 1447/*
1233 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1234 */ 1449 */
1235static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1236{ 1451{
1237 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1238 1453
1239 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1240 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1241 return -EINVAL; 1457 return -EINVAL;
1242 1458
1243 if (old_p == p || 1459 if (ap == p)
1244 (kprobe_aggrprobe(old_p) &&
1245 list_is_singular(&old_p->list))) {
1246 /* 1460 /*
1247 * Only probe on the hash list. Disarm only if kprobes are 1461 * This probe is an independent(and non-optimized) kprobe
1248 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1249 * already have been removed. We save on flushing icache.
1250 */ 1463 */
1251 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1252 disarm_kprobe(old_p); 1465
1253 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1254 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
1471 * !disarmed could be happen if the probe is under delayed
1472 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
1476 /* If disabling probe has special handlers, update aggrprobe */
1255 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1256 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1257 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1258 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1259 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1260 goto noclean; 1482 goto noclean;
1261 } 1483 }
1262 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1263 } 1485 }
1264noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1265 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1266 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1267 try_to_disable_aggr_kprobe(old_p); 1493 /*
1268 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1269 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1270 disarm_kprobe(old_p); 1496 */
1271 else 1497 optimize_kprobe(ap);
1272 /* Try to optimize this probe again */
1273 optimize_kprobe(old_p);
1274 }
1275 }
1276 } 1498 }
1277 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1278} 1505}
1279 1506
1280static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1281{ 1508{
1282 struct kprobe *old_p; 1509 struct kprobe *ap;
1283 1510
1284 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1285 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1286 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1287 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1288 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1289 list_del(&p->list); 1517 list_del(&p->list);
1290 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1291 free_aggr_kprobe(old_p);
1292 } 1519 }
1520 /* Otherwise, do nothing. */
1293} 1521}
1294 1522
1295int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1607int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1608{ 1836{
1609 int ret = 0; 1837 int ret = 0;
1610 struct kprobe *p;
1611 1838
1612 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1613 1840
1614 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1615 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1616 if (unlikely(p == NULL)) {
1617 ret = -EINVAL; 1843 ret = -EINVAL;
1618 goto out;
1619 }
1620 1844
1621 /* If the probe is already disabled (or gone), just return */
1622 if (kprobe_disabled(kp))
1623 goto out;
1624
1625 kp->flags |= KPROBE_FLAG_DISABLED;
1626 if (p != kp)
1627 /* When kp != p, p is always enabled. */
1628 try_to_disable_aggr_kprobe(p);
1629
1630 if (!kprobes_all_disarmed && kprobe_disabled(p))
1631 disarm_kprobe(p);
1632out:
1633 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1634 return ret; 1846 return ret;
1635} 1847}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1927 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1928 2140
1929 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1930 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1931 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1932 2146
1933 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1934 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1935 2149
1936 /*
1937 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1938 * because disarming may also unoptimize kprobes.
1939 */
1940 get_online_cpus();
1941 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1942 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1943 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1944 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1945 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1946 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1947 } 2156 }
1948 } 2157 }
1949
1950 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1951 put_online_cpus();
1952 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1953 /* Allow all currently running kprobes to complete */
1954 synchronize_sched();
1955 return;
1956 2160
1957already_disabled: 2161	/* Wait for the optimizer to disarm all kprobes */
1958 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1959 return;
1960} 2163}
1961 2164
1962/* 2165/*
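End to end, the reworked lifecycle can be exercised with an ordinary kprobes module: registration arms the probe and queues it on optimizing_list, the delayed worker patches in the jump after OPTIMIZE_DELAY, and unregistration now goes through the delayed unoptimizing_list instead of patching text synchronously. A minimal sketch (the probed symbol is illustrative):

#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* no post/break handler, so the probe stays optimizable */
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* illustrative target */
	.pre_handler	= demo_pre,
};

static int __init demo_init(void)
{
	/* Arms an int3; the optimizer may turn it into a jump later */
	return register_kprobe(&demo_kp);
}

static void __exit demo_exit(void)
{
	/* Queues delayed unoptimization instead of patching synchronously */
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");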
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eac7e3364335..aede71245e9f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -312,9 +312,75 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
312 ctx->nr_stat++; 312 ctx->nr_stat++;
313} 313}
314 314
315/*
316 * Called at perf_event creation and when events are attached/detached from a
317 * group.
318 */
319static void perf_event__read_size(struct perf_event *event)
320{
321 int entry = sizeof(u64); /* value */
322 int size = 0;
323 int nr = 1;
324
325 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
326 size += sizeof(u64);
327
328 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
329 size += sizeof(u64);
330
331 if (event->attr.read_format & PERF_FORMAT_ID)
332 entry += sizeof(u64);
333
334 if (event->attr.read_format & PERF_FORMAT_GROUP) {
335 nr += event->group_leader->nr_siblings;
336 size += sizeof(u64);
337 }
338
339 size += entry * nr;
340 event->read_size = size;
341}
342
343static void perf_event__header_size(struct perf_event *event)
344{
345 struct perf_sample_data *data;
346 u64 sample_type = event->attr.sample_type;
347 u16 size = 0;
348
349 perf_event__read_size(event);
350
351 if (sample_type & PERF_SAMPLE_IP)
352 size += sizeof(data->ip);
353
354 if (sample_type & PERF_SAMPLE_TID)
355 size += sizeof(data->tid_entry);
356
357 if (sample_type & PERF_SAMPLE_TIME)
358 size += sizeof(data->time);
359
360 if (sample_type & PERF_SAMPLE_ADDR)
361 size += sizeof(data->addr);
362
363 if (sample_type & PERF_SAMPLE_ID)
364 size += sizeof(data->id);
365
366 if (sample_type & PERF_SAMPLE_STREAM_ID)
367 size += sizeof(data->stream_id);
368
369 if (sample_type & PERF_SAMPLE_CPU)
370 size += sizeof(data->cpu_entry);
371
372 if (sample_type & PERF_SAMPLE_PERIOD)
373 size += sizeof(data->period);
374
375 if (sample_type & PERF_SAMPLE_READ)
376 size += event->read_size;
377
378 event->header_size = size;
379}
380
315static void perf_group_attach(struct perf_event *event) 381static void perf_group_attach(struct perf_event *event)
316{ 382{
317 struct perf_event *group_leader = event->group_leader; 383 struct perf_event *group_leader = event->group_leader, *pos;
318 384
319 /* 385 /*
320 * We can have double attach due to group movement in perf_event_open. 386 * We can have double attach due to group movement in perf_event_open.
@@ -333,6 +399,11 @@ static void perf_group_attach(struct perf_event *event)
333 399
334 list_add_tail(&event->group_entry, &group_leader->sibling_list); 400 list_add_tail(&event->group_entry, &group_leader->sibling_list);
335 group_leader->nr_siblings++; 401 group_leader->nr_siblings++;
402
403 perf_event__header_size(group_leader);
404
405 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
406 perf_event__header_size(pos);
336} 407}
337 408
338/* 409/*
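perf_event__read_size() precomputes the byte size of the read() format the event will produce. As a worked example (hedged, userspace view only): with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED and two siblings, entry = 16 (value + id), nr = 3, and size = 8 (time_enabled) + 8 (the group's nr field) + 16 * 3 = 64 bytes, matching the struct a reader would parse:

#include <stdint.h>
#include <stdio.h>

struct read_entry {
	uint64_t value;
	uint64_t id;		/* PERF_FORMAT_ID: +8 per entry */
};

struct group_read {
	uint64_t nr;			/* PERF_FORMAT_GROUP: +8 */
	uint64_t time_enabled;		/* TOTAL_TIME_ENABLED: +8 */
	struct read_entry cnt[3];	/* leader + 2 siblings */
};

int main(void)
{
	/* Matches perf_event__read_size(): 8 + 8 + 16 * 3 = 64 */
	printf("%zu\n", sizeof(struct group_read));
	return 0;
}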
@@ -391,7 +462,7 @@ static void perf_group_detach(struct perf_event *event)
391 if (event->group_leader != event) { 462 if (event->group_leader != event) {
392 list_del_init(&event->group_entry); 463 list_del_init(&event->group_entry);
393 event->group_leader->nr_siblings--; 464 event->group_leader->nr_siblings--;
394 return; 465 goto out;
395 } 466 }
396 467
397 if (!list_empty(&event->group_entry)) 468 if (!list_empty(&event->group_entry))
@@ -410,6 +481,12 @@ static void perf_group_detach(struct perf_event *event)
410 /* Inherit group flags from the previous leader */ 481 /* Inherit group flags from the previous leader */
411 sibling->group_flags = event->group_flags; 482 sibling->group_flags = event->group_flags;
412 } 483 }
484
485out:
486 perf_event__header_size(event->group_leader);
487
488 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
489 perf_event__header_size(tmp);
413} 490}
414 491
415static inline int 492static inline int
@@ -1073,7 +1150,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1073 /* 1150 /*
1074 * not supported on inherited events 1151 * not supported on inherited events
1075 */ 1152 */
1076 if (event->attr.inherit) 1153 if (event->attr.inherit || !is_sampling_event(event))
1077 return -EINVAL; 1154 return -EINVAL;
1078 1155
1079 atomic_add(refresh, &event->event_limit); 1156 atomic_add(refresh, &event->event_limit);
@@ -2289,31 +2366,6 @@ static int perf_release(struct inode *inode, struct file *file)
2289 return perf_event_release_kernel(event); 2366 return perf_event_release_kernel(event);
2290} 2367}
2291 2368
2292static int perf_event_read_size(struct perf_event *event)
2293{
2294 int entry = sizeof(u64); /* value */
2295 int size = 0;
2296 int nr = 1;
2297
2298 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2299 size += sizeof(u64);
2300
2301 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2302 size += sizeof(u64);
2303
2304 if (event->attr.read_format & PERF_FORMAT_ID)
2305 entry += sizeof(u64);
2306
2307 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2308 nr += event->group_leader->nr_siblings;
2309 size += sizeof(u64);
2310 }
2311
2312 size += entry * nr;
2313
2314 return size;
2315}
2316
2317u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2369u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2318{ 2370{
2319 struct perf_event *child; 2371 struct perf_event *child;
@@ -2428,7 +2480,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2428 if (event->state == PERF_EVENT_STATE_ERROR) 2480 if (event->state == PERF_EVENT_STATE_ERROR)
2429 return 0; 2481 return 0;
2430 2482
2431 if (count < perf_event_read_size(event)) 2483 if (count < event->read_size)
2432 return -ENOSPC; 2484 return -ENOSPC;
2433 2485
2434 WARN_ON_ONCE(event->ctx->parent_ctx); 2486 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +2566,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2514 int ret = 0; 2566 int ret = 0;
2515 u64 value; 2567 u64 value;
2516 2568
2517 if (!event->attr.sample_period) 2569 if (!is_sampling_event(event))
2518 return -EINVAL; 2570 return -EINVAL;
2519 2571
2520 if (copy_from_user(&value, arg, sizeof(value))) 2572 if (copy_from_user(&value, arg, sizeof(value)))
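Several hunks here replace open-coded sample_period checks with is_sampling_event(). The helper itself is added in include/linux/perf_event.h (listed in the diffstat but not shown in this excerpt); presumably it is just:

static inline int is_sampling_event(struct perf_event *event)
{
	return event->attr.sample_period != 0;
}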
@@ -3606,59 +3658,34 @@ void perf_prepare_sample(struct perf_event_header *header,
3606 data->type = sample_type; 3658 data->type = sample_type;
3607 3659
3608 header->type = PERF_RECORD_SAMPLE; 3660 header->type = PERF_RECORD_SAMPLE;
3609 header->size = sizeof(*header); 3661 header->size = sizeof(*header) + event->header_size;
3610 3662
3611 header->misc = 0; 3663 header->misc = 0;
3612 header->misc |= perf_misc_flags(regs); 3664 header->misc |= perf_misc_flags(regs);
3613 3665
3614 if (sample_type & PERF_SAMPLE_IP) { 3666 if (sample_type & PERF_SAMPLE_IP)
3615 data->ip = perf_instruction_pointer(regs); 3667 data->ip = perf_instruction_pointer(regs);
3616 3668
3617 header->size += sizeof(data->ip);
3618 }
3619
3620 if (sample_type & PERF_SAMPLE_TID) { 3669 if (sample_type & PERF_SAMPLE_TID) {
3621 /* namespace issues */ 3670 /* namespace issues */
3622 data->tid_entry.pid = perf_event_pid(event, current); 3671 data->tid_entry.pid = perf_event_pid(event, current);
3623 data->tid_entry.tid = perf_event_tid(event, current); 3672 data->tid_entry.tid = perf_event_tid(event, current);
3624
3625 header->size += sizeof(data->tid_entry);
3626 } 3673 }
3627 3674
3628 if (sample_type & PERF_SAMPLE_TIME) { 3675 if (sample_type & PERF_SAMPLE_TIME)
3629 data->time = perf_clock(); 3676 data->time = perf_clock();
3630 3677
3631 header->size += sizeof(data->time); 3678 if (sample_type & PERF_SAMPLE_ID)
3632 }
3633
3634 if (sample_type & PERF_SAMPLE_ADDR)
3635 header->size += sizeof(data->addr);
3636
3637 if (sample_type & PERF_SAMPLE_ID) {
3638 data->id = primary_event_id(event); 3679 data->id = primary_event_id(event);
3639 3680
3640 header->size += sizeof(data->id); 3681 if (sample_type & PERF_SAMPLE_STREAM_ID)
3641 }
3642
3643 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3644 data->stream_id = event->id; 3682 data->stream_id = event->id;
3645 3683
3646 header->size += sizeof(data->stream_id);
3647 }
3648
3649 if (sample_type & PERF_SAMPLE_CPU) { 3684 if (sample_type & PERF_SAMPLE_CPU) {
3650 data->cpu_entry.cpu = raw_smp_processor_id(); 3685 data->cpu_entry.cpu = raw_smp_processor_id();
3651 data->cpu_entry.reserved = 0; 3686 data->cpu_entry.reserved = 0;
3652
3653 header->size += sizeof(data->cpu_entry);
3654 } 3687 }
3655 3688
3656 if (sample_type & PERF_SAMPLE_PERIOD)
3657 header->size += sizeof(data->period);
3658
3659 if (sample_type & PERF_SAMPLE_READ)
3660 header->size += perf_event_read_size(event);
3661
3662 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3689 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3663 int size = 1; 3690 int size = 1;
3664 3691
@@ -3726,7 +3753,7 @@ perf_event_read_event(struct perf_event *event,
3726 .header = { 3753 .header = {
3727 .type = PERF_RECORD_READ, 3754 .type = PERF_RECORD_READ,
3728 .misc = 0, 3755 .misc = 0,
3729 .size = sizeof(read_event) + perf_event_read_size(event), 3756 .size = sizeof(read_event) + event->read_size,
3730 }, 3757 },
3731 .pid = perf_event_pid(event, task), 3758 .pid = perf_event_pid(event, task),
3732 .tid = perf_event_tid(event, task), 3759 .tid = perf_event_tid(event, task),
@@ -4240,6 +4267,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4240 struct hw_perf_event *hwc = &event->hw; 4267 struct hw_perf_event *hwc = &event->hw;
4241 int ret = 0; 4268 int ret = 0;
4242 4269
4270 /*
4271 * Non-sampling counters might still use the PMI to fold short
4272 * hardware counters, ignore those.
4273 */
4274 if (unlikely(!is_sampling_event(event)))
4275 return 0;
4276
4243 if (!throttle) { 4277 if (!throttle) {
4244 hwc->interrupts++; 4278 hwc->interrupts++;
4245 } else { 4279 } else {
@@ -4385,7 +4419,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4385 if (!regs) 4419 if (!regs)
4386 return; 4420 return;
4387 4421
4388 if (!hwc->sample_period) 4422 if (!is_sampling_event(event))
4389 return; 4423 return;
4390 4424
4391 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4425 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4548,7 +4582,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4548 struct hw_perf_event *hwc = &event->hw; 4582 struct hw_perf_event *hwc = &event->hw;
4549 struct hlist_head *head; 4583 struct hlist_head *head;
4550 4584
4551 if (hwc->sample_period) { 4585 if (is_sampling_event(event)) {
4552 hwc->last_period = hwc->sample_period; 4586 hwc->last_period = hwc->sample_period;
4553 perf_swevent_set_period(event); 4587 perf_swevent_set_period(event);
4554 } 4588 }
@@ -4805,15 +4839,6 @@ static int perf_tp_event_init(struct perf_event *event)
4805 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4839 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4806 return -ENOENT; 4840 return -ENOENT;
4807 4841
4808 /*
4809 * Raw tracepoint data is a severe data leak, only allow root to
4810 * have these.
4811 */
4812 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4813 perf_paranoid_tracepoint_raw() &&
4814 !capable(CAP_SYS_ADMIN))
4815 return -EPERM;
4816
4817 err = perf_trace_init(event); 4842 err = perf_trace_init(event);
4818 if (err) 4843 if (err)
4819 return err; 4844 return err;
@@ -4926,31 +4951,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4926static void perf_swevent_start_hrtimer(struct perf_event *event) 4951static void perf_swevent_start_hrtimer(struct perf_event *event)
4927{ 4952{
4928 struct hw_perf_event *hwc = &event->hw; 4953 struct hw_perf_event *hwc = &event->hw;
4954 s64 period;
4955
4956 if (!is_sampling_event(event))
4957 return;
4929 4958
4930 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4959 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4931 hwc->hrtimer.function = perf_swevent_hrtimer; 4960 hwc->hrtimer.function = perf_swevent_hrtimer;
4932 if (hwc->sample_period) {
4933 s64 period = local64_read(&hwc->period_left);
4934 4961
4935 if (period) { 4962 period = local64_read(&hwc->period_left);
4936 if (period < 0) 4963 if (period) {
4937 period = 10000; 4964 if (period < 0)
4965 period = 10000;
4938 4966
4939 local64_set(&hwc->period_left, 0); 4967 local64_set(&hwc->period_left, 0);
4940 } else { 4968 } else {
4941 period = max_t(u64, 10000, hwc->sample_period); 4969 period = max_t(u64, 10000, hwc->sample_period);
4942 } 4970 }
4943 __hrtimer_start_range_ns(&hwc->hrtimer, 4971 __hrtimer_start_range_ns(&hwc->hrtimer,
4944 ns_to_ktime(period), 0, 4972 ns_to_ktime(period), 0,
4945 HRTIMER_MODE_REL_PINNED, 0); 4973 HRTIMER_MODE_REL_PINNED, 0);
4946 }
4947} 4974}
4948 4975
4949static void perf_swevent_cancel_hrtimer(struct perf_event *event) 4976static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4950{ 4977{
4951 struct hw_perf_event *hwc = &event->hw; 4978 struct hw_perf_event *hwc = &event->hw;
4952 4979
4953 if (hwc->sample_period) { 4980 if (is_sampling_event(event)) {
4954 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 4981 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4955 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 4982 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4956 4983
@@ -5715,6 +5742,11 @@ SYSCALL_DEFINE5(perf_event_open,
5715 mutex_unlock(&current->perf_event_mutex); 5742 mutex_unlock(&current->perf_event_mutex);
5716 5743
5717 /* 5744 /*
5745 * Precalculate sample_data sizes
5746 */
5747 perf_event__header_size(event);
5748
5749 /*
5718 * Drop the reference on the group_event after placing the 5750 * Drop the reference on the group_event after placing the
5719 * new event on the sibling_list. This ensures destruction 5751 * new event on the sibling_list. This ensures destruction
5720 * of the group leader will find the pointer to itself in 5752 * of the group leader will find the pointer to itself in
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa1518554..cbd97da7a613 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -746,22 +746,6 @@ static struct ctl_table kern_table[] = {
746 .extra2 = &one, 746 .extra2 = &one,
747 }, 747 },
748#endif 748#endif
749#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
750 {
751 .procname = "unknown_nmi_panic",
752 .data = &unknown_nmi_panic,
753 .maxlen = sizeof (int),
754 .mode = 0644,
755 .proc_handler = proc_dointvec,
756 },
757 {
758 .procname = "nmi_watchdog",
759 .data = &nmi_watchdog_enabled,
760 .maxlen = sizeof (int),
761 .mode = 0644,
762 .proc_handler = proc_nmi_enabled,
763 },
764#endif
765#if defined(CONFIG_X86) 749#if defined(CONFIG_X86)
766 { 750 {
767 .procname = "panic_on_unrecovered_nmi", 751 .procname = "panic_on_unrecovered_nmi",
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670e..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
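The effect of perf_trace_event_perm() is that a non-root user may now open a PERF_SAMPLE_RAW tracepoint event, provided it is a per-task attach (PERF_ATTACH_TASK) and the tracepoint carries TRACE_EVENT_FL_CAP_ANY. A hedged userspace sketch (the event id is hypothetical; on a real system read it from /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/id):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = 21;		/* hypothetical tracepoint id */
	attr.sample_type = PERF_SAMPLE_RAW;
	attr.sample_period = 1;

	/* pid == 0, cpu == -1: per-task counter, i.e. PERF_ATTACH_TASK,
	 * so the TRACE_EVENT_FL_CAP_ANY path applies instead of
	 * requiring CAP_SYS_ADMIN */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	return fd < 0 ? 1 : 0;
}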
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024c..cad4e42060a9 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -547,13 +547,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
547 .notifier_call = cpu_callback 547 .notifier_call = cpu_callback
548}; 548};
549 549
550static int __init spawn_watchdog_task(void) 550void __init lockup_detector_init(void)
551{ 551{
552 void *cpu = (void *)(long)smp_processor_id(); 552 void *cpu = (void *)(long)smp_processor_id();
553 int err; 553 int err;
554 554
555 if (no_watchdog) 555 if (no_watchdog)
556 return 0; 556 return;
557 557
558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
559 WARN_ON(notifier_to_errno(err)); 559 WARN_ON(notifier_to_errno(err));
@@ -561,6 +561,5 @@ static int __init spawn_watchdog_task(void)
561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
562 register_cpu_notifier(&cpu_nfb); 562 register_cpu_notifier(&cpu_nfb);
563 563
564 return 0; 564 return;
565} 565}
566early_initcall(spawn_watchdog_task);
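With the early_initcall gone, kernel_init() calls lockup_detector_init() directly (see the init/main.c hunk earlier), after do_pre_smp_initcalls() but before smp_init(). The matching declaration presumably lands in one of the headers touched by this series (include/linux/nmi.h or include/linux/sched.h, neither hunk shown here); a sketch:

#ifdef CONFIG_LOCKUP_DETECTOR
extern void lockup_detector_init(void);
#else
static inline void lockup_detector_init(void)
{
}
#endif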
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 8509bb512935..bbbe584d4494 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -125,7 +125,9 @@ exuberant()
125 -I DEFINE_TRACE,EXPORT_TRACEPOINT_SYMBOL,EXPORT_TRACEPOINT_SYMBOL_GPL \ 125 -I DEFINE_TRACE,EXPORT_TRACEPOINT_SYMBOL,EXPORT_TRACEPOINT_SYMBOL_GPL \
126 --extra=+f --c-kinds=-px \ 126 --extra=+f --c-kinds=-px \
127 --regex-asm='/^ENTRY\(([^)]*)\).*/\1/' \ 127 --regex-asm='/^ENTRY\(([^)]*)\).*/\1/' \
128 --regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/' 128 --regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/' \
129 --regex-c++='/^TRACE_EVENT\(([^,)]*).*/trace_\1/' \
130 --regex-c++='/^DEFINE_EVENT\(([^,)]*).*/trace_\1/'
129 131
130 all_kconfigs | xargs $1 -a \ 132 all_kconfigs | xargs $1 -a \
131 --langdef=kconfig --language-force=kconfig \ 133 --langdef=kconfig --language-force=kconfig \
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index b2c63309a651..6f5a498608b2 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -24,12 +24,47 @@ OPTIONS
24--input=:: 24--input=::
25 Input file name. (default: perf.data) 25 Input file name. (default: perf.data)
26 26
27-d::
28--dsos=<dso[,dso...]>::
29 Only consider symbols in these dsos.
30-s::
31--symbol=<symbol>::
32 Symbol to annotate.
33
34-f::
35--force::
36 Don't complain, do it.
37
38-v::
39--verbose::
40 Be more verbose. (Show symbol address, etc)
41
42-D::
43--dump-raw-trace::
44 Dump raw trace in ASCII.
45
46-k::
47--vmlinux=<file>::
48 vmlinux pathname.
49
50-m::
51--modules::
52 Load module symbols. WARNING: use only with -k and LIVE kernel.
53
54-l::
55--print-line::
56 Print matching source lines (may be slow).
57
58-P::
59--full-paths::
60 Don't shorten the displayed pathnames.
61
27--stdio:: Use the stdio interface. 62--stdio:: Use the stdio interface.
28 63
29--tui:: Use the TUI interface. Use of --tui requires a tty; if one is not 64
30 present, as when piping to other commands, the stdio interface is 65 present, as when piping to other commands, the stdio interface is
31 used. This interface starts by centering on the line with more 66
32 samples, TAB/UNTAB cycles thru the lines with more samples. 67 samples, TAB/UNTAB cycles through the lines with more samples.
33 68
34SEE ALSO 69SEE ALSO
35-------- 70--------
diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt
index 01b642c0bf8f..5eaac6f26d51 100644
--- a/tools/perf/Documentation/perf-buildid-list.txt
+++ b/tools/perf/Documentation/perf-buildid-list.txt
@@ -18,6 +18,9 @@ perf report.
18 18
19OPTIONS 19OPTIONS
20------- 20-------
21-H::
22--with-hits::
23 Show only DSOs with hits.
21-i:: 24-i::
22--input=:: 25--input=::
23 Input file name. (default: perf.data) 26 Input file name. (default: perf.data)
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index 20d97d84ea1c..6a9ec2b35310 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -19,6 +19,18 @@ If no parameters are passed it will assume perf.data.old and perf.data.
19 19
20OPTIONS 20OPTIONS
21------- 21-------
22-M::
23--displacement::
24 Show position displacement relative to baseline.
25
26-D::
27--dump-raw-trace::
28 Dump raw trace in ASCII.
29
30-m::
31--modules::
32 Load module symbols. WARNING: use only with -k and LIVE kernel.
33
22-d:: 34-d::
23--dsos=:: 35--dsos=::
24 Only consider symbols in these dsos. CSV that understands 36 Only consider symbols in these dsos. CSV that understands
@@ -42,7 +54,7 @@ OPTIONS
42--field-separator=:: 54--field-separator=::
43 55
44 Use a special separator character and don't pad with spaces, replacing 56 Use a special separator character and don't pad with spaces, replacing
45 all occurances of this separator in symbol names (and other output) 57 all occurrences of this separator in symbol names (and other output)
46 with a '.' character, that thus it's the only non valid separator. 58 with a '.' character, that thus it's the only non valid separator.
47 59
48-v:: 60-v::
@@ -50,6 +62,11 @@ OPTIONS
50 Be verbose, for instance, show the raw counts in addition to the 62 Be verbose, for instance, show the raw counts in addition to the
51 diff. 63 diff.
52 64
65-f::
66--force::
67 Don't complain, do it.
68
69
53SEE ALSO 70SEE ALSO
54-------- 71--------
55linkperf:perf-record[1] 72linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt
index d004e19fe6d6..dd84cb2f0a88 100644
--- a/tools/perf/Documentation/perf-kvm.txt
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -22,7 +22,7 @@ There are a couple of variants of perf kvm:
22 a performance counter profile of guest os in realtime 22 a performance counter profile of guest os in realtime
23 of an arbitrary workload. 23 of an arbitrary workload.
24 24
25 'perf kvm record <command>' to record the performance couinter profile 25 'perf kvm record <command>' to record the performance counter profile
26 of an arbitrary workload and save it into a perf data file. If both 26 of an arbitrary workload and save it into a perf data file. If both
27 --host and --guest are input, the perf data file name is perf.data.kvm. 27 --host and --guest are input, the perf data file name is perf.data.kvm.
28 If there is no --host but --guest, the file name is perf.data.guest. 28 If there is no --host but --guest, the file name is perf.data.guest.
@@ -40,6 +40,12 @@ There are a couple of variants of perf kvm:
40 40
41OPTIONS 41OPTIONS
42------- 42-------
43-i::
44--input=::
45 Input file name.
46-o::
47--output::
48 Output file name.
43--host=:: 49--host=::
44 Collect host side performance profile. 50 Collect host side performance profile.
45--guest=:: 51--guest=::
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index b317102138c8..921de259ea10 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -24,6 +24,21 @@ and statistics with this 'perf lock' command.
24 24
25 'perf lock report' reports statistical data. 25 'perf lock report' reports statistical data.
26 26
27OPTIONS
28-------
29
30-i::
31--input=<file>::
32 Input file name.
33
34-v::
35--verbose::
36 Be more verbose (show symbol address, etc).
37
38-D::
39--dump-raw-trace::
40 Dump raw trace in ASCII.
41
27SEE ALSO 42SEE ALSO
28-------- 43--------
29linkperf:perf[1] 44linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 62de1b7f4e76..4e2323276984 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -115,7 +115,7 @@ Each probe argument follows below syntax.
115 115
116LINE SYNTAX 116LINE SYNTAX
117----------- 117-----------
118Line range is descripted by following syntax. 118Line range is described by the following syntax.
119 119
120 "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]" 120 "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]"
121 121
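A rough illustrative parser for the LINE SYNTAX grammar quoted above, in Python; the regex and the test specs are assumptions drawn from the grammar string, not perf's actual implementation:

----
import re

# "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]"
LINE_RANGE = re.compile(
    r"^(?P<func>[A-Za-z_]\w*)(?::(?P<rln>\d+)(?:\+(?P<num>\d+)|-(?P<rln2>\d+))?)?$"
    r"|^(?P<src>[\w./-]+):(?P<aln>\d+)(?:\+(?P<anum>\d+)|-(?P<aln2>\d+))?$"
)

for spec in ("schedule", "schedule:12+8", "kernel/sched.c:1200-1220"):
    m = LINE_RANGE.match(spec)
    print(spec, "->", m.groupdict() if m else "no match")
----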
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index a91f9f9e6e5c..0ad1bc75ab49 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -39,15 +39,24 @@ OPTIONS
39 be passed as follows: '\mem:addr[:[r][w][x]]'. 39 be passed as follows: '\mem:addr[:[r][w][x]]'.
40 If you want to profile read-write accesses in 0x1000, just set 40 If you want to profile read-write accesses in 0x1000, just set
41 'mem:0x1000:rw'. 41 'mem:0x1000:rw'.
42
43--filter=<filter>::
44 Event filter.
45
42-a:: 46-a::
43 System-wide collection. 47--all-cpus::
48 System-wide collection from all CPUs.
44 49
45-l:: 50-l::
46 Scale counter values. 51 Scale counter values.
47 52
48-p:: 53-p::
49--pid=:: 54--pid=::
50 Record events on existing pid. 55 Record events on existing process ID.
56
57-t::
58--tid=::
59 Record events on existing thread ID.
51 60
52-r:: 61-r::
53--realtime=:: 62--realtime=::
@@ -109,8 +118,8 @@ Collect raw sample records from all opened counters (default for tracepoint counters)
109 118
110-C:: 119-C::
111--cpu:: 120--cpu::
112Collect samples only on the list of cpus provided. Multiple CPUs can be provided as a 121Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
113comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. 122comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
114In per-thread mode with inheritance mode on (default), samples are captured only when 123In per-thread mode with inheritance mode on (default), samples are captured only when
115the thread executes on the designated CPUs. Default is to monitor all CPUs. 124the thread executes on the designated CPUs. Default is to monitor all CPUs.
116 125
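The CPU-list notation described above ('0,1' plus ranges such as '0-2') expands as follows; a small illustrative Python parser, not taken from perf's sources:

----
def parse_cpu_list(spec):
    # "0,1" -> [0, 1]; "0-2" -> [0, 1, 2]; mixtures also work.
    cpus = []
    for part in spec.split(","):
        if "-" in part:
            lo, hi = part.split("-")
            cpus.extend(range(int(lo), int(hi) + 1))
        else:
            cpus.append(int(part))
    return cpus

print(parse_cpu_list("0,1"), parse_cpu_list("0-2"))
----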
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 12052c9ed0ba..59a1f57f2bb7 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -20,6 +20,11 @@ OPTIONS
20-i:: 20-i::
21--input=:: 21--input=::
22 Input file name. (default: perf.data) 22 Input file name. (default: perf.data)
23
24-v::
25--verbose::
26 Be more verbose. (show symbol address, etc)
27
23-d:: 28-d::
24--dsos=:: 29--dsos=::
25 Only consider symbols in these dsos. CSV that understands 30 Only consider symbols in these dsos. CSV that understands
@@ -27,6 +32,10 @@ OPTIONS
27-n:: 32-n::
28--show-nr-samples:: 33--show-nr-samples::
29 Show the number of samples for each symbol 34 Show the number of samples for each symbol
35
36--showcpuutilization::
37 Show sample percentage for different cpu modes.
38
30-T:: 39-T::
31--threads:: 40--threads::
32 Show per-thread event counters 41 Show per-thread event counters
@@ -39,12 +48,24 @@ OPTIONS
39 Only consider these symbols. CSV that understands 48 Only consider these symbols. CSV that understands
40 file://filename entries. 49 file://filename entries.
41 50
51-U::
52--hide-unresolved::
53 Only display entries resolved to a symbol.
54
42-s:: 55-s::
43--sort=:: 56--sort=::
44 Sort by key(s): pid, comm, dso, symbol, parent. 57 Sort by key(s): pid, comm, dso, symbol, parent.
45 58
59-p::
60--parent=<regex>::
61 regex filter to identify parent, see: '--sort parent'
62
63-x::
64--exclude-other::
65 Only display entries with parent-match.
66
46-w:: 67-w::
47--field-width=:: 68--column-widths=<width[,width...]>::
48 Force each column width to the provided list, for large terminal 69 Force each column width to the provided list, for large terminal
49 readability. 70 readability.
50 71
@@ -52,19 +73,26 @@ OPTIONS
52--field-separator=:: 73--field-separator=::
53 74
54 Use a special separator character and don't pad with spaces, replacing 75 Use a special separator character and don't pad with spaces, replacing
55 all occurances of this separator in symbol names (and other output) 76 all occurrences of this separator in symbol names (and other output)
56 with a '.' character, which thus becomes the only non-valid separator. 77 with a '.' character, which thus becomes the only non-valid separator.
57 78
79-D::
80--dump-raw-trace::
81 Dump raw trace in ASCII.
82
58-g [type,min]:: 83-g [type,min]::
59--call-graph:: 84--call-graph::
60 Display callchains using type and min percent threshold. 85 Display call chains using type and min percent threshold.
61 type can be either: 86 type can be either:
62 - flat: single column, linear exposure of callchains. 87 - flat: single column, linear exposure of call chains.
63 - graph: use a graph tree, displaying absolute overhead rates. 88 - graph: use a graph tree, displaying absolute overhead rates.
64 - fractal: like graph, but displays relative rates. Each branch of 89 - fractal: like graph, but displays relative rates. Each branch of
65 the tree is considered as a new profiled object. + 90 the tree is considered as a new profiled object. +
66 Default: fractal,0.5. 91 Default: fractal,0.5.
67 92
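A toy illustration, with invented numbers, of how the 'graph' (absolute) and 'fractal' (relative) modes described above differ in what they print for the same call-chain data:

----
# A parent entry with 10% total overhead, split 6%/4% between two
# callees; the numbers are invented for illustration.
parent = 10.0
children = [6.0, 4.0]

graph_rates = children                                  # absolute: 6.0, 4.0
fractal_rates = [100.0 * c / parent for c in children]  # relative: 60.0, 40.0
print(graph_rates, fractal_rates)
----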
93--pretty=<key>::
94 Pretty printing style. key: normal, raw
95
68--stdio:: Use the stdio interface. 96--stdio:: Use the stdio interface.
69 97
70--tui:: Use the TUI interface, which is integrated with annotate and allows 98--tui:: Use the TUI interface, which is integrated with annotate and allows
@@ -72,6 +100,19 @@ OPTIONS
72 requires a tty, if one is not present, as when piping to other 100 requires a tty, if one is not present, as when piping to other
73 commands, the stdio interface is used. 101 commands, the stdio interface is used.
74 102
103-k::
104--vmlinux=<file>::
105 vmlinux pathname.
106
107-m::
108--modules::
109 Load module symbols. WARNING: This should only be used with -k and
110 a LIVE kernel.
111
112-f::
113--force::
114 Don't complain, do it.
115
75SEE ALSO 116SEE ALSO
76-------- 117--------
77linkperf:perf-stat[1] 118linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 8417644a6166..46822d5fde1c 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -8,11 +8,11 @@ perf-sched - Tool to trace/measure scheduler properties (latencies)
8SYNOPSIS 8SYNOPSIS
9-------- 9--------
10[verse] 10[verse]
11'perf sched' {record|latency|replay|trace} 11'perf sched' {record|latency|map|replay|trace}
12 12
13DESCRIPTION 13DESCRIPTION
14----------- 14-----------
15There are four variants of perf sched: 15There are five variants of perf sched:
16 16
17 'perf sched record <command>' to record the scheduling events 17 'perf sched record <command>' to record the scheduling events
18 of an arbitrary workload. 18 of an arbitrary workload.
@@ -30,8 +30,22 @@ There are four variants of perf sched:
30 of the workload as it occurred when it was recorded - and can repeat 30 of the workload as it occurred when it was recorded - and can repeat
31 it a number of times, measuring its performance.) 31 it a number of times, measuring its performance.)
32 32
33 'perf sched map' to print a textual context-switching outline of
34 workload captured via perf sched record. Columns stand for
35 individual CPUs, and the two-letter shortcuts stand for tasks that
36 are running on a CPU. A '*' denotes the CPU that had the event, and
37 a dot signals an idle CPU.
38
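An invented sample row, keyed to the legend above; do not expect this exact layout from any particular perf version:

----
  *A0   .    B1   .      29538.561150 secs   A0 => perf:1234
----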
33OPTIONS 39OPTIONS
34------- 40-------
41-i::
42--input=<file>::
43 Input file name. (default: perf.data)
44
45-v::
46--verbose::
47 Be more verbose. (show symbol address, etc)
48
35-D:: 49-D::
36--dump-raw-trace=:: 50--dump-raw-trace=::
37 Display verbose dump of the sched data. 51 Display verbose dump of the sched data.
diff --git a/tools/perf/Documentation/perf-trace-perl.txt b/tools/perf/Documentation/perf-script-perl.txt
index ee6525ee6d69..5bb41e55a3ac 100644
--- a/tools/perf/Documentation/perf-trace-perl.txt
+++ b/tools/perf/Documentation/perf-script-perl.txt
@@ -1,19 +1,19 @@
1perf-trace-perl(1) 1perf-script-perl(1)
2================== 2==================
3 3
4NAME 4NAME
5---- 5----
6perf-trace-perl - Process trace data with a Perl script 6perf-script-perl - Process trace data with a Perl script
7 7
8SYNOPSIS 8SYNOPSIS
9-------- 9--------
10[verse] 10[verse]
11'perf trace' [-s [Perl]:script[.pl] ] 11'perf script' [-s [Perl]:script[.pl] ]
12 12
13DESCRIPTION 13DESCRIPTION
14----------- 14-----------
15 15
16This perf trace option is used to process perf trace data using perf's 16This perf script option is used to process perf script data using perf's
17built-in Perl interpreter. It reads and processes the input file and 17built-in Perl interpreter. It reads and processes the input file and
18displays the results of the trace analysis implemented in the given 18displays the results of the trace analysis implemented in the given
19Perl script, if any. 19Perl script, if any.
@@ -21,7 +21,7 @@ Perl script, if any.
21STARTER SCRIPTS 21STARTER SCRIPTS
22--------------- 22---------------
23 23
24You can avoid reading the rest of this document by running 'perf trace 24You can avoid reading the rest of this document by running 'perf script
25-g perl' in the same directory as an existing perf.data trace file. 25-g perl' in the same directory as an existing perf.data trace file.
26That will generate a starter script containing a handler for each of 26That will generate a starter script containing a handler for each of
27the event types in the trace file; it simply prints every available 27the event types in the trace file; it simply prints every available
@@ -30,13 +30,13 @@ field for each event in the trace file.
30You can also look at the existing scripts in 30You can also look at the existing scripts in
31~/libexec/perf-core/scripts/perl for typical examples showing how to 31~/libexec/perf-core/scripts/perl for typical examples showing how to
32do basic things like aggregate event data, print results, etc. Also, 32do basic things like aggregate event data, print results, etc. Also,
33the check-perf-trace.pl script, while not interesting for its results, 33the check-perf-script.pl script, while not interesting for its results,
34attempts to exercise all of the main scripting features. 34attempts to exercise all of the main scripting features.
35 35
36EVENT HANDLERS 36EVENT HANDLERS
37-------------- 37--------------
38 38
39When perf trace is invoked using a trace script, a user-defined 39When perf script is invoked using a trace script, a user-defined
40'handler function' is called for each event in the trace. If there's 40'handler function' is called for each event in the trace. If there's
41no handler function defined for a given event type, the event is 41no handler function defined for a given event type, the event is
42ignored (or passed to a 'trace_unhandled' function, see below) and the 42ignored (or passed to a 'trace_unhandled' function, see below) and the
@@ -112,13 +112,13 @@ write a useful trace script. The sections below cover the rest.
112SCRIPT LAYOUT 112SCRIPT LAYOUT
113------------- 113-------------
114 114
115Every perf trace Perl script should start by setting up a Perl module 115Every perf script Perl script should start by setting up a Perl module
116search path and 'use'ing a few support modules (see module 116search path and 'use'ing a few support modules (see module
117descriptions below): 117descriptions below):
118 118
119---- 119----
120 use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib"; 120 use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/perf-script-Util/lib";
121 use lib "./Perf-Trace-Util/lib"; 121 use lib "./perf-script-Util/lib";
122 use Perf::Trace::Core; 122 use Perf::Trace::Core;
123 use Perf::Trace::Context; 123 use Perf::Trace::Context;
124 use Perf::Trace::Util; 124 use Perf::Trace::Util;
@@ -162,7 +162,7 @@ sub trace_unhandled
162---- 162----
163 163
164The remaining sections provide descriptions of each of the available 164The remaining sections provide descriptions of each of the available
165built-in perf trace Perl modules and their associated functions. 165built-in perf script Perl modules and their associated functions.
166 166
167AVAILABLE MODULES AND FUNCTIONS 167AVAILABLE MODULES AND FUNCTIONS
168------------------------------- 168-------------------------------
@@ -170,7 +170,7 @@ AVAILABLE MODULES AND FUNCTIONS
170The following sections describe the functions and variables available 170The following sections describe the functions and variables available
171via the various Perf::Trace::* Perl modules. To use the functions and 171via the various Perf::Trace::* Perl modules. To use the functions and
172variables from the given module, add the corresponding 'use 172variables from the given module, add the corresponding 'use
173Perf::Trace::XXX' line to your perf trace script. 173Perf::Trace::XXX' line to your perf script script.
174 174
175Perf::Trace::Core Module 175Perf::Trace::Core Module
176~~~~~~~~~~~~~~~~~~~~~~~~ 176~~~~~~~~~~~~~~~~~~~~~~~~
@@ -204,7 +204,7 @@ argument.
204Perf::Trace::Util Module 204Perf::Trace::Util Module
205~~~~~~~~~~~~~~~~~~~~~~~~ 205~~~~~~~~~~~~~~~~~~~~~~~~
206 206
207Various utility functions for use with perf trace: 207Various utility functions for use with perf script:
208 208
209 nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair 209 nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
210 nsecs_secs($nsecs) - returns whole secs portion given nsecs 210 nsecs_secs($nsecs) - returns whole secs portion given nsecs
@@ -214,4 +214,4 @@ Various utility functions for use with perf trace:
214 214
215SEE ALSO 215SEE ALSO
216-------- 216--------
217linkperf:perf-trace[1] 217linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-trace-python.txt b/tools/perf/Documentation/perf-script-python.txt
index 693be804dd3d..36b38277422c 100644
--- a/tools/perf/Documentation/perf-trace-python.txt
+++ b/tools/perf/Documentation/perf-script-python.txt
@@ -1,19 +1,19 @@
1perf-trace-python(1) 1perf-script-python(1)
2==================== 2====================
3 3
4NAME 4NAME
5---- 5----
6perf-trace-python - Process trace data with a Python script 6perf-script-python - Process trace data with a Python script
7 7
8SYNOPSIS 8SYNOPSIS
9-------- 9--------
10[verse] 10[verse]
11'perf trace' [-s [Python]:script[.py] ] 11'perf script' [-s [Python]:script[.py] ]
12 12
13DESCRIPTION 13DESCRIPTION
14----------- 14-----------
15 15
16This perf trace option is used to process perf trace data using perf's 16This perf script option is used to process perf script data using perf's
17built-in Python interpreter. It reads and processes the input file and 17built-in Python interpreter. It reads and processes the input file and
18displays the results of the trace analysis implemented in the given 18displays the results of the trace analysis implemented in the given
19Python script, if any. 19Python script, if any.
@@ -23,15 +23,15 @@ A QUICK EXAMPLE
23 23
24This section shows the process, start to finish, of creating a working 24This section shows the process, start to finish, of creating a working
25Python script that aggregates and extracts useful information from a 25Python script that aggregates and extracts useful information from a
26raw perf trace stream. You can avoid reading the rest of this 26raw perf script stream. You can avoid reading the rest of this
27document if an example is enough for you; the rest of the document 27document if an example is enough for you; the rest of the document
28provides more details on each step and lists the library functions 28provides more details on each step and lists the library functions
29available to script writers. 29available to script writers.
30 30
31This example actually details the steps that were used to create the 31This example actually details the steps that were used to create the
32'syscall-counts' script you see when you list the available perf trace 32'syscall-counts' script you see when you list the available perf script
33scripts via 'perf trace -l'. As such, this script also shows how to 33scripts via 'perf script -l'. As such, this script also shows how to
34integrate your script into the list of general-purpose 'perf trace' 34integrate your script into the list of general-purpose 'perf script'
35scripts listed by that command. 35scripts listed by that command.
36 36
37The syscall-counts script is a simple script, but demonstrates all the 37The syscall-counts script is a simple script, but demonstrates all the
@@ -105,31 +105,31 @@ That single stream will be recorded in a file in the current directory
105called perf.data. 105called perf.data.
106 106
107Once we have a perf.data file containing our data, we can use the -g 107Once we have a perf.data file containing our data, we can use the -g
108'perf trace' option to generate a Python script that will contain a 108'perf script' option to generate a Python script that will contain a
109callback handler for each event type found in the perf.data trace 109callback handler for each event type found in the perf.data trace
110stream (for more details, see the STARTER SCRIPTS section). 110stream (for more details, see the STARTER SCRIPTS section).
111 111
112---- 112----
113# perf trace -g python 113# perf script -g python
114generated Python script: perf-trace.py 114generated Python script: perf-script.py
115 115
116The output file created also in the current directory is named 116The output file created also in the current directory is named
117perf-trace.py. Here's the file in its entirety: 117perf-script.py. Here's the file in its entirety:
118 118
119# perf trace event handlers, generated by perf trace -g python 119# perf script event handlers, generated by perf script -g python
120# Licensed under the terms of the GNU GPL License version 2 120# Licensed under the terms of the GNU GPL License version 2
121 121
122# The common_* event handler fields are the most useful fields common to 122# The common_* event handler fields are the most useful fields common to
123# all events. They don't necessarily correspond to the 'common_*' fields 123# all events. They don't necessarily correspond to the 'common_*' fields
124# in the format files. Those fields not available as handler params can 124# in the format files. Those fields not available as handler params can
125# be retrieved using Python functions of the form common_*(context). 125# be retrieved using Python functions of the form common_*(context).
126# See the perf-trace-python Documentation for the list of available functions. 126# See the perf-script-python Documentation for the list of available functions.
127 127
128import os 128import os
129import sys 129import sys
130 130
131sys.path.append(os.environ['PERF_EXEC_PATH'] + \ 131sys.path.append(os.environ['PERF_EXEC_PATH'] + \
132 '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') 132 '/scripts/python/perf-script-Util/lib/Perf/Trace')
133 133
134from perf_trace_context import * 134from perf_trace_context import *
135from Core import * 135from Core import *
@@ -160,7 +160,7 @@ def print_header(event_name, cpu, secs, nsecs, pid, comm):
160---- 160----
161 161
162At the top is a comment block followed by some import statements and a 162At the top is a comment block followed by some import statements and a
163path append which every perf trace script should include. 163path append which every perf script script should include.
164 164
165Following that are a couple generated functions, trace_begin() and 165Following that are a couple generated functions, trace_begin() and
166trace_end(), which are called at the beginning and the end of the 166trace_end(), which are called at the beginning and the end of the
@@ -189,8 +189,8 @@ simply a utility function used for that purpose. Let's rename the
189script and run it to see the default output: 189script and run it to see the default output:
190 190
191---- 191----
192# mv perf-trace.py syscall-counts.py 192# mv perf-script.py syscall-counts.py
193# perf trace -s syscall-counts.py 193# perf script -s syscall-counts.py
194 194
195raw_syscalls__sys_enter 1 00840.847582083 7506 perf id=1, args= 195raw_syscalls__sys_enter 1 00840.847582083 7506 perf id=1, args=
196raw_syscalls__sys_enter 1 00840.847595764 7506 perf id=1, args= 196raw_syscalls__sys_enter 1 00840.847595764 7506 perf id=1, args=
@@ -216,7 +216,7 @@ import os
216import sys 216import sys
217 217
218sys.path.append(os.environ['PERF_EXEC_PATH'] + \ 218sys.path.append(os.environ['PERF_EXEC_PATH'] + \
219 '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') 219 '/scripts/python/perf-script-Util/lib/Perf/Trace')
220 220
221from perf_trace_context import * 221from perf_trace_context import *
222from Core import * 222from Core import *
@@ -279,7 +279,7 @@ import os
279import sys 279import sys
280 280
281sys.path.append(os.environ['PERF_EXEC_PATH'] + \ 281sys.path.append(os.environ['PERF_EXEC_PATH'] + \
282 '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') 282 '/scripts/python/perf-script-Util/lib/Perf/Trace')
283 283
284from perf_trace_context import * 284from perf_trace_context import *
285from Core import * 285from Core import *
@@ -315,7 +315,7 @@ def print_syscall_totals():
315 315
316The script can be run just as before: 316The script can be run just as before:
317 317
318 # perf trace -s syscall-counts.py 318 # perf script -s syscall-counts.py
319 319
320So those are the essential steps in writing and running a script. The 320So those are the essential steps in writing and running a script. The
321process can be generalized to any tracepoint or set of tracepoints 321process can be generalized to any tracepoint or set of tracepoints
@@ -324,17 +324,17 @@ interested in by looking at the list of available events shown by
324'perf list' and/or look in /sys/kernel/debug/tracing events for 324'perf list' and/or look in /sys/kernel/debug/tracing events for
325detailed event and field info, record the corresponding trace data 325detailed event and field info, record the corresponding trace data
326using 'perf record', passing it the list of interesting events, 326using 'perf record', passing it the list of interesting events,
327generate a skeleton script using 'perf trace -g python' and modify the 327generate a skeleton script using 'perf script -g python' and modify the
328code to aggregate and display it for your particular needs. 328code to aggregate and display it for your particular needs.
329 329
330After you've done that you may end up with a general-purpose script 330After you've done that you may end up with a general-purpose script
331that you want to keep around and have available for future use. By 331that you want to keep around and have available for future use. By
332writing a couple of very simple shell scripts and putting them in the 332writing a couple of very simple shell scripts and putting them in the
333right place, you can have your script listed alongside the other 333right place, you can have your script listed alongside the other
334scripts listed by the 'perf trace -l' command e.g.: 334scripts listed by the 'perf script -l' command e.g.:
335 335
336---- 336----
337root@tropicana:~# perf trace -l 337root@tropicana:~# perf script -l
338List of available trace scripts: 338List of available trace scripts:
339 workqueue-stats workqueue stats (ins/exe/create/destroy) 339 workqueue-stats workqueue stats (ins/exe/create/destroy)
340 wakeup-latency system-wide min/max/avg wakeup latency 340 wakeup-latency system-wide min/max/avg wakeup latency
@@ -365,14 +365,14 @@ perf record -a -e raw_syscalls:sys_enter
365The 'report' script is also a shell script with the same base name as 365The 'report' script is also a shell script with the same base name as
366your script, but with -report appended. It should also be located in 366your script, but with -report appended. It should also be located in
367the perf/scripts/python/bin directory. In that script, you write the 367the perf/scripts/python/bin directory. In that script, you write the
368'perf trace -s' command-line needed for running your script: 368'perf script -s' command-line needed for running your script:
369 369
370---- 370----
371# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report 371# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
372 372
373#!/bin/bash 373#!/bin/bash
374# description: system-wide syscall counts 374# description: system-wide syscall counts
375perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts.py 375perf script -s ~/libexec/perf-core/scripts/python/syscall-counts.py
376---- 376----
377 377
378Note that the location of the Python script given in the shell script 378Note that the location of the Python script given in the shell script
@@ -390,17 +390,17 @@ total 32
390drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 . 390drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
391drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 .. 391drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
392drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin 392drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
393-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-trace.py 393-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py
394drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util 394drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 perf-script-Util
395-rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py 395-rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
396---- 396----
397 397
398Once you've done that (don't forget to do a new 'make install', 398Once you've done that (don't forget to do a new 'make install',
399otherwise your script won't show up at run-time), 'perf trace -l' 399otherwise your script won't show up at run-time), 'perf script -l'
400should show a new entry for your script: 400should show a new entry for your script:
401 401
402---- 402----
403root@tropicana:~# perf trace -l 403root@tropicana:~# perf script -l
404List of available trace scripts: 404List of available trace scripts:
405 workqueue-stats workqueue stats (ins/exe/create/destroy) 405 workqueue-stats workqueue stats (ins/exe/create/destroy)
406 wakeup-latency system-wide min/max/avg wakeup latency 406 wakeup-latency system-wide min/max/avg wakeup latency
@@ -409,19 +409,19 @@ List of available trace scripts:
409 syscall-counts system-wide syscall counts 409 syscall-counts system-wide syscall counts
410---- 410----
411 411
412You can now perform the record step via 'perf trace record': 412You can now perform the record step via 'perf script record':
413 413
414 # perf trace record syscall-counts 414 # perf script record syscall-counts
415 415
416and display the output using 'perf trace report': 416and display the output using 'perf script report':
417 417
418 # perf trace report syscall-counts 418 # perf script report syscall-counts
419 419
420STARTER SCRIPTS 420STARTER SCRIPTS
421--------------- 421---------------
422 422
423You can quickly get started writing a script for a particular set of 423You can quickly get started writing a script for a particular set of
424trace data by generating a skeleton script using 'perf trace -g 424trace data by generating a skeleton script using 'perf script -g
425python' in the same directory as an existing perf.data trace file. 425python' in the same directory as an existing perf.data trace file.
426That will generate a starter script containing a handler for each of 426That will generate a starter script containing a handler for each of
427the event types in the trace file; it simply prints every available 427the event types in the trace file; it simply prints every available
@@ -430,13 +430,13 @@ field for each event in the trace file.
430You can also look at the existing scripts in 430You can also look at the existing scripts in
431~/libexec/perf-core/scripts/python for typical examples showing how to 431~/libexec/perf-core/scripts/python for typical examples showing how to
432do basic things like aggregate event data, print results, etc. Also, 432do basic things like aggregate event data, print results, etc. Also,
433the check-perf-trace.py script, while not interesting for its results, 433the check-perf-script.py script, while not interesting for its results,
434attempts to exercise all of the main scripting features. 434attempts to exercise all of the main scripting features.
435 435
436EVENT HANDLERS 436EVENT HANDLERS
437-------------- 437--------------
438 438
439When perf trace is invoked using a trace script, a user-defined 439When perf script is invoked using a trace script, a user-defined
440'handler function' is called for each event in the trace. If there's 440'handler function' is called for each event in the trace. If there's
441no handler function defined for a given event type, the event is 441no handler function defined for a given event type, the event is
442ignored (or passed to a 'trace_unhandled' function, see below) and the 442ignored (or passed to a 'trace_unhandled' function, see below) and the
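A minimal sketch of such a handler pair for the raw_syscalls:sys_enter tracepoint; the parameter list follows the generated-script convention shown later in this document, but treat the exact signature as an assumption:

----
# Count raw_syscalls:sys_enter events per syscall id.  The handler
# name is <subsystem>__<event>; events with no specific handler can
# fall through to trace_unhandled().  Note the generated scripts do
# shadow the builtin 'id' like this.
syscall_counts = {}

def raw_syscalls__sys_enter(event_name, context, common_cpu,
        common_secs, common_nsecs, common_pid, common_comm,
        id, args):
    syscall_counts[id] = syscall_counts.get(id, 0) + 1

def trace_unhandled(event_name, context, common_cpu, common_secs,
        common_nsecs, common_pid, common_comm):
    pass
----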
@@ -510,7 +510,7 @@ write a useful trace script. The sections below cover the rest.
510SCRIPT LAYOUT 510SCRIPT LAYOUT
511------------- 511-------------
512 512
513Every perf trace Python script should start by setting up a Python 513Every perf script Python script should start by setting up a Python
514module search path and 'import'ing a few support modules (see module 514module search path and 'import'ing a few support modules (see module
515descriptions below): 515descriptions below):
516 516
@@ -519,7 +519,7 @@ descriptions below):
519 import sys 519 import sys
520 520
521 sys.path.append(os.environ['PERF_EXEC_PATH'] + \ 521 sys.path.append(os.environ['PERF_EXEC_PATH'] + \
522 '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') 522 '/scripts/python/perf-script-Util/lib/Perf/Trace')
523 523
524 from perf_trace_context import * 524 from perf_trace_context import *
525 from Core import * 525 from Core import *
@@ -559,15 +559,15 @@ def trace_unhandled(event_name, context, common_cpu, common_secs,
559---- 559----
560 560
561The remaining sections provide descriptions of each of the available 561The remaining sections provide descriptions of each of the available
562built-in perf trace Python modules and their associated functions. 562built-in perf script Python modules and their associated functions.
563 563
564AVAILABLE MODULES AND FUNCTIONS 564AVAILABLE MODULES AND FUNCTIONS
565------------------------------- 565-------------------------------
566 566
567The following sections describe the functions and variables available 567The following sections describe the functions and variables available
568via the various perf trace Python modules. To use the functions and 568via the various perf script Python modules. To use the functions and
569variables from the given module, add the corresponding 'from XXXX 569variables from the given module, add the corresponding 'from XXXX
570import' line to your perf trace script. 570import' line to your perf script script.
571 571
572Core.py Module 572Core.py Module
573~~~~~~~~~~~~~~ 573~~~~~~~~~~~~~~
@@ -610,7 +610,7 @@ argument.
610Util.py Module 610Util.py Module
611~~~~~~~~~~~~~~ 611~~~~~~~~~~~~~~
612 612
613Various utility functions for use with perf trace: 613Various utility functions for use with perf script:
614 614
615 nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair 615 nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
616 nsecs_secs(nsecs) - returns whole secs portion given nsecs 616 nsecs_secs(nsecs) - returns whole secs portion given nsecs
@@ -620,4 +620,4 @@ Various utility functions for use with perf trace:
620 620
621SEE ALSO 621SEE ALSO
622-------- 622--------
623linkperf:perf-trace[1] 623linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-script.txt
index 26aff6bf9e50..29ad94293cd2 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -1,71 +1,71 @@
1perf-trace(1) 1perf-script(1)
2============= 2=============
3 3
4NAME 4NAME
5---- 5----
6perf-trace - Read perf.data (created by perf record) and display trace output 6perf-script - Read perf.data (created by perf record) and display trace output
7 7
8SYNOPSIS 8SYNOPSIS
9-------- 9--------
10[verse] 10[verse]
11'perf trace' [<options>] 11'perf script' [<options>]
12'perf trace' [<options>] record <script> [<record-options>] <command> 12'perf script' [<options>] record <script> [<record-options>] <command>
13'perf trace' [<options>] report <script> [script-args] 13'perf script' [<options>] report <script> [script-args]
14'perf trace' [<options>] <script> <required-script-args> [<record-options>] <command> 14'perf script' [<options>] <script> <required-script-args> [<record-options>] <command>
15'perf trace' [<options>] <top-script> [script-args] 15'perf script' [<options>] <top-script> [script-args]
16 16
17DESCRIPTION 17DESCRIPTION
18----------- 18-----------
19This command reads the input file and displays the trace recorded. 19This command reads the input file and displays the trace recorded.
20 20
21There are several variants of perf trace: 21There are several variants of perf script:
22 22
23 'perf trace' to see a detailed trace of the workload that was 23 'perf script' to see a detailed trace of the workload that was
24 recorded. 24 recorded.
25 25
26 You can also run a set of pre-canned scripts that aggregate and 26 You can also run a set of pre-canned scripts that aggregate and
27 summarize the raw trace data in various ways (the list of scripts is 27 summarize the raw trace data in various ways (the list of scripts is
28 available via 'perf trace -l'). The following variants allow you to 28 available via 'perf script -l'). The following variants allow you to
29 record and run those scripts: 29 record and run those scripts:
30 30
31 'perf trace record <script> <command>' to record the events required 31 'perf script record <script> <command>' to record the events required
32 for 'perf trace report'. <script> is the name displayed in the 32 for 'perf script report'. <script> is the name displayed in the
33 output of 'perf trace --list' i.e. the actual script name minus any 33 output of 'perf script --list' i.e. the actual script name minus any
34 language extension. If <command> is not specified, the events are 34 language extension. If <command> is not specified, the events are
35 recorded using the -a (system-wide) 'perf record' option. 35 recorded using the -a (system-wide) 'perf record' option.
36 36
37 'perf trace report <script> [args]' to run and display the results 37 'perf script report <script> [args]' to run and display the results
38 of <script>. <script> is the name displayed in the output of 'perf 38 of <script>. <script> is the name displayed in the output of 'perf
39 trace --list' i.e. the actual script name minus any language 39 trace --list' i.e. the actual script name minus any language
40 extension. The perf.data output from a previous run of 'perf trace 40 extension. The perf.data output from a previous run of 'perf script
41 record <script>' is used and should be present for this command to 41 record <script>' is used and should be present for this command to
42 succeed. [args] refers to the (mainly optional) args expected by 42 succeed. [args] refers to the (mainly optional) args expected by
43 the script. 43 the script.
44 44
45 'perf trace <script> <required-script-args> <command>' to both 45 'perf script <script> <required-script-args> <command>' to both
46 record the events required for <script> and to run the <script> 46 record the events required for <script> and to run the <script>
47 using 'live-mode' i.e. without writing anything to disk. <script> 47 using 'live-mode' i.e. without writing anything to disk. <script>
48 is the name displayed in the output of 'perf trace --list' i.e. the 48 is the name displayed in the output of 'perf script --list' i.e. the
49 actual script name minus any language extension. If <command> is 49 actual script name minus any language extension. If <command> is
50 not specified, the events are recorded using the -a (system-wide) 50 not specified, the events are recorded using the -a (system-wide)
51 'perf record' option. If <script> has any required args, they 51 'perf record' option. If <script> has any required args, they
52 should be specified before <command>. This mode doesn't allow for 52 should be specified before <command>. This mode doesn't allow for
53 optional script args to be specified; if optional script args are 53 optional script args to be specified; if optional script args are
54 desired, they can be specified using separate 'perf trace record' 54 desired, they can be specified using separate 'perf script record'
55 and 'perf trace report' commands, with the stdout of the record step 55 and 'perf script report' commands, with the stdout of the record step
56 piped to the stdin of the report script, using the '-o -' and '-i -' 56 piped to the stdin of the report script, using the '-o -' and '-i -'
57 options of the corresponding commands. 57 options of the corresponding commands.
58 58
59 'perf trace <top-script>' to both record the events required for 59 'perf script <top-script>' to both record the events required for
60 <top-script> and to run the <top-script> using 'live-mode' 60 <top-script> and to run the <top-script> using 'live-mode'
61 i.e. without writing anything to disk. <top-script> is the name 61 i.e. without writing anything to disk. <top-script> is the name
62 displayed in the output of 'perf trace --list' i.e. the actual 62 displayed in the output of 'perf script --list' i.e. the actual
63 script name minus any language extension; a <top-script> is defined 63 script name minus any language extension; a <top-script> is defined
64 as any script name ending with the string 'top'. 64 as any script name ending with the string 'top'.
65 65
66 [<record-options>] can be passed to the record steps of 'perf trace 66 [<record-options>] can be passed to the record steps of 'perf script
67 record' and 'live-mode' variants; this isn't possible however for 67 record' and 'live-mode' variants; this isn't possible however for
68 <top-script> 'live-mode' or 'perf trace report' variants. 68 <top-script> 'live-mode' or 'perf script report' variants.
69 69
70 See the 'SEE ALSO' section for links to language-specific 70 See the 'SEE ALSO' section for links to language-specific
71 information on how to write and run your own trace scripts. 71 information on how to write and run your own trace scripts.
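One plausible shape of the record-to-report pipeline described above, using the '-o -' and '-i -' options it names; the script name is borrowed from the syscall-counts example elsewhere on this page, and the exact flag placement is an assumption:

----
perf script record syscall-counts -o - | perf script report syscall-counts -i -
----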
@@ -76,7 +76,7 @@ OPTIONS
76 Any command you can specify in a shell. 76 Any command you can specify in a shell.
77 77
78-D:: 78-D::
79--dump-raw-trace=:: 79--dump-raw-script=::
80 Display verbose dump of the trace data. 80 Display verbose dump of the trace data.
81 81
82-L:: 82-L::
@@ -95,7 +95,7 @@ OPTIONS
95 95
96-g:: 96-g::
97--gen-script=:: 97--gen-script=::
98 Generate perf-trace.[ext] starter script for given language, 98 Generate perf-script.[ext] starter script for given language,
99 using current perf.data. 99 using current perf.data.
100 100
101-a:: 101-a::
@@ -104,8 +104,15 @@ OPTIONS
104 normally don't - this option allows the latter to be run in 104 normally don't - this option allows the latter to be run in
105 system-wide mode. 105 system-wide mode.
106 106
107-i::
108--input=::
109 Input file name.
110
111-d::
112--debug-mode::
113 Do various checks like samples ordering and lost events.
107 114
108SEE ALSO 115SEE ALSO
109-------- 116--------
110linkperf:perf-record[1], linkperf:perf-trace-perl[1], 117linkperf:perf-record[1], linkperf:perf-script-perl[1],
111linkperf:perf-trace-python[1] 118linkperf:perf-script-python[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 4b3a2d46b437..b6da7affbbee 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -8,8 +8,8 @@ perf-stat - Run a command and gather performance counter statistics
8SYNOPSIS 8SYNOPSIS
9-------- 9--------
10[verse] 10[verse]
11'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] <command> 11'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command>
12'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] -- <command> [<options>] 12'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>]
13 13
14DESCRIPTION 14DESCRIPTION
15----------- 15-----------
@@ -35,24 +35,54 @@ OPTIONS
35 child tasks do not inherit counters 35 child tasks do not inherit counters
36-p:: 36-p::
37--pid=<pid>:: 37--pid=<pid>::
38 stat events on existing pid 38 stat events on existing process id
39
40-t::
41--tid=<tid>::
42 stat events on existing thread id
43
39 44
40-a:: 45-a::
41 system-wide collection 46--all-cpus::
47 system-wide collection from all CPUs
42 48
43-c:: 49-c::
44 scale counter values 50--scale::
51 scale/normalize counter values
52
53-r::
54--repeat=<n>::
55 repeat command and print average + stddev (max: 100)
45 56
46-B:: 57-B::
58--big-num::
47 print large numbers with thousands' separators according to locale 59 print large numbers with thousands' separators according to locale
48 60
49-C:: 61-C::
50--cpu=:: 62--cpu=::
51Count only on the list of cpus provided. Multiple CPUs can be provided as a 63Count only on the list of CPUs provided. Multiple CPUs can be provided as a
52comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. 64comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
53In per-thread mode, this option is ignored. The -a option is still necessary 65In per-thread mode, this option is ignored. The -a option is still necessary
54to activate system-wide monitoring. Default is to count on all CPUs. 66to activate system-wide monitoring. Default is to count on all CPUs.
55 67
68-A::
69--no-aggr::
70Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
71This option is only valid in system-wide mode.
72
73-n::
74--null::
75 null run - don't start any counters
76
77-v::
78--verbose::
79 be more verbose (show counter open errors, etc)
80
81-x SEP::
82--field-separator SEP::
83print counts using a CSV-style output to make it easy to import directly into
84spreadsheets. Columns are separated by the string specified in SEP.
85
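A sketch of consuming such '-x ,' output from Python; the two-column value,event layout is assumed for illustration, so check the actual columns your perf version emits:

----
import csv, io

# Hypothetical '-x ,' output lines: value,event-name
sample = io.StringIO("1234567,cycles\n89012,page-faults\n")
for value, event in csv.reader(sample):
    print(f"{event:>12}: {int(value):,}")
----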
56EXAMPLES 86EXAMPLES
57-------- 87--------
58 88
diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt
index 1c4b5f5b7f71..2c3b462f64b0 100644
--- a/tools/perf/Documentation/perf-test.txt
+++ b/tools/perf/Documentation/perf-test.txt
@@ -12,7 +12,7 @@ SYNOPSIS
12 12
13DESCRIPTION 13DESCRIPTION
14----------- 14-----------
15This command does assorted sanity tests, initially thru linked routines but 15This command does assorted sanity tests, initially through linked routines but
16also will look for a directory with more tests in the form of scripts. 16also will look for a directory with more tests in the form of scripts.
17 17
18OPTIONS 18OPTIONS
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 1f9687663f2a..f6eb1cdafb77 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -12,7 +12,7 @@ SYNOPSIS
12 12
13DESCRIPTION 13DESCRIPTION
14----------- 14-----------
15This command generates and displays a performance counter profile in realtime. 15This command generates and displays a performance counter profile in real time.
16 16
17 17
18OPTIONS 18OPTIONS
@@ -27,8 +27,8 @@ OPTIONS
27 27
28-C <cpu-list>:: 28-C <cpu-list>::
29--cpu=<cpu>:: 29--cpu=<cpu>::
30Monitor only on the list of cpus provided. Multiple CPUs can be provided as a 30Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a
31comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. 31comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
32Default is to monitor all CPUs. 32Default is to monitor all CPUs.
33 33
34-d <seconds>:: 34-d <seconds>::
@@ -50,6 +50,10 @@ Default is to monitor all CPUS.
50--count-filter=<count>:: 50--count-filter=<count>::
51 Only display functions with more events than this. 51 Only display functions with more events than this.
52 52
53-g::
54--group::
55 Put the counters into a counter group.
56
53-F <freq>:: 57-F <freq>::
54--freq=<freq>:: 58--freq=<freq>::
55 Profile at this frequency. 59 Profile at this frequency.
@@ -68,7 +72,11 @@ Default is to monitor all CPUS.
68 72
69-p <pid>:: 73-p <pid>::
70--pid=<pid>:: 74--pid=<pid>::
71 Profile events on existing pid. 75 Profile events on existing process ID.
76
77-t <tid>::
78--tid=<tid>::
79 Profile events on existing thread ID.
72 80
73-r <priority>:: 81-r <priority>::
74--realtime=<priority>:: 82--realtime=<priority>::
@@ -78,6 +86,18 @@ Default is to monitor all CPUS.
78--sym-annotate=<symbol>:: 86--sym-annotate=<symbol>::
79 Annotate this symbol. 87 Annotate this symbol.
80 88
89-K::
90--hide_kernel_symbols::
91 Hide kernel symbols.
92
93-U::
94--hide_user_symbols::
95 Hide user symbols.
96
97-D::
98--dump-symtab::
99 Dump the symbol table used for profiling.
100
81-v:: 101-v::
82--verbose:: 102--verbose::
83 Be more verbose (show counter open errors, etc). 103 Be more verbose (show counter open errors, etc).
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index 8c7fc0c8f0b8..c12659d8cb26 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -7,6 +7,7 @@ include/linux/stringify.h
7lib/rbtree.c 7lib/rbtree.c
8include/linux/swab.h 8include/linux/swab.h
9arch/*/include/asm/unistd*.h 9arch/*/include/asm/unistd*.h
10arch/*/lib/memcpy*.S
10include/linux/poison.h 11include/linux/poison.h
11include/linux/magic.h 12include/linux/magic.h
12include/linux/hw_breakpoint.h 13include/linux/hw_breakpoint.h
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index d1db0f676a4b..d88137a4356e 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -185,7 +185,10 @@ ifeq ($(ARCH),i386)
185 ARCH := x86 185 ARCH := x86
186endif 186endif
187ifeq ($(ARCH),x86_64) 187ifeq ($(ARCH),x86_64)
188 RAW_ARCH := x86_64
188 ARCH := x86 189 ARCH := x86
190 ARCH_CFLAGS := -DARCH_X86_64
191 ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
189endif 192endif
190 193
191# CFLAGS and LDFLAGS are for the users to override from the command line. 194# CFLAGS and LDFLAGS are for the users to override from the command line.
@@ -375,6 +378,7 @@ LIB_H += util/include/linux/prefetch.h
375LIB_H += util/include/linux/rbtree.h 378LIB_H += util/include/linux/rbtree.h
376LIB_H += util/include/linux/string.h 379LIB_H += util/include/linux/string.h
377LIB_H += util/include/linux/types.h 380LIB_H += util/include/linux/types.h
381LIB_H += util/include/linux/linkage.h
378LIB_H += util/include/asm/asm-offsets.h 382LIB_H += util/include/asm/asm-offsets.h
379LIB_H += util/include/asm/bug.h 383LIB_H += util/include/asm/bug.h
380LIB_H += util/include/asm/byteorder.h 384LIB_H += util/include/asm/byteorder.h
@@ -383,6 +387,8 @@ LIB_H += util/include/asm/swab.h
383LIB_H += util/include/asm/system.h 387LIB_H += util/include/asm/system.h
384LIB_H += util/include/asm/uaccess.h 388LIB_H += util/include/asm/uaccess.h
385LIB_H += util/include/dwarf-regs.h 389LIB_H += util/include/dwarf-regs.h
390LIB_H += util/include/asm/dwarf2.h
391LIB_H += util/include/asm/cpufeature.h
386LIB_H += perf.h 392LIB_H += perf.h
387LIB_H += util/cache.h 393LIB_H += util/cache.h
388LIB_H += util/callchain.h 394LIB_H += util/callchain.h
@@ -417,6 +423,7 @@ LIB_H += util/probe-finder.h
417LIB_H += util/probe-event.h 423LIB_H += util/probe-event.h
418LIB_H += util/pstack.h 424LIB_H += util/pstack.h
419LIB_H += util/cpumap.h 425LIB_H += util/cpumap.h
426LIB_H += $(ARCH_INCLUDE)
420 427
421LIB_OBJS += $(OUTPUT)util/abspath.o 428LIB_OBJS += $(OUTPUT)util/abspath.o
422LIB_OBJS += $(OUTPUT)util/alias.o 429LIB_OBJS += $(OUTPUT)util/alias.o
@@ -472,6 +479,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
472# Benchmark modules 479# Benchmark modules
473BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o 480BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
474BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o 481BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
482ifeq ($(RAW_ARCH),x86_64)
483BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
484endif
475BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o 485BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
476 486
477BUILTIN_OBJS += $(OUTPUT)builtin-diff.o 487BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
@@ -485,7 +495,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-report.o
485BUILTIN_OBJS += $(OUTPUT)builtin-stat.o 495BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
486BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o 496BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
487BUILTIN_OBJS += $(OUTPUT)builtin-top.o 497BUILTIN_OBJS += $(OUTPUT)builtin-top.o
488BUILTIN_OBJS += $(OUTPUT)builtin-trace.o 498BUILTIN_OBJS += $(OUTPUT)builtin-script.o
489BUILTIN_OBJS += $(OUTPUT)builtin-probe.o 499BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
490BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o 500BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
491BUILTIN_OBJS += $(OUTPUT)builtin-lock.o 501BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
@@ -507,7 +517,7 @@ PERFLIBS = $(LIB_FILE)
507-include config.mak 517-include config.mak
508 518
509ifndef NO_DWARF 519ifndef NO_DWARF
510FLAGS_DWARF=$(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS) 520FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
511ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y) 521ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y)
512 msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev); 522 msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
513 NO_DWARF := 1 523 NO_DWARF := 1
@@ -554,7 +564,7 @@ ifndef NO_DWARF
554ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined) 564ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
555 msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled); 565 msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
556else 566else
557 BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT 567 BASIC_CFLAGS += -DDWARF_SUPPORT
558 EXTLIBS += -lelf -ldw 568 EXTLIBS += -lelf -ldw
559 LIB_OBJS += $(OUTPUT)util/probe-finder.o 569 LIB_OBJS += $(OUTPUT)util/probe-finder.o
560endif # PERF_HAVE_DWARF_REGS 570endif # PERF_HAVE_DWARF_REGS
@@ -898,6 +908,7 @@ BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
898LIB_OBJS += $(COMPAT_OBJS) 908LIB_OBJS += $(COMPAT_OBJS)
899 909
900ALL_CFLAGS += $(BASIC_CFLAGS) 910ALL_CFLAGS += $(BASIC_CFLAGS)
911ALL_CFLAGS += $(ARCH_CFLAGS)
901ALL_LDFLAGS += $(BASIC_LDFLAGS) 912ALL_LDFLAGS += $(BASIC_LDFLAGS)
902 913
903export TAR INSTALL DESTDIR SHELL_PATH 914export TAR INSTALL DESTDIR SHELL_PATH
diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h
new file mode 100644
index 000000000000..a72e36cb5394
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -0,0 +1,12 @@
1
2#ifdef ARCH_X86_64
3
4#define MEMCPY_FN(fn, name, desc) \
5 extern void *fn(void *, const void *, size_t);
6
7#include "mem-memcpy-x86-64-asm-def.h"
8
9#undef MEMCPY_FN
10
11#endif
12
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
new file mode 100644
index 000000000000..d588b87696fc
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -0,0 +1,4 @@
1
2MEMCPY_FN(__memcpy,
3 "x86-64-unrolled",
4 "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
new file mode 100644
index 000000000000..a57b66e853c2
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,2 @@
1
2#include "../../../arch/x86/lib/memcpy_64.S"
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 38dae7465142..db82021f4b91 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -12,6 +12,7 @@
12#include "../util/parse-options.h" 12#include "../util/parse-options.h"
13#include "../util/header.h" 13#include "../util/header.h"
14#include "bench.h" 14#include "bench.h"
15#include "mem-memcpy-arch.h"
15 16
16#include <stdio.h> 17#include <stdio.h>
17#include <stdlib.h> 18#include <stdlib.h>
@@ -23,8 +24,10 @@
23 24
24static const char *length_str = "1MB"; 25static const char *length_str = "1MB";
25static const char *routine = "default"; 26static const char *routine = "default";
26static bool use_clock = false; 27static bool use_clock;
27static int clock_fd; 28static int clock_fd;
29static bool only_prefault;
30static bool no_prefault;
28 31
29static const struct option options[] = { 32static const struct option options[] = {
30 OPT_STRING('l', "length", &length_str, "1MB", 33 OPT_STRING('l', "length", &length_str, "1MB",
@@ -34,19 +37,33 @@ static const struct option options[] = {
34 "Specify routine to copy"), 37 "Specify routine to copy"),
35 OPT_BOOLEAN('c', "clock", &use_clock, 38 OPT_BOOLEAN('c', "clock", &use_clock,
36 "Use CPU clock for measuring"), 39 "Use CPU clock for measuring"),
40 OPT_BOOLEAN('o', "only-prefault", &only_prefault,
41 "Show only the result with page faults before memcpy()"),
42 OPT_BOOLEAN('n', "no-prefault", &no_prefault,
43 "Show only the result without page faults before memcpy()"),
37 OPT_END() 44 OPT_END()
38}; 45};
39 46
47typedef void *(*memcpy_t)(void *, const void *, size_t);
48
40struct routine { 49struct routine {
41 const char *name; 50 const char *name;
42 const char *desc; 51 const char *desc;
43 void * (*fn)(void *dst, const void *src, size_t len); 52 memcpy_t fn;
44}; 53};
45 54
46struct routine routines[] = { 55struct routine routines[] = {
47 { "default", 56 { "default",
48 "Default memcpy() provided by glibc", 57 "Default memcpy() provided by glibc",
49 memcpy }, 58 memcpy },
59#ifdef ARCH_X86_64
60
61#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
62#include "mem-memcpy-x86-64-asm-def.h"
63#undef MEMCPY_FN
64
65#endif
66
50 { NULL, 67 { NULL,
51 NULL, 68 NULL,
52 NULL } 69 NULL }
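The MEMCPY_FN define/include/undef dance above is the classic C "x-macro" pattern: one definition list (mem-memcpy-x86-64-asm-def.h) is expanded under different macro definitions, once for prototypes and once for routines[] table entries. A loose Python analogue of the idea, with invented names:

----
# One shared definition list, consumed in two passes, roughly what
# the MEMCPY_FN x-macro achieves in the C code above.
MEMCPY_DEFS = [
    ("__memcpy", "x86-64-unrolled",
     "unrolled memcpy() in arch/x86/lib/memcpy_64.S"),
]

prototypes = [fn for fn, _name, _desc in MEMCPY_DEFS]            # pass 1
routines = [(name, desc, fn) for fn, name, desc in MEMCPY_DEFS]  # pass 2
print(prototypes, routines)
----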
@@ -89,29 +106,98 @@ static double timeval2double(struct timeval *ts)
89 (double)ts->tv_usec / (double)1000000; 106 (double)ts->tv_usec / (double)1000000;
90} 107}
91 108
109static void alloc_mem(void **dst, void **src, size_t length)
110{
111 *dst = zalloc(length);
112 if (!*dst)
113 die("memory allocation failed - maybe length is too large?\n");
114
115 *src = zalloc(length);
116 if (!*src)
117 die("memory allocation failed - maybe length is too large?\n");
118}
119
120static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
121{
122 u64 clock_start = 0ULL, clock_end = 0ULL;
123 void *src = NULL, *dst = NULL;
124
125	alloc_mem(&dst, &src, len);
126
127 if (prefault)
128 fn(dst, src, len);
129
130 clock_start = get_clock();
131 fn(dst, src, len);
132 clock_end = get_clock();
133
134 free(src);
135 free(dst);
136 return clock_end - clock_start;
137}
138
139static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
140{
141 struct timeval tv_start, tv_end, tv_diff;
142 void *src = NULL, *dst = NULL;
143
144	alloc_mem(&dst, &src, len);
145
146 if (prefault)
147 fn(dst, src, len);
148
149 BUG_ON(gettimeofday(&tv_start, NULL));
150 fn(dst, src, len);
151 BUG_ON(gettimeofday(&tv_end, NULL));
152
153 timersub(&tv_end, &tv_start, &tv_diff);
154
155 free(src);
156 free(dst);
157 return (double)((double)len / timeval2double(&tv_diff));
158}
159
160#define pf (no_prefault ? 0 : 1)
161
162#define print_bps(x) do { \
163 if (x < K) \
164 printf(" %14lf B/Sec", x); \
165 else if (x < K * K) \
166		printf(" %14lf KB/Sec", x / K); \
167 else if (x < K * K * K) \
168 printf(" %14lf MB/Sec", x / K / K); \
169 else \
170 printf(" %14lf GB/Sec", x / K / K / K); \
171 } while (0)
172
92int bench_mem_memcpy(int argc, const char **argv, 173int bench_mem_memcpy(int argc, const char **argv,
93 const char *prefix __used) 174 const char *prefix __used)
94{ 175{
95 int i; 176 int i;
96 void *dst, *src; 177 size_t len;
97 size_t length; 178 double result_bps[2];
98 double bps = 0.0; 179 u64 result_clock[2];
99 struct timeval tv_start, tv_end, tv_diff;
100 u64 clock_start, clock_end, clock_diff;
101 180
102 clock_start = clock_end = clock_diff = 0ULL;
103 argc = parse_options(argc, argv, options, 181 argc = parse_options(argc, argv, options,
104 bench_mem_memcpy_usage, 0); 182 bench_mem_memcpy_usage, 0);
105 183
106 tv_diff.tv_sec = 0; 184 if (use_clock)
107 tv_diff.tv_usec = 0; 185 init_clock();
108 length = (size_t)perf_atoll((char *)length_str); 186
187 len = (size_t)perf_atoll((char *)length_str);
109 188
110 if ((s64)length <= 0) { 189 result_clock[0] = result_clock[1] = 0ULL;
190 result_bps[0] = result_bps[1] = 0.0;
191
192 if ((s64)len <= 0) {
111 fprintf(stderr, "Invalid length:%s\n", length_str); 193 fprintf(stderr, "Invalid length:%s\n", length_str);
112 return 1; 194 return 1;
113 } 195 }
114 196
197	/* specifying both is the same as specifying neither */
198 if (only_prefault && no_prefault)
199 only_prefault = no_prefault = false;
200
115 for (i = 0; routines[i].name; i++) { 201 for (i = 0; routines[i].name; i++) {
116 if (!strcmp(routines[i].name, routine)) 202 if (!strcmp(routines[i].name, routine))
117 break; 203 break;
@@ -126,61 +212,80 @@ int bench_mem_memcpy(int argc, const char **argv,
126 return 1; 212 return 1;
127 } 213 }
128 214
129 dst = zalloc(length); 215 if (bench_format == BENCH_FORMAT_DEFAULT)
130 if (!dst) 216 printf("# Copying %s Bytes ...\n\n", length_str);
131 die("memory allocation failed - maybe length is too large?\n");
132
133 src = zalloc(length);
134 if (!src)
135 die("memory allocation failed - maybe length is too large?\n");
136
137 if (bench_format == BENCH_FORMAT_DEFAULT) {
138 printf("# Copying %s Bytes from %p to %p ...\n\n",
139 length_str, src, dst);
140 }
141
142 if (use_clock) {
143 init_clock();
144 clock_start = get_clock();
145 } else {
146 BUG_ON(gettimeofday(&tv_start, NULL));
147 }
148
149 routines[i].fn(dst, src, length);
150 217
151 if (use_clock) { 218 if (!only_prefault && !no_prefault) {
152 clock_end = get_clock(); 219 /* show both of results */
153 clock_diff = clock_end - clock_start; 220 if (use_clock) {
221 result_clock[0] =
222 do_memcpy_clock(routines[i].fn, len, false);
223 result_clock[1] =
224 do_memcpy_clock(routines[i].fn, len, true);
225 } else {
226 result_bps[0] =
227 do_memcpy_gettimeofday(routines[i].fn,
228 len, false);
229 result_bps[1] =
230 do_memcpy_gettimeofday(routines[i].fn,
231 len, true);
232 }
154 } else { 233 } else {
155 BUG_ON(gettimeofday(&tv_end, NULL)); 234 if (use_clock) {
156 timersub(&tv_end, &tv_start, &tv_diff); 235 result_clock[pf] =
157 bps = (double)((double)length / timeval2double(&tv_diff)); 236 do_memcpy_clock(routines[i].fn,
237 len, only_prefault);
238 } else {
239 result_bps[pf] =
240 do_memcpy_gettimeofday(routines[i].fn,
241 len, only_prefault);
242 }
158 } 243 }
159 244
160 switch (bench_format) { 245 switch (bench_format) {
161 case BENCH_FORMAT_DEFAULT: 246 case BENCH_FORMAT_DEFAULT:
162 if (use_clock) { 247 if (!only_prefault && !no_prefault) {
163 printf(" %14lf Clock/Byte\n", 248 if (use_clock) {
164 (double)clock_diff / (double)length); 249 printf(" %14lf Clock/Byte\n",
165 } else { 250 (double)result_clock[0]
166 if (bps < K) 251 / (double)len);
167 printf(" %14lf B/Sec\n", bps); 252 printf(" %14lf Clock/Byte (with prefault)\n",
168 else if (bps < K * K) 253 (double)result_clock[1]
169 printf(" %14lfd KB/Sec\n", bps / 1024); 254 / (double)len);
170 else if (bps < K * K * K) 255 } else {
171 printf(" %14lf MB/Sec\n", bps / 1024 / 1024); 256 print_bps(result_bps[0]);
172 else { 257 printf("\n");
173 printf(" %14lf GB/Sec\n", 258 print_bps(result_bps[1]);
174 bps / 1024 / 1024 / 1024); 259 printf(" (with prefault)\n");
175 } 260 }
261 } else {
262 if (use_clock) {
263 printf(" %14lf Clock/Byte",
264 (double)result_clock[pf]
265 / (double)len);
266 } else
267 print_bps(result_bps[pf]);
268
269 printf("%s\n", only_prefault ? " (with prefault)" : "");
176 } 270 }
177 break; 271 break;
178 case BENCH_FORMAT_SIMPLE: 272 case BENCH_FORMAT_SIMPLE:
179 if (use_clock) { 273 if (!only_prefault && !no_prefault) {
180 printf("%14lf\n", 274 if (use_clock) {
181 (double)clock_diff / (double)length); 275 printf("%lf %lf\n",
182 } else 276 (double)result_clock[0] / (double)len,
183 printf("%lf\n", bps); 277 (double)result_clock[1] / (double)len);
278 } else {
279 printf("%lf %lf\n",
280 result_bps[0], result_bps[1]);
281 }
282 } else {
283 if (use_clock) {
284 printf("%lf\n", (double)result_clock[pf]
285 / (double)len);
286 } else
287 printf("%lf\n", result_bps[pf]);
288 }
184 break; 289 break;
185 default: 290 default:
186 /* reaching this means there's some disaster: */ 291 /* reaching this means there's some disaster: */
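
The reworked benchmark computes throughput as bytes copied divided by elapsed seconds, and print_bps() then picks a unit by successive division by K (1024 in perf's headers). A standalone sketch of that scaling with an illustrative worked value (the helper below is a function rewrite of the patch's macro, and the numbers are made up, not measured):

	#include <stdio.h>

	#define K 1024

	static void print_bps(double x)
	{
		if (x < K)
			printf(" %14lf B/Sec", x);
		else if (x < K * K)
			printf(" %14lf KB/Sec", x / K);
		else if (x < K * K * K)
			printf(" %14lf MB/Sec", x / K / K);
		else
			printf(" %14lf GB/Sec", x / K / K / K);
	}

	int main(void)
	{
		/* 1 MB copied in 0.5 ms -> 2,097,152,000 B/s, which falls
		 * into the GB/Sec bucket (about 1.95 GB/Sec after division) */
		double bps = (1024.0 * 1024.0) / 0.0005;

		print_bps(bps);
		printf("\n");
		return 0;
	}

The prefault variants exist because the first touch of freshly zalloc'd buffers incurs page faults; measuring once with and once without a warm-up copy separates fault cost from raw copy bandwidth.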
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index fca1d4402910..221b823bc26f 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -173,7 +173,7 @@ static const char * const diff_usage[] = {
173static const struct option options[] = { 173static const struct option options[] = {
174 OPT_INCR('v', "verbose", &verbose, 174 OPT_INCR('v', "verbose", &verbose,
175 "be more verbose (show symbol address, etc)"), 175 "be more verbose (show symbol address, etc)"),
176 OPT_BOOLEAN('m', "displacement", &show_displacement, 176 OPT_BOOLEAN('M', "displacement", &show_displacement,
177 "Show position displacement relative to baseline"), 177 "Show position displacement relative to baseline"),
178 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 178 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
179 "dump raw trace in ASCII"), 179 "dump raw trace in ASCII"),
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 821c1586a22b..8452a2ae2191 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -982,9 +982,9 @@ int cmd_lock(int argc, const char **argv, const char *prefix __used)
982 usage_with_options(report_usage, report_options); 982 usage_with_options(report_usage, report_options);
983 } 983 }
984 __cmd_report(); 984 __cmd_report();
985 } else if (!strcmp(argv[0], "trace")) { 985 } else if (!strcmp(argv[0], "script")) {
986 /* Aliased to 'perf trace' */ 986 /* Aliased to 'perf script' */
987 return cmd_trace(argc, argv, prefix); 987 return cmd_script(argc, argv, prefix);
988 } else if (!strcmp(argv[0], "info")) { 988 } else if (!strcmp(argv[0], "info")) {
989 if (argc) { 989 if (argc) {
990 argc = parse_options(argc, argv, 990 argc = parse_options(argc, argv,
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e2c2de201eec..024e1441d76b 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -61,6 +61,7 @@ static bool inherit_stat = false;
61static bool no_samples = false; 61static bool no_samples = false;
62static bool sample_address = false; 62static bool sample_address = false;
63static bool no_buildid = false; 63static bool no_buildid = false;
64static bool no_buildid_cache = false;
64 65
65static long samples = 0; 66static long samples = 0;
66static u64 bytes_written = 0; 67static u64 bytes_written = 0;
@@ -326,7 +327,7 @@ try_again:
326 goto try_again; 327 goto try_again;
327 } 328 }
328 printf("\n"); 329 printf("\n");
329 error("perfcounter syscall returned with %d (%s)\n", 330 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
330 fd[nr_cpu][counter][thread_index], strerror(err)); 331 fd[nr_cpu][counter][thread_index], strerror(err));
331 332
332#if defined(__i386__) || defined(__x86_64__) 333#if defined(__i386__) || defined(__x86_64__)
@@ -437,7 +438,8 @@ static void atexit_header(void)
437 if (!pipe_output) { 438 if (!pipe_output) {
438 session->header.data_size += bytes_written; 439 session->header.data_size += bytes_written;
439 440
440 process_buildids(); 441 if (!no_buildid)
442 process_buildids();
441 perf_header__write(&session->header, output, true); 443 perf_header__write(&session->header, output, true);
442 perf_session__delete(session); 444 perf_session__delete(session);
443 symbol__exit(); 445 symbol__exit();
@@ -557,6 +559,9 @@ static int __cmd_record(int argc, const char **argv)
557 return -1; 559 return -1;
558 } 560 }
559 561
562 if (!no_buildid)
563 perf_header__set_feat(&session->header, HEADER_BUILD_ID);
564
560 if (!file_new) { 565 if (!file_new) {
561 err = perf_header__read(session, output); 566 err = perf_header__read(session, output);
562 if (err < 0) 567 if (err < 0)
@@ -831,8 +836,10 @@ const struct option record_options[] = {
831 "Sample addresses"), 836 "Sample addresses"),
832 OPT_BOOLEAN('n', "no-samples", &no_samples, 837 OPT_BOOLEAN('n', "no-samples", &no_samples,
833 "don't sample"), 838 "don't sample"),
834 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid, 839 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
835 "do not update the buildid cache"), 840 "do not update the buildid cache"),
841 OPT_BOOLEAN('B', "no-buildid", &no_buildid,
842 "do not collect buildids in perf.data"),
836 OPT_END() 843 OPT_END()
837}; 844};
838 845
@@ -857,7 +864,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
857 } 864 }
858 865
859 symbol__init(); 866 symbol__init();
860 if (no_buildid) 867
868 if (no_buildid_cache || no_buildid)
861 disable_buildid_cache(); 869 disable_buildid_cache();
862 870
863 if (!nr_counters) { 871 if (!nr_counters) {
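
After this change the two build-id flags are independent: -N (no-buildid-cache) still records build-ids in perf.data but skips updating the local cache, while the new -B (no-buildid) skips collecting them at all, which per the cmd_record() hunk also disables the cache. A tiny standalone sketch of the combined decision (flag values here are hypothetical):

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		bool no_buildid = false;	/* -B */
		bool no_buildid_cache = true;	/* -N */

		/* header feature bit, as in __cmd_record() */
		printf("HEADER_BUILD_ID: %s\n", !no_buildid ? "set" : "not set");
		/* cache update, as in cmd_record() */
		printf("buildid cache:   %s\n",
		       (no_buildid_cache || no_buildid) ? "disabled" : "enabled");
		return 0;
	}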
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 55f3b5dcc731..26523c939791 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1888,10 +1888,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
1888 usage_with_options(sched_usage, sched_options); 1888 usage_with_options(sched_usage, sched_options);
1889 1889
1890 /* 1890 /*
1891 * Aliased to 'perf trace' for now: 1891 * Aliased to 'perf script' for now:
1892 */ 1892 */
1893 if (!strcmp(argv[0], "trace")) 1893 if (!strcmp(argv[0], "script"))
1894 return cmd_trace(argc, argv, prefix); 1894 return cmd_script(argc, argv, prefix);
1895 1895
1896 symbol__init(); 1896 symbol__init();
1897 if (!strncmp(argv[0], "rec", 3)) { 1897 if (!strncmp(argv[0], "rec", 3)) {
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-script.c
index 86cfe3800e6b..4539551ab40e 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-script.c
@@ -56,7 +56,7 @@ static void setup_scripting(void)
56 56
57static int cleanup_scripting(void) 57static int cleanup_scripting(void)
58{ 58{
59 pr_debug("\nperf trace script stopped\n"); 59 pr_debug("\nperf script stopped\n");
60 60
61 return scripting_ops->stop_script(); 61 return scripting_ops->stop_script();
62} 62}
@@ -137,7 +137,7 @@ static void sig_handler(int sig __unused)
137 session_done = 1; 137 session_done = 1;
138} 138}
139 139
140static int __cmd_trace(struct perf_session *session) 140static int __cmd_script(struct perf_session *session)
141{ 141{
142 int ret; 142 int ret;
143 143
@@ -247,7 +247,7 @@ static void list_available_languages(void)
247 247
248 fprintf(stderr, "\n"); 248 fprintf(stderr, "\n");
249 fprintf(stderr, "Scripting language extensions (used in " 249 fprintf(stderr, "Scripting language extensions (used in "
250 "perf trace -s [spec:]script.[spec]):\n\n"); 250 "perf script -s [spec:]script.[spec]):\n\n");
251 251
252 list_for_each_entry(s, &script_specs, node) 252 list_for_each_entry(s, &script_specs, node)
253 fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name); 253 fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name);
@@ -301,17 +301,34 @@ static int parse_scriptname(const struct option *opt __used,
301 return 0; 301 return 0;
302} 302}
303 303
304#define for_each_lang(scripts_dir, lang_dirent, lang_next) \ 304/* Helper function for filesystems that set dent->d_type to DT_UNKNOWN */
305static int is_directory(const char *base_path, const struct dirent *dent)
306{
307 char path[PATH_MAX];
308 struct stat st;
309
310 sprintf(path, "%s/%s", base_path, dent->d_name);
311 if (stat(path, &st))
312 return 0;
313
314 return S_ISDIR(st.st_mode);
315}
316
317#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\
305 while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \ 318 while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \
306 lang_next) \ 319 lang_next) \
307 if (lang_dirent.d_type == DT_DIR && \ 320 if ((lang_dirent.d_type == DT_DIR || \
321 (lang_dirent.d_type == DT_UNKNOWN && \
322 is_directory(scripts_path, &lang_dirent))) && \
308 (strcmp(lang_dirent.d_name, ".")) && \ 323 (strcmp(lang_dirent.d_name, ".")) && \
309 (strcmp(lang_dirent.d_name, ".."))) 324 (strcmp(lang_dirent.d_name, "..")))
310 325
311#define for_each_script(lang_dir, script_dirent, script_next) \ 326#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\
312 while (!readdir_r(lang_dir, &script_dirent, &script_next) && \ 327 while (!readdir_r(lang_dir, &script_dirent, &script_next) && \
313 script_next) \ 328 script_next) \
314 if (script_dirent.d_type != DT_DIR) 329 if (script_dirent.d_type != DT_DIR && \
330 (script_dirent.d_type != DT_UNKNOWN || \
331 !is_directory(lang_path, &script_dirent)))
315 332
316 333
317#define RECORD_SUFFIX "-record" 334#define RECORD_SUFFIX "-record"
@@ -466,14 +483,14 @@ static int list_available_scripts(const struct option *opt __used,
466 if (!scripts_dir) 483 if (!scripts_dir)
467 return -1; 484 return -1;
468 485
469 for_each_lang(scripts_dir, lang_dirent, lang_next) { 486 for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
470 snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, 487 snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
471 lang_dirent.d_name); 488 lang_dirent.d_name);
472 lang_dir = opendir(lang_path); 489 lang_dir = opendir(lang_path);
473 if (!lang_dir) 490 if (!lang_dir)
474 continue; 491 continue;
475 492
476 for_each_script(lang_dir, script_dirent, script_next) { 493 for_each_script(lang_path, lang_dir, script_dirent, script_next) {
477 script_root = strdup(script_dirent.d_name); 494 script_root = strdup(script_dirent.d_name);
478 str = ends_with(script_root, REPORT_SUFFIX); 495 str = ends_with(script_root, REPORT_SUFFIX);
479 if (str) { 496 if (str) {
@@ -514,14 +531,14 @@ static char *get_script_path(const char *script_root, const char *suffix)
514 if (!scripts_dir) 531 if (!scripts_dir)
515 return NULL; 532 return NULL;
516 533
517 for_each_lang(scripts_dir, lang_dirent, lang_next) { 534 for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
518 snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, 535 snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
519 lang_dirent.d_name); 536 lang_dirent.d_name);
520 lang_dir = opendir(lang_path); 537 lang_dir = opendir(lang_path);
521 if (!lang_dir) 538 if (!lang_dir)
522 continue; 539 continue;
523 540
524 for_each_script(lang_dir, script_dirent, script_next) { 541 for_each_script(lang_path, lang_dir, script_dirent, script_next) {
525 __script_root = strdup(script_dirent.d_name); 542 __script_root = strdup(script_dirent.d_name);
526 str = ends_with(__script_root, suffix); 543 str = ends_with(__script_root, suffix);
527 if (str) { 544 if (str) {
@@ -569,12 +586,12 @@ out:
569 return n_args; 586 return n_args;
570} 587}
571 588
572static const char * const trace_usage[] = { 589static const char * const script_usage[] = {
573 "perf trace [<options>]", 590 "perf script [<options>]",
574 "perf trace [<options>] record <script> [<record-options>] <command>", 591 "perf script [<options>] record <script> [<record-options>] <command>",
575 "perf trace [<options>] report <script> [script-args]", 592 "perf script [<options>] report <script> [script-args]",
576 "perf trace [<options>] <script> [<record-options>] <command>", 593 "perf script [<options>] <script> [<record-options>] <command>",
577 "perf trace [<options>] <top-script> [script-args]", 594 "perf script [<options>] <top-script> [script-args]",
578 NULL 595 NULL
579}; 596};
580 597
@@ -591,7 +608,7 @@ static const struct option options[] = {
591 "script file name (lang:script name, script name, or *)", 608 "script file name (lang:script name, script name, or *)",
592 parse_scriptname), 609 parse_scriptname),
593 OPT_STRING('g', "gen-script", &generate_script_lang, "lang", 610 OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
594 "generate perf-trace.xx script in specified language"), 611 "generate perf-script.xx script in specified language"),
595 OPT_STRING('i', "input", &input_name, "file", 612 OPT_STRING('i', "input", &input_name, "file",
596 "input file name"), 613 "input file name"),
597 OPT_BOOLEAN('d', "debug-mode", &debug_mode, 614 OPT_BOOLEAN('d', "debug-mode", &debug_mode,
@@ -614,7 +631,7 @@ static bool have_cmd(int argc, const char **argv)
614 return argc != 0; 631 return argc != 0;
615} 632}
616 633
617int cmd_trace(int argc, const char **argv, const char *prefix __used) 634int cmd_script(int argc, const char **argv, const char *prefix __used)
618{ 635{
619 char *rec_script_path = NULL; 636 char *rec_script_path = NULL;
620 char *rep_script_path = NULL; 637 char *rep_script_path = NULL;
@@ -626,7 +643,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
626 643
627 setup_scripting(); 644 setup_scripting();
628 645
629 argc = parse_options(argc, argv, options, trace_usage, 646 argc = parse_options(argc, argv, options, script_usage,
630 PARSE_OPT_STOP_AT_NON_OPTION); 647 PARSE_OPT_STOP_AT_NON_OPTION);
631 648
632 if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) { 649 if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
@@ -640,7 +657,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
640 if (!rep_script_path) { 657 if (!rep_script_path) {
641 fprintf(stderr, 658 fprintf(stderr,
642 "Please specify a valid report script" 659 "Please specify a valid report script"
643 "(see 'perf trace -l' for listing)\n"); 660 "(see 'perf script -l' for listing)\n");
644 return -1; 661 return -1;
645 } 662 }
646 } 663 }
@@ -658,8 +675,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
658 675
659 if (!rec_script_path && !rep_script_path) { 676 if (!rec_script_path && !rep_script_path) {
660 fprintf(stderr, " Couldn't find script %s\n\n See perf" 677 fprintf(stderr, " Couldn't find script %s\n\n See perf"
661 " trace -l for available scripts.\n", argv[0]); 678 " script -l for available scripts.\n", argv[0]);
662 usage_with_options(trace_usage, options); 679 usage_with_options(script_usage, options);
663 } 680 }
664 681
665 if (is_top_script(argv[0])) { 682 if (is_top_script(argv[0])) {
@@ -671,9 +688,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
671 rec_args = (argc - 1) - rep_args; 688 rec_args = (argc - 1) - rep_args;
672 if (rec_args < 0) { 689 if (rec_args < 0) {
673 fprintf(stderr, " %s script requires options." 690 fprintf(stderr, " %s script requires options."
674 "\n\n See perf trace -l for available " 691 "\n\n See perf script -l for available "
675 "scripts and options.\n", argv[0]); 692 "scripts and options.\n", argv[0]);
676 usage_with_options(trace_usage, options); 693 usage_with_options(script_usage, options);
677 } 694 }
678 } 695 }
679 696
@@ -806,7 +823,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
806 return -1; 823 return -1;
807 } 824 }
808 825
809 err = scripting_ops->generate_script("perf-trace"); 826 err = scripting_ops->generate_script("perf-script");
810 goto out; 827 goto out;
811 } 828 }
812 829
@@ -814,10 +831,10 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
814 err = scripting_ops->start_script(script_name, argc, argv); 831 err = scripting_ops->start_script(script_name, argc, argv);
815 if (err) 832 if (err)
816 goto out; 833 goto out;
817 pr_debug("perf trace started with script %s\n\n", script_name); 834 pr_debug("perf script started with script %s\n\n", script_name);
818 } 835 }
819 836
820 err = __cmd_trace(session); 837 err = __cmd_script(session);
821 838
822 perf_session__delete(session); 839 perf_session__delete(session);
823 cleanup_scripting(); 840 cleanup_scripting();
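
The is_directory() fallback matters because d_type is only an optimization: filesystems without dirent type support report DT_UNKNOWN, and before this fix the iteration macros silently skipped the scripts directories on such filesystems. A self-contained sketch of the same pattern (the helper name and demo loop are illustrative, not from the patch):

	#include <dirent.h>
	#include <limits.h>
	#include <stdio.h>
	#include <sys/stat.h>

	/* Trust d_type when the filesystem fills it in; stat() the entry
	 * when it reports DT_UNKNOWN, as the patched macros do. */
	static int entry_is_dir(const char *base, const struct dirent *dent)
	{
		char path[PATH_MAX];
		struct stat st;

		if (dent->d_type != DT_UNKNOWN)
			return dent->d_type == DT_DIR;

		snprintf(path, sizeof(path), "%s/%s", base, dent->d_name);
		return !stat(path, &st) && S_ISDIR(st.st_mode);
	}

	int main(void)
	{
		DIR *dir = opendir(".");
		struct dirent *d;

		if (!dir)
			return 1;
		while ((d = readdir(dir)) != NULL)
			printf("%s%s\n", d->d_name,
			       entry_is_dir(".", d) ? "/" : "");
		closedir(dir);
		return 0;
	}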
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a6b4d44f9502..7ff746da7e6c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -52,6 +52,8 @@
52#include <math.h> 52#include <math.h>
53#include <locale.h> 53#include <locale.h>
54 54
55#define DEFAULT_SEPARATOR " "
56
55static struct perf_event_attr default_attrs[] = { 57static struct perf_event_attr default_attrs[] = {
56 58
57 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 59 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
@@ -75,20 +77,30 @@ static int run_idx = 0;
75static int run_count = 1; 77static int run_count = 1;
76static bool no_inherit = false; 78static bool no_inherit = false;
77static bool scale = true; 79static bool scale = true;
80static bool no_aggr = false;
78static pid_t target_pid = -1; 81static pid_t target_pid = -1;
79static pid_t target_tid = -1; 82static pid_t target_tid = -1;
80static pid_t *all_tids = NULL; 83static pid_t *all_tids = NULL;
81static int thread_num = 0; 84static int thread_num = 0;
82static pid_t child_pid = -1; 85static pid_t child_pid = -1;
83static bool null_run = false; 86static bool null_run = false;
84static bool big_num = false; 87static bool big_num = true;
88static int big_num_opt = -1;
85static const char *cpu_list; 89static const char *cpu_list;
90static const char *csv_sep = NULL;
91static bool csv_output = false;
86 92
87 93
88static int *fd[MAX_NR_CPUS][MAX_COUNTERS]; 94static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
89 95
90static int event_scaled[MAX_COUNTERS]; 96static int event_scaled[MAX_COUNTERS];
91 97
98static struct {
99 u64 val;
100 u64 ena;
101 u64 run;
102} cpu_counts[MAX_NR_CPUS][MAX_COUNTERS];
103
92static volatile int done = 0; 104static volatile int done = 0;
93 105
94struct stats 106struct stats
@@ -136,19 +148,19 @@ static double stddev_stats(struct stats *stats)
136} 148}
137 149
138struct stats event_res_stats[MAX_COUNTERS][3]; 150struct stats event_res_stats[MAX_COUNTERS][3];
139struct stats runtime_nsecs_stats; 151struct stats runtime_nsecs_stats[MAX_NR_CPUS];
152struct stats runtime_cycles_stats[MAX_NR_CPUS];
153struct stats runtime_branches_stats[MAX_NR_CPUS];
140struct stats walltime_nsecs_stats; 154struct stats walltime_nsecs_stats;
141struct stats runtime_cycles_stats;
142struct stats runtime_branches_stats;
143 155
144#define MATCH_EVENT(t, c, counter) \ 156#define MATCH_EVENT(t, c, counter) \
145 (attrs[counter].type == PERF_TYPE_##t && \ 157 (attrs[counter].type == PERF_TYPE_##t && \
146 attrs[counter].config == PERF_COUNT_##c) 158 attrs[counter].config == PERF_COUNT_##c)
147 159
148#define ERR_PERF_OPEN \ 160#define ERR_PERF_OPEN \
149"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n" 161"counter %d, sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information."
150 162
151static int create_perf_stat_counter(int counter) 163static int create_perf_stat_counter(int counter, bool *perm_err)
152{ 164{
153 struct perf_event_attr *attr = attrs + counter; 165 struct perf_event_attr *attr = attrs + counter;
154 int thread; 166 int thread;
@@ -164,11 +176,14 @@ static int create_perf_stat_counter(int counter)
164 for (cpu = 0; cpu < nr_cpus; cpu++) { 176 for (cpu = 0; cpu < nr_cpus; cpu++) {
165 fd[cpu][counter][0] = sys_perf_event_open(attr, 177 fd[cpu][counter][0] = sys_perf_event_open(attr,
166 -1, cpumap[cpu], -1, 0); 178 -1, cpumap[cpu], -1, 0);
167 if (fd[cpu][counter][0] < 0) 179 if (fd[cpu][counter][0] < 0) {
168 pr_debug(ERR_PERF_OPEN, counter, 180 if (errno == EPERM || errno == EACCES)
181 *perm_err = true;
182 error(ERR_PERF_OPEN, counter,
169 fd[cpu][counter][0], strerror(errno)); 183 fd[cpu][counter][0], strerror(errno));
170 else 184 } else {
171 ++ncreated; 185 ++ncreated;
186 }
172 } 187 }
173 } else { 188 } else {
174 attr->inherit = !no_inherit; 189 attr->inherit = !no_inherit;
@@ -179,12 +194,15 @@ static int create_perf_stat_counter(int counter)
179 for (thread = 0; thread < thread_num; thread++) { 194 for (thread = 0; thread < thread_num; thread++) {
180 fd[0][counter][thread] = sys_perf_event_open(attr, 195 fd[0][counter][thread] = sys_perf_event_open(attr,
181 all_tids[thread], -1, -1, 0); 196 all_tids[thread], -1, -1, 0);
182 if (fd[0][counter][thread] < 0) 197 if (fd[0][counter][thread] < 0) {
183 pr_debug(ERR_PERF_OPEN, counter, 198 if (errno == EPERM || errno == EACCES)
199 *perm_err = true;
200 error(ERR_PERF_OPEN, counter,
184 fd[0][counter][thread], 201 fd[0][counter][thread],
185 strerror(errno)); 202 strerror(errno));
186 else 203 } else {
187 ++ncreated; 204 ++ncreated;
205 }
188 } 206 }
189 } 207 }
190 208
@@ -205,8 +223,9 @@ static inline int nsec_counter(int counter)
205 223
206/* 224/*
207 * Read out the results of a single counter: 225 * Read out the results of a single counter:
226 * aggregate counts across CPUs in system-wide mode
208 */ 227 */
209static void read_counter(int counter) 228static void read_counter_aggr(int counter)
210{ 229{
211 u64 count[3], single_count[3]; 230 u64 count[3], single_count[3];
212 int cpu; 231 int cpu;
@@ -264,11 +283,58 @@ static void read_counter(int counter)
264 * Save the full runtime - to allow normalization during printout: 283 * Save the full runtime - to allow normalization during printout:
265 */ 284 */
266 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) 285 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
267 update_stats(&runtime_nsecs_stats, count[0]); 286 update_stats(&runtime_nsecs_stats[0], count[0]);
268 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) 287 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
269 update_stats(&runtime_cycles_stats, count[0]); 288 update_stats(&runtime_cycles_stats[0], count[0]);
270 if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) 289 if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
271 update_stats(&runtime_branches_stats, count[0]); 290 update_stats(&runtime_branches_stats[0], count[0]);
291}
292
293/*
294 * Read out the results of a single counter:
295 * do not aggregate counts across CPUs in system-wide mode
296 */
297static void read_counter(int counter)
298{
299 u64 count[3];
300 int cpu;
301 size_t res, nv;
302
303 count[0] = count[1] = count[2] = 0;
304
305 nv = scale ? 3 : 1;
306
307 for (cpu = 0; cpu < nr_cpus; cpu++) {
308
309 if (fd[cpu][counter][0] < 0)
310 continue;
311
312 res = read(fd[cpu][counter][0], count, nv * sizeof(u64));
313
314 assert(res == nv * sizeof(u64));
315
316 close(fd[cpu][counter][0]);
317 fd[cpu][counter][0] = -1;
318
319 if (scale) {
320 if (count[2] == 0) {
321 count[0] = 0;
322 } else if (count[2] < count[1]) {
323 count[0] = (unsigned long long)
324 ((double)count[0] * count[1] / count[2] + 0.5);
325 }
326 }
327 cpu_counts[cpu][counter].val = count[0]; /* scaled count */
328 cpu_counts[cpu][counter].ena = count[1];
329 cpu_counts[cpu][counter].run = count[2];
330
331 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
332 update_stats(&runtime_nsecs_stats[cpu], count[0]);
333 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
334 update_stats(&runtime_cycles_stats[cpu], count[0]);
335 if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
336 update_stats(&runtime_branches_stats[cpu], count[0]);
337 }
272} 338}
273 339
274static int run_perf_stat(int argc __used, const char **argv) 340static int run_perf_stat(int argc __used, const char **argv)
@@ -277,6 +343,7 @@ static int run_perf_stat(int argc __used, const char **argv)
277 int status = 0; 343 int status = 0;
278 int counter, ncreated = 0; 344 int counter, ncreated = 0;
279 int child_ready_pipe[2], go_pipe[2]; 345 int child_ready_pipe[2], go_pipe[2];
346 bool perm_err = false;
280 const bool forks = (argc > 0); 347 const bool forks = (argc > 0);
281 char buf; 348 char buf;
282 349
@@ -335,12 +402,15 @@ static int run_perf_stat(int argc __used, const char **argv)
335 } 402 }
336 403
337 for (counter = 0; counter < nr_counters; counter++) 404 for (counter = 0; counter < nr_counters; counter++)
338 ncreated += create_perf_stat_counter(counter); 405 ncreated += create_perf_stat_counter(counter, &perm_err);
339 406
340 if (ncreated == 0) { 407 if (ncreated < nr_counters) {
341 pr_err("No permission to collect %sstats.\n" 408 if (perm_err)
342 "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n", 409 error("You may not have permission to collect %sstats.\n"
343 system_wide ? "system-wide " : ""); 410 "\t Consider tweaking"
411 " /proc/sys/kernel/perf_event_paranoid or running as root.",
412 system_wide ? "system-wide " : "");
413 die("Not all events could be opened.\n");
344 if (child_pid != -1) 414 if (child_pid != -1)
345 kill(child_pid, SIGTERM); 415 kill(child_pid, SIGTERM);
346 return -1; 416 return -1;
@@ -362,9 +432,13 @@ static int run_perf_stat(int argc __used, const char **argv)
362 432
363 update_stats(&walltime_nsecs_stats, t1 - t0); 433 update_stats(&walltime_nsecs_stats, t1 - t0);
364 434
365 for (counter = 0; counter < nr_counters; counter++) 435 if (no_aggr) {
366 read_counter(counter); 436 for (counter = 0; counter < nr_counters; counter++)
367 437 read_counter(counter);
438 } else {
439 for (counter = 0; counter < nr_counters; counter++)
440 read_counter_aggr(counter);
441 }
368 return WEXITSTATUS(status); 442 return WEXITSTATUS(status);
369} 443}
370 444
@@ -377,11 +451,21 @@ static void print_noise(int counter, double avg)
377 100 * stddev_stats(&event_res_stats[counter][0]) / avg); 451 100 * stddev_stats(&event_res_stats[counter][0]) / avg);
378} 452}
379 453
380static void nsec_printout(int counter, double avg) 454static void nsec_printout(int cpu, int counter, double avg)
381{ 455{
382 double msecs = avg / 1e6; 456 double msecs = avg / 1e6;
457 char cpustr[16] = { '\0', };
458 const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s";
459
460 if (no_aggr)
461 sprintf(cpustr, "CPU%*d%s",
462 csv_output ? 0 : -4,
463 cpumap[cpu], csv_sep);
383 464
384 fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter)); 465 fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(counter));
466
467 if (csv_output)
468 return;
385 469
386 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { 470 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
387 fprintf(stderr, " # %10.3f CPUs ", 471 fprintf(stderr, " # %10.3f CPUs ",
@@ -389,33 +473,49 @@ static void nsec_printout(int counter, double avg)
389 } 473 }
390} 474}
391 475
392static void abs_printout(int counter, double avg) 476static void abs_printout(int cpu, int counter, double avg)
393{ 477{
394 double total, ratio = 0.0; 478 double total, ratio = 0.0;
479 char cpustr[16] = { '\0', };
480 const char *fmt;
481
482 if (csv_output)
483 fmt = "%s%.0f%s%s";
484 else if (big_num)
485 fmt = "%s%'18.0f%s%-24s";
486 else
487 fmt = "%s%18.0f%s%-24s";
395 488
396 if (big_num) 489 if (no_aggr)
397 fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter)); 490 sprintf(cpustr, "CPU%*d%s",
491 csv_output ? 0 : -4,
492 cpumap[cpu], csv_sep);
398 else 493 else
399 fprintf(stderr, " %18.0f %-24s", avg, event_name(counter)); 494 cpu = 0;
495
496 fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(counter));
497
498 if (csv_output)
499 return;
400 500
401 if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { 501 if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
402 total = avg_stats(&runtime_cycles_stats); 502 total = avg_stats(&runtime_cycles_stats[cpu]);
403 503
404 if (total) 504 if (total)
405 ratio = avg / total; 505 ratio = avg / total;
406 506
407 fprintf(stderr, " # %10.3f IPC ", ratio); 507 fprintf(stderr, " # %10.3f IPC ", ratio);
408 } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) && 508 } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
409 runtime_branches_stats.n != 0) { 509 runtime_branches_stats[cpu].n != 0) {
410 total = avg_stats(&runtime_branches_stats); 510 total = avg_stats(&runtime_branches_stats[cpu]);
411 511
412 if (total) 512 if (total)
413 ratio = avg * 100 / total; 513 ratio = avg * 100 / total;
414 514
415 fprintf(stderr, " # %10.3f %% ", ratio); 515 fprintf(stderr, " # %10.3f %% ", ratio);
416 516
417 } else if (runtime_nsecs_stats.n != 0) { 517 } else if (runtime_nsecs_stats[cpu].n != 0) {
418 total = avg_stats(&runtime_nsecs_stats); 518 total = avg_stats(&runtime_nsecs_stats[cpu]);
419 519
420 if (total) 520 if (total)
421 ratio = 1000.0 * avg / total; 521 ratio = 1000.0 * avg / total;
@@ -426,22 +526,29 @@ static void abs_printout(int counter, double avg)
426 526
427/* 527/*
428 * Print out the results of a single counter: 528 * Print out the results of a single counter:
529 * aggregated counts in system-wide mode
429 */ 530 */
430static void print_counter(int counter) 531static void print_counter_aggr(int counter)
431{ 532{
432 double avg = avg_stats(&event_res_stats[counter][0]); 533 double avg = avg_stats(&event_res_stats[counter][0]);
433 int scaled = event_scaled[counter]; 534 int scaled = event_scaled[counter];
434 535
435 if (scaled == -1) { 536 if (scaled == -1) {
436 fprintf(stderr, " %18s %-24s\n", 537 fprintf(stderr, "%*s%s%-24s\n",
437 "<not counted>", event_name(counter)); 538 csv_output ? 0 : 18,
539 "<not counted>", csv_sep, event_name(counter));
438 return; 540 return;
439 } 541 }
440 542
441 if (nsec_counter(counter)) 543 if (nsec_counter(counter))
442 nsec_printout(counter, avg); 544 nsec_printout(-1, counter, avg);
443 else 545 else
444 abs_printout(counter, avg); 546 abs_printout(-1, counter, avg);
547
548 if (csv_output) {
549 fputc('\n', stderr);
550 return;
551 }
445 552
446 print_noise(counter, avg); 553 print_noise(counter, avg);
447 554
@@ -458,40 +565,91 @@ static void print_counter(int counter)
458 fprintf(stderr, "\n"); 565 fprintf(stderr, "\n");
459} 566}
460 567
568/*
569 * Print out the results of a single counter:
570 * does not aggregate counts across CPUs in system-wide mode
571 */
572static void print_counter(int counter)
573{
574 u64 ena, run, val;
575 int cpu;
576
577 for (cpu = 0; cpu < nr_cpus; cpu++) {
578 val = cpu_counts[cpu][counter].val;
579 ena = cpu_counts[cpu][counter].ena;
580 run = cpu_counts[cpu][counter].run;
581 if (run == 0 || ena == 0) {
582 fprintf(stderr, "CPU%*d%s%*s%s%-24s",
583 csv_output ? 0 : -4,
584 cpumap[cpu], csv_sep,
585 csv_output ? 0 : 18,
586 "<not counted>", csv_sep,
587 event_name(counter));
588
589 fprintf(stderr, "\n");
590 continue;
591 }
592
593 if (nsec_counter(counter))
594 nsec_printout(cpu, counter, val);
595 else
596 abs_printout(cpu, counter, val);
597
598 if (!csv_output) {
599 print_noise(counter, 1.0);
600
601 if (run != ena) {
602 fprintf(stderr, " (scaled from %.2f%%)",
603 100.0 * run / ena);
604 }
605 }
606 fprintf(stderr, "\n");
607 }
608}
609
461static void print_stat(int argc, const char **argv) 610static void print_stat(int argc, const char **argv)
462{ 611{
463 int i, counter; 612 int i, counter;
464 613
465 fflush(stdout); 614 fflush(stdout);
466 615
467 fprintf(stderr, "\n"); 616 if (!csv_output) {
468 fprintf(stderr, " Performance counter stats for "); 617 fprintf(stderr, "\n");
469 if(target_pid == -1 && target_tid == -1) { 618 fprintf(stderr, " Performance counter stats for ");
470 fprintf(stderr, "\'%s", argv[0]); 619 if(target_pid == -1 && target_tid == -1) {
471 for (i = 1; i < argc; i++) 620 fprintf(stderr, "\'%s", argv[0]);
472 fprintf(stderr, " %s", argv[i]); 621 for (i = 1; i < argc; i++)
473 } else if (target_pid != -1) 622 fprintf(stderr, " %s", argv[i]);
474 fprintf(stderr, "process id \'%d", target_pid); 623 } else if (target_pid != -1)
475 else 624 fprintf(stderr, "process id \'%d", target_pid);
476 fprintf(stderr, "thread id \'%d", target_tid); 625 else
477 626 fprintf(stderr, "thread id \'%d", target_tid);
478 fprintf(stderr, "\'"); 627
479 if (run_count > 1) 628 fprintf(stderr, "\'");
480 fprintf(stderr, " (%d runs)", run_count); 629 if (run_count > 1)
481 fprintf(stderr, ":\n\n"); 630 fprintf(stderr, " (%d runs)", run_count);
631 fprintf(stderr, ":\n\n");
632 }
482 633
483 for (counter = 0; counter < nr_counters; counter++) 634 if (no_aggr) {
484 print_counter(counter); 635 for (counter = 0; counter < nr_counters; counter++)
636 print_counter(counter);
637 } else {
638 for (counter = 0; counter < nr_counters; counter++)
639 print_counter_aggr(counter);
640 }
485 641
486 fprintf(stderr, "\n"); 642 if (!csv_output) {
487 fprintf(stderr, " %18.9f seconds time elapsed", 643 fprintf(stderr, "\n");
488 avg_stats(&walltime_nsecs_stats)/1e9); 644 fprintf(stderr, " %18.9f seconds time elapsed",
489 if (run_count > 1) { 645 avg_stats(&walltime_nsecs_stats)/1e9);
490 fprintf(stderr, " ( +- %7.3f%% )", 646 if (run_count > 1) {
647 fprintf(stderr, " ( +- %7.3f%% )",
491 100*stddev_stats(&walltime_nsecs_stats) / 648 100*stddev_stats(&walltime_nsecs_stats) /
492 avg_stats(&walltime_nsecs_stats)); 649 avg_stats(&walltime_nsecs_stats));
650 }
651 fprintf(stderr, "\n\n");
493 } 652 }
494 fprintf(stderr, "\n\n");
495} 653}
496 654
497static volatile int signr = -1; 655static volatile int signr = -1;
@@ -521,6 +679,13 @@ static const char * const stat_usage[] = {
521 NULL 679 NULL
522}; 680};
523 681
682static int stat__set_big_num(const struct option *opt __used,
683 const char *s __used, int unset)
684{
685 big_num_opt = unset ? 0 : 1;
686 return 0;
687}
688
524static const struct option options[] = { 689static const struct option options[] = {
525 OPT_CALLBACK('e', "event", NULL, "event", 690 OPT_CALLBACK('e', "event", NULL, "event",
526 "event selector. use 'perf list' to list available events", 691 "event selector. use 'perf list' to list available events",
@@ -541,10 +706,15 @@ static const struct option options[] = {
541 "repeat command and print average + stddev (max: 100)"), 706 "repeat command and print average + stddev (max: 100)"),
542 OPT_BOOLEAN('n', "null", &null_run, 707 OPT_BOOLEAN('n', "null", &null_run,
543 "null run - dont start any counters"), 708 "null run - dont start any counters"),
544 OPT_BOOLEAN('B', "big-num", &big_num, 709 OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
545 "print large numbers with thousands\' separators"), 710 "print large numbers with thousands\' separators",
711 stat__set_big_num),
546 OPT_STRING('C', "cpu", &cpu_list, "cpu", 712 OPT_STRING('C', "cpu", &cpu_list, "cpu",
547 "list of cpus to monitor in system-wide"), 713 "list of cpus to monitor in system-wide"),
714 OPT_BOOLEAN('A', "no-aggr", &no_aggr,
715 "disable CPU count aggregation"),
716 OPT_STRING('x', "field-separator", &csv_sep, "separator",
717 "print counts with custom separator"),
548 OPT_END() 718 OPT_END()
549}; 719};
550 720
@@ -557,11 +727,34 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
557 727
558 argc = parse_options(argc, argv, options, stat_usage, 728 argc = parse_options(argc, argv, options, stat_usage,
559 PARSE_OPT_STOP_AT_NON_OPTION); 729 PARSE_OPT_STOP_AT_NON_OPTION);
730
731 if (csv_sep)
732 csv_output = true;
733 else
734 csv_sep = DEFAULT_SEPARATOR;
735
736 /*
737 * let the spreadsheet do the pretty-printing
738 */
739 if (csv_output) {
740		/* User explicitly passed -B? */
741 if (big_num_opt == 1) {
742 fprintf(stderr, "-B option not supported with -x\n");
743 usage_with_options(stat_usage, options);
744 } else /* Nope, so disable big number formatting */
745 big_num = false;
746 } else if (big_num_opt == 0) /* User passed --no-big-num */
747 big_num = false;
748
560 if (!argc && target_pid == -1 && target_tid == -1) 749 if (!argc && target_pid == -1 && target_tid == -1)
561 usage_with_options(stat_usage, options); 750 usage_with_options(stat_usage, options);
562 if (run_count <= 0) 751 if (run_count <= 0)
563 usage_with_options(stat_usage, options); 752 usage_with_options(stat_usage, options);
564 753
754 /* no_aggr is for system-wide only */
755 if (no_aggr && !system_wide)
756 usage_with_options(stat_usage, options);
757
565 /* Set attrs and nr_counters if no event is selected and !null_run */ 758 /* Set attrs and nr_counters if no event is selected and !null_run */
566 if (!null_run && !nr_counters) { 759 if (!null_run && !nr_counters) {
567 memcpy(attrs, default_attrs, sizeof(default_attrs)); 760 memcpy(attrs, default_attrs, sizeof(default_attrs));
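
The per-CPU read path applies perf's usual multiplexing correction: with scaling enabled, the kernel returns the raw count plus time-enabled and time-running, and the estimate is raw * enabled / running, with the same +0.5 rounding as read_counter() above. A worked standalone sketch:

	#include <stdio.h>

	typedef unsigned long long u64;

	/* count[0] = raw value, count[1] = time enabled, count[2] = time
	 * running, mirroring read_counter()'s scaling branch */
	static u64 scale_count(u64 raw, u64 ena, u64 run)
	{
		if (run == 0)
			return 0;	/* never scheduled: <not counted> */
		if (run < ena)		/* multiplexed: interpolate */
			return (u64)((double)raw * ena / run + 0.5);
		return raw;		/* counted the whole time */
	}

	int main(void)
	{
		/* 1,000,000 events seen while running 5 of 10 enabled ms
		 * -> estimate 2,000,000, reported as "scaled from 50.00%" */
		printf("%llu\n",
		       scale_count(1000000ULL, 10000000ULL, 5000000ULL));
		return 0;
	}

The new -A and -x options pair naturally with this path: an invocation along the lines of perf stat -a -A -x, emits one separator-delimited line per CPU per event, leaving the pretty-printing to a spreadsheet as the csv_output comment says.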
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index dd625808c2a5..3d2b47d5121a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1214,7 +1214,9 @@ try_again:
1214 int err = errno; 1214 int err = errno;
1215 1215
1216 if (err == EPERM || err == EACCES) 1216 if (err == EPERM || err == EACCES)
1217 die("No permission - are you root?\n"); 1217 die("Permission error - are you root?\n"
1218 "\t Consider tweaking"
1219 " /proc/sys/kernel/perf_event_paranoid.\n");
1218 /* 1220 /*
1219 * If it's cycles then fall back to hrtimer 1221 * If it's cycles then fall back to hrtimer
1220 * based cpu-clock-tick sw counter, which 1222 * based cpu-clock-tick sw counter, which
@@ -1231,7 +1233,7 @@ try_again:
1231 goto try_again; 1233 goto try_again;
1232 } 1234 }
1233 printf("\n"); 1235 printf("\n");
1234 error("perfcounter syscall returned with %d (%s)\n", 1236 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
1235 fd[i][counter][thread_index], strerror(err)); 1237 fd[i][counter][thread_index], strerror(err));
1236 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); 1238 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
1237 exit(-1); 1239 exit(-1);
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 921245b28583..c7798c7f24ed 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -27,7 +27,7 @@ extern int cmd_report(int argc, const char **argv, const char *prefix);
27extern int cmd_stat(int argc, const char **argv, const char *prefix); 27extern int cmd_stat(int argc, const char **argv, const char *prefix);
28extern int cmd_timechart(int argc, const char **argv, const char *prefix); 28extern int cmd_timechart(int argc, const char **argv, const char *prefix);
29extern int cmd_top(int argc, const char **argv, const char *prefix); 29extern int cmd_top(int argc, const char **argv, const char *prefix);
30extern int cmd_trace(int argc, const char **argv, const char *prefix); 30extern int cmd_script(int argc, const char **argv, const char *prefix);
31extern int cmd_version(int argc, const char **argv, const char *prefix); 31extern int cmd_version(int argc, const char **argv, const char *prefix);
32extern int cmd_probe(int argc, const char **argv, const char *prefix); 32extern int cmd_probe(int argc, const char **argv, const char *prefix);
33extern int cmd_kmem(int argc, const char **argv, const char *prefix); 33extern int cmd_kmem(int argc, const char **argv, const char *prefix);
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index 949d77fc0b97..16b5088cf8f4 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -16,7 +16,7 @@ perf-report mainporcelain common
16perf-stat mainporcelain common 16perf-stat mainporcelain common
17perf-timechart mainporcelain common 17perf-timechart mainporcelain common
18perf-top mainporcelain common 18perf-top mainporcelain common
19perf-trace mainporcelain common 19perf-script mainporcelain common
20perf-probe mainporcelain common 20perf-probe mainporcelain common
21perf-kmem mainporcelain common 21perf-kmem mainporcelain common
22perf-lock mainporcelain common 22perf-lock mainporcelain common
diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak
index b253db634f04..b041ca67a2cb 100644
--- a/tools/perf/feature-tests.mak
+++ b/tools/perf/feature-tests.mak
@@ -9,8 +9,8 @@ endef
9ifndef NO_DWARF 9ifndef NO_DWARF
10define SOURCE_DWARF 10define SOURCE_DWARF
11#include <dwarf.h> 11#include <dwarf.h>
12#include <libdw.h> 12#include <elfutils/libdw.h>
13#include <version.h> 13#include <elfutils/version.h>
14#ifndef _ELFUTILS_PREREQ 14#ifndef _ELFUTILS_PREREQ
15#error 15#error
16#endif 16#endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index cdd6c03f1e14..595d0f4a7103 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -323,7 +323,7 @@ static void handle_internal_command(int argc, const char **argv)
323 { "top", cmd_top, 0 }, 323 { "top", cmd_top, 0 },
324 { "annotate", cmd_annotate, 0 }, 324 { "annotate", cmd_annotate, 0 },
325 { "version", cmd_version, 0 }, 325 { "version", cmd_version, 0 },
326 { "trace", cmd_trace, 0 }, 326 { "script", cmd_script, 0 },
327 { "sched", cmd_sched, 0 }, 327 { "sched", cmd_sched, 0 },
328 { "probe", cmd_probe, 0 }, 328 { "probe", cmd_probe, 0 },
329 { "kmem", cmd_kmem, 0 }, 329 { "kmem", cmd_kmem, 0 },
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
index 957085dd5d8d..315067b8f552 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Context.c. Python interfaces for perf trace. 2 * Context.c. Python interfaces for perf script.
3 * 3 *
4 * Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com> 4 * Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com>
5 * 5 *
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index c8d81b00089d..01bbe8ecec3f 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -46,20 +46,16 @@ int dump_printf(const char *fmt, ...)
46 return ret; 46 return ret;
47} 47}
48 48
49static int dump_printf_color(const char *fmt, const char *color, ...) 49#ifdef NO_NEWT_SUPPORT
50void ui__warning(const char *format, ...)
50{ 51{
51 va_list args; 52 va_list args;
52 int ret = 0;
53 53
54 if (dump_trace) { 54 va_start(args, format);
55 va_start(args, color); 55 vfprintf(stderr, format, args);
56 ret = color_vfprintf(stdout, color, fmt, args); 56 va_end(args);
57 va_end(args);
58 }
59
60 return ret;
61} 57}
62 58#endif
63 59
64void trace_event(event_t *event) 60void trace_event(event_t *event)
65{ 61{
@@ -70,29 +66,29 @@ void trace_event(event_t *event)
70 if (!dump_trace) 66 if (!dump_trace)
71 return; 67 return;
72 68
73 dump_printf("."); 69 printf(".");
74 dump_printf_color("\n. ... raw event: size %d bytes\n", color, 70 color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
75 event->header.size); 71 event->header.size);
76 72
77 for (i = 0; i < event->header.size; i++) { 73 for (i = 0; i < event->header.size; i++) {
78 if ((i & 15) == 0) { 74 if ((i & 15) == 0) {
79 dump_printf("."); 75 printf(".");
80 dump_printf_color(" %04x: ", color, i); 76 color_fprintf(stdout, color, " %04x: ", i);
81 } 77 }
82 78
83 dump_printf_color(" %02x", color, raw_event[i]); 79 color_fprintf(stdout, color, " %02x", raw_event[i]);
84 80
85 if (((i & 15) == 15) || i == event->header.size-1) { 81 if (((i & 15) == 15) || i == event->header.size-1) {
86 dump_printf_color(" ", color); 82 color_fprintf(stdout, color, " ");
87 for (j = 0; j < 15-(i & 15); j++) 83 for (j = 0; j < 15-(i & 15); j++)
88 dump_printf_color(" ", color); 84 color_fprintf(stdout, color, " ");
89 for (j = i & ~15; j <= i; j++) { 85 for (j = i & ~15; j <= i; j++) {
90 dump_printf_color("%c", color, 86 color_fprintf(stdout, color, "%c",
91 isprint(raw_event[j]) ? 87 isprint(raw_event[j]) ?
92 raw_event[j] : '.'); 88 raw_event[j] : '.');
93 } 89 }
94 dump_printf_color("\n", color); 90 color_fprintf(stdout, color, "\n");
95 } 91 }
96 } 92 }
97 dump_printf(".\n"); 93 printf(".\n");
98} 94}
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 7b514082bbaf..ca35fd66b5df 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -35,4 +35,6 @@ int ui_helpline__show_help(const char *format, va_list ap);
35#include "ui/progress.h" 35#include "ui/progress.h"
36#endif 36#endif
37 37
38void ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2)));
39
38#endif /* __PERF_DEBUG_H */ 40#endif /* __PERF_DEBUG_H */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index dab9e754a281..7260db75b93d 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -392,7 +392,7 @@ static void event_set_kernel_mmap_len(struct map **maps, event_t *self)
392 * a zero sized synthesized MMAP event for the kernel. 392 * a zero sized synthesized MMAP event for the kernel.
393 */ 393 */
394 if (maps[MAP__FUNCTION]->end == 0) 394 if (maps[MAP__FUNCTION]->end == 0)
395 maps[MAP__FUNCTION]->end = ~0UL; 395 maps[MAP__FUNCTION]->end = ~0ULL;
396} 396}
397 397
398static int event__process_kernel_mmap(event_t *self, 398static int event__process_kernel_mmap(event_t *self,
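
The ~0UL to ~0ULL fix only bites on 32-bit builds, where unsigned long is 32 bits and the old constant zero-extended into the 64-bit map end instead of marking it unbounded. A small demonstration (the two lines print the same value only when long is 64 bits wide):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long end;

		end = ~0UL;	/* 0x00000000ffffffff with a 32-bit long */
		printf("%#llx\n", end);

		end = ~0ULL;	/* all ones regardless of ABI */
		printf("%#llx\n", end);
		return 0;
	}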
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d7e67b167ea3..f65d7dc127b6 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -152,6 +152,11 @@ void perf_header__set_feat(struct perf_header *self, int feat)
152 set_bit(feat, self->adds_features); 152 set_bit(feat, self->adds_features);
153} 153}
154 154
155void perf_header__clear_feat(struct perf_header *self, int feat)
156{
157 clear_bit(feat, self->adds_features);
158}
159
155bool perf_header__has_feat(const struct perf_header *self, int feat) 160bool perf_header__has_feat(const struct perf_header *self, int feat)
156{ 161{
157 return test_bit(feat, self->adds_features); 162 return test_bit(feat, self->adds_features);
@@ -431,8 +436,10 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
431 int idx = 0, err; 436 int idx = 0, err;
432 437
433 session = container_of(self, struct perf_session, header); 438 session = container_of(self, struct perf_session, header);
434 if (perf_session__read_build_ids(session, true)) 439
435 perf_header__set_feat(self, HEADER_BUILD_ID); 440 if (perf_header__has_feat(self, HEADER_BUILD_ID) &&
 441 !perf_session__read_build_ids(session, true))
442 perf_header__clear_feat(self, HEADER_BUILD_ID);
436 443
437 nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); 444 nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
438 if (!nr_sections) 445 if (!nr_sections)
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 402ac2454cf8..ed550bffd655 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -84,6 +84,7 @@ u64 perf_header__sample_type(struct perf_header *header);
84struct perf_event_attr * 84struct perf_event_attr *
85perf_header__find_attr(u64 id, struct perf_header *header); 85perf_header__find_attr(u64 id, struct perf_header *header);
86void perf_header__set_feat(struct perf_header *self, int feat); 86void perf_header__set_feat(struct perf_header *self, int feat);
87void perf_header__clear_feat(struct perf_header *self, int feat);
87bool perf_header__has_feat(const struct perf_header *self, int feat); 88bool perf_header__has_feat(const struct perf_header *self, int feat);
88 89
89int perf_header__process_sections(struct perf_header *self, int fd, 90int perf_header__process_sections(struct perf_header *self, int fd,
diff --git a/tools/perf/util/include/asm/cpufeature.h b/tools/perf/util/include/asm/cpufeature.h
new file mode 100644
index 000000000000..acffd5e4d1d4
--- /dev/null
+++ b/tools/perf/util/include/asm/cpufeature.h
@@ -0,0 +1,9 @@
1
2#ifndef PERF_CPUFEATURE_H
3#define PERF_CPUFEATURE_H
4
5/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
6
7#define X86_FEATURE_REP_GOOD 0
8
9#endif /* PERF_CPUFEATURE_H */
diff --git a/tools/perf/util/include/asm/dwarf2.h b/tools/perf/util/include/asm/dwarf2.h
new file mode 100644
index 000000000000..bb4198e7837a
--- /dev/null
+++ b/tools/perf/util/include/asm/dwarf2.h
@@ -0,0 +1,11 @@
1
2#ifndef PERF_DWARF2_H
3#define PERF_DWARF2_H
4
5/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
6
7#define CFI_STARTPROC
8#define CFI_ENDPROC
9
10#endif /* PERF_DWARF2_H */
11
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index bb4ac2e05385..8be0b968ca0b 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -13,6 +13,11 @@ static inline void set_bit(int nr, unsigned long *addr)
13 addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); 13 addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
14} 14}
15 15
16static inline void clear_bit(int nr, unsigned long *addr)
17{
18 addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
19}
20
16static __always_inline int test_bit(unsigned int nr, const unsigned long *addr) 21static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
17{ 22{
18 return ((1UL << (nr % BITS_PER_LONG)) & 23 return ((1UL << (nr % BITS_PER_LONG)) &
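
clear_bit() mirrors the existing set_bit(): word index nr / BITS_PER_LONG, mask 1UL << (nr % BITS_PER_LONG), cleared with &= ~mask. It exists so the perf_header__clear_feat() added above can drop a feature bit again. A worked illustration, assuming 64-bit longs:

	#include <stdio.h>

	#define BITS_PER_LONG (8 * sizeof(unsigned long))

	int main(void)
	{
		unsigned long bitmap[2] = { 0, 0 };
		int nr = 67;	/* word 67/64 = 1, bit 67%64 = 3 */

		bitmap[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
		printf("%#lx\n", bitmap[1]);	/* 0x8: bit set */

		bitmap[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
		printf("%#lx\n", bitmap[1]);	/* 0: bit cleared */
		return 0;
	}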
diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h
new file mode 100644
index 000000000000..06387cffe125
--- /dev/null
+++ b/tools/perf/util/include/linux/linkage.h
@@ -0,0 +1,13 @@
1
2#ifndef PERF_LINUX_LINKAGE_H_
3#define PERF_LINUX_LINKAGE_H_
4
5/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
6
7#define ENTRY(name) \
8 .globl name; \
9 name:
10
11#define ENDPROC(name)
12
13#endif /* PERF_LINUX_LINKAGE_H_ */
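
Together with the cpufeature.h and dwarf2.h stubs above, this header lets perf assemble the kernel's arch/x86/lib/memcpy_64.S verbatim: the CFI annotations become no-ops, X86_FEATURE_REP_GOOD is defined only so the file assembles (nothing applies alternatives in userspace, so the default unrolled body is what runs, matching the "x86-64-unrolled" name), and ENTRY()/ENDPROC() reduce to a plain global label. For instance, ENTRY(memcpy) in that file expands to roughly:

	.globl memcpy;
	memcpy:

which is why the benchmark can call __memcpy directly with no kernel infrastructure behind it.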
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4af5bd59cfd1..c305305a3884 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -434,7 +434,7 @@ parse_single_tracepoint_event(char *sys_name,
434 id = atoll(id_buf); 434 id = atoll(id_buf);
435 attr->config = id; 435 attr->config = id;
436 attr->type = PERF_TYPE_TRACEPOINT; 436 attr->type = PERF_TYPE_TRACEPOINT;
437 *strp = evt_name + evt_length; 437 *strp += strlen(sys_name) + evt_length + 1; /* + 1 for the ':' */
438 438
439 attr->sample_type |= PERF_SAMPLE_RAW; 439 attr->sample_type |= PERF_SAMPLE_RAW;
440 attr->sample_type |= PERF_SAMPLE_TIME; 440 attr->sample_type |= PERF_SAMPLE_TIME;
@@ -495,7 +495,7 @@ static enum event_result parse_tracepoint_event(const char **strp,
495 struct perf_event_attr *attr) 495 struct perf_event_attr *attr)
496{ 496{
497 const char *evt_name; 497 const char *evt_name;
498 char *flags; 498 char *flags = NULL, *comma_loc;
499 char sys_name[MAX_EVENT_LENGTH]; 499 char sys_name[MAX_EVENT_LENGTH];
500 unsigned int sys_length, evt_length; 500 unsigned int sys_length, evt_length;
501 501
@@ -514,6 +514,11 @@ static enum event_result parse_tracepoint_event(const char **strp,
514 sys_name[sys_length] = '\0'; 514 sys_name[sys_length] = '\0';
515 evt_name = evt_name + 1; 515 evt_name = evt_name + 1;
516 516
517 comma_loc = strchr(evt_name, ',');
518 if (comma_loc) {
519 /* take the event name up to the comma */
520 evt_name = strndup(evt_name, comma_loc - evt_name);
521 }
517 flags = strchr(evt_name, ':'); 522 flags = strchr(evt_name, ':');
518 if (flags) { 523 if (flags) {
519 /* split it out: */ 524 /* split it out: */
@@ -524,9 +529,8 @@ static enum event_result parse_tracepoint_event(const char **strp,
 	evt_length = strlen(evt_name);
 	if (evt_length >= MAX_EVENT_LENGTH)
 		return EVT_FAILED;
-
 	if (strpbrk(evt_name, "*?")) {
-		*strp = evt_name + evt_length;
+		*strp += strlen(sys_name) + evt_length;
 		return parse_multiple_tracepoint_event(sys_name, evt_name,
 						       flags);
 	} else
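The *strp changes above matter because evt_name may now point at a strndup()'ed copy rather than into the caller's string, so computing the new parse position from evt_name would leave *strp pointing at the wrong buffer. Advancing the cursor by the consumed lengths keeps it inside the original input. A sketch of the idea (the input string and local parsing are illustrative, not the perf parser itself):

#include <stdio.h>
#include <string.h>

/* Illustrative only: advance a parse cursor past "sys:event" by consumed
 * lengths, never by pointers into a duplicated substring. */
int main(void)
{
	const char *input = "sched:sched_switch,sched:sched_wakeup";
	const char *cursor = input;

	char sys_name[64], evt_name[64];
	const char *colon = strchr(cursor, ':');
	const char *comma = strchr(colon + 1, ',');

	snprintf(sys_name, sizeof(sys_name), "%.*s", (int)(colon - cursor), cursor);
	snprintf(evt_name, sizeof(evt_name), "%.*s",
		 (int)(comma - colon - 1), colon + 1);

	/* like the patch: strlen(sys_name) + evt_length + 1 for the ':' */
	cursor += strlen(sys_name) + strlen(evt_name) + 1;

	printf("parsed %s:%s, cursor now at \"%s\"\n", sys_name, evt_name, cursor);
	return 0;
}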
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index bba69d455699..beaefc3c1223 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -34,9 +34,9 @@ extern int find_available_vars_at(int fd, struct perf_probe_event *pev,
 				   bool externs);
 
 #include <dwarf.h>
-#include <libdw.h>
-#include <libdwfl.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/libdwfl.h>
+#include <elfutils/version.h>
 
 struct probe_finder {
 	struct perf_probe_event	*pev;		/* Target probe event */
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index b059dc50cc2d..93680818e244 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -1,5 +1,5 @@
 /*
- * trace-event-perl.  Feed perf trace events to an embedded Perl interpreter.
+ * trace-event-perl.  Feed perf script events to an embedded Perl interpreter.
  *
  * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
  *
@@ -411,8 +411,8 @@ static int perl_generate_script(const char *outfile)
 		return -1;
 	}
 
-	fprintf(ofp, "# perf trace event handlers, "
-		     "generated by perf trace -g perl\n");
+	fprintf(ofp, "# perf script event handlers, "
+		     "generated by perf script -g perl\n");
 
 	fprintf(ofp, "# Licensed under the terms of the GNU GPL"
 		     " License version 2\n\n");
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 33a632523743..c6d99334bdfa 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -442,8 +442,8 @@ static int python_generate_script(const char *outfile)
 		fprintf(stderr, "couldn't open %s\n", fname);
 		return -1;
 	}
-	fprintf(ofp, "# perf trace event handlers, "
-		     "generated by perf trace -g python\n");
+	fprintf(ofp, "# perf script event handlers, "
+		     "generated by perf script -g python\n");
 
 	fprintf(ofp, "# Licensed under the terms of the GNU GPL"
 		     " License version 2\n\n");
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index fa9d652c2dc3..52672dad1fe9 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -101,10 +101,20 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	INIT_LIST_HEAD(&self->dead_threads);
 	self->hists_tree = RB_ROOT;
 	self->last_match = NULL;
-	self->mmap_window = 32;
+	/*
+	 * On 64bit we can mmap the data file in one go. No need for tiny mmap
+	 * slices. On 32bit we use 32MB.
+	 */
+#if BITS_PER_LONG == 64
+	self->mmap_window = ULLONG_MAX;
+#else
+	self->mmap_window = 32 * 1024 * 1024ULL;
+#endif
 	self->machines = RB_ROOT;
 	self->repipe = repipe;
-	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
+	INIT_LIST_HEAD(&self->ordered_samples.samples);
+	INIT_LIST_HEAD(&self->ordered_samples.sample_cache);
+	INIT_LIST_HEAD(&self->ordered_samples.to_free);
 	machine__init(&self->host_machine, "", HOST_KERNEL_ID);
 
 	if (mode == O_RDONLY) {
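The hunk above replaces a fixed 32-page window with an effectively unbounded one on 64-bit, where address space is plentiful; 32-bit stays windowed at 32MB. A reduced sketch of that decision (illustrative, not perf code; the process loop later clamps the window to the file size before calling mmap()):

#include <stdint.h>
#include <stdio.h>

/* Illustrative: choose an mmap window by pointer width, mirroring the
 * BITS_PER_LONG check in the patch. */
static uint64_t pick_mmap_window(void)
{
	if (sizeof(long) * 8 == 64)
		return UINT64_MAX;		/* map the file in one go */
	return 32ULL * 1024 * 1024;		/* 32MB slices on 32-bit */
}

int main(void)
{
	printf("window: %llu bytes\n", (unsigned long long)pick_mmap_window());
	return 0;
}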
@@ -262,7 +272,7 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 	if (handler->exit == NULL)
 		handler->exit = process_event_stub;
 	if (handler->lost == NULL)
-		handler->lost = process_event_stub;
+		handler->lost = event__process_lost;
 	if (handler->read == NULL)
 		handler->read = process_event_stub;
 	if (handler->throttle == NULL)
@@ -386,33 +396,51 @@ static event__swap_op event__swap_ops[] = {
 
 struct sample_queue {
 	u64			timestamp;
-	struct sample_event	*event;
+	event_t			*event;
 	struct list_head	list;
 };
 
+static void perf_session_free_sample_buffers(struct perf_session *session)
+{
+	struct ordered_samples *os = &session->ordered_samples;
+
+	while (!list_empty(&os->to_free)) {
+		struct sample_queue *sq;
+
+		sq = list_entry(os->to_free.next, struct sample_queue, list);
+		list_del(&sq->list);
+		free(sq);
+	}
+}
+
 static void flush_sample_queue(struct perf_session *s,
 			       struct perf_event_ops *ops)
 {
-	struct list_head *head = &s->ordered_samples.samples_head;
-	u64 limit = s->ordered_samples.next_flush;
+	struct ordered_samples *os = &s->ordered_samples;
+	struct list_head *head = &os->samples;
 	struct sample_queue *tmp, *iter;
+	u64 limit = os->next_flush;
+	u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
 
 	if (!ops->ordered_samples || !limit)
 		return;
 
 	list_for_each_entry_safe(iter, tmp, head, list) {
 		if (iter->timestamp > limit)
-			return;
-
-		if (iter == s->ordered_samples.last_inserted)
-			s->ordered_samples.last_inserted = NULL;
+			break;
 
-		ops->sample((event_t *)iter->event, s);
+		ops->sample(iter->event, s);
 
-		s->ordered_samples.last_flush = iter->timestamp;
+		os->last_flush = iter->timestamp;
 		list_del(&iter->list);
-		free(iter->event);
-		free(iter);
+		list_add(&iter->list, &os->sample_cache);
+	}
+
+	if (list_empty(head)) {
+		os->last_sample = NULL;
+	} else if (last_ts <= limit) {
+		os->last_sample =
+			list_entry(head->prev, struct sample_queue, list);
 	}
 }
 
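Two things change in flush_sample_queue() above: flushed nodes are recycled onto sample_cache instead of being freed one by one, and last_sample is repositioned so the insertion hint stays valid after a flush. The recycling pattern in isolation (node names and the singly-linked cache are illustrative, not the perf structures):

#include <assert.h>
#include <stdlib.h>

/* Illustrative free-list recycling: retired queue nodes go to a cache and
 * are reused by the next enqueue instead of being free()d and re-malloc()ed. */
struct node {
	struct node *next;
	unsigned long long timestamp;
};

static struct node *cache;		/* retired nodes, ready for reuse */

static void retire(struct node *n)	/* flush path */
{
	n->next = cache;
	cache = n;
}

static struct node *acquire(void)	/* queue path */
{
	struct node *n = cache;

	if (n) {
		cache = n->next;
		return n;
	}
	return malloc(sizeof(*n));	/* cache empty: allocate */
}

int main(void)
{
	struct node *a = acquire();

	retire(a);
	assert(acquire() == a);		/* the retired node is reused */
	return 0;
}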
@@ -465,104 +493,87 @@ static int process_finished_round(event_t *event __used,
 	return 0;
 }
 
-static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
-{
-	struct sample_queue *iter;
-
-	list_for_each_entry_reverse(iter, head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, head);
-}
-
-static void __queue_sample_before(struct sample_queue *new,
-				  struct sample_queue *iter,
-				  struct list_head *head)
-{
-	list_for_each_entry_continue_reverse(iter, head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, head);
-}
-
-static void __queue_sample_after(struct sample_queue *new,
-				 struct sample_queue *iter,
-				 struct list_head *head)
-{
-	list_for_each_entry_continue(iter, head, list) {
-		if (iter->timestamp > new->timestamp) {
-			list_add_tail(&new->list, &iter->list);
-			return;
-		}
-	}
-	list_add_tail(&new->list, head);
-}
-
 /* The queue is ordered by time */
 static void __queue_sample_event(struct sample_queue *new,
 				 struct perf_session *s)
 {
-	struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
-	struct list_head *head = &s->ordered_samples.samples_head;
+	struct ordered_samples *os = &s->ordered_samples;
+	struct sample_queue *sample = os->last_sample;
+	u64 timestamp = new->timestamp;
+	struct list_head *p;
 
+	os->last_sample = new;
 
-	if (!last_inserted) {
-		__queue_sample_end(new, head);
+	if (!sample) {
+		list_add(&new->list, &os->samples);
+		os->max_timestamp = timestamp;
 		return;
 	}
 
 	/*
-	 * Most of the time the current event has a timestamp
-	 * very close to the last event inserted, unless we just switched
-	 * to another event buffer. Having a sorting based on a list and
-	 * on the last inserted event that is close to the current one is
-	 * probably more efficient than an rbtree based sorting.
+	 * last_sample might point to some random place in the list as it's
+	 * the last queued event. We expect that the new event is close to
+	 * this.
 	 */
-	if (last_inserted->timestamp >= new->timestamp)
-		__queue_sample_before(new, last_inserted, head);
-	else
-		__queue_sample_after(new, last_inserted, head);
+	if (sample->timestamp <= timestamp) {
+		while (sample->timestamp <= timestamp) {
+			p = sample->list.next;
+			if (p == &os->samples) {
+				list_add_tail(&new->list, &os->samples);
+				os->max_timestamp = timestamp;
+				return;
+			}
+			sample = list_entry(p, struct sample_queue, list);
+		}
+		list_add_tail(&new->list, &sample->list);
+	} else {
+		while (sample->timestamp > timestamp) {
+			p = sample->list.prev;
+			if (p == &os->samples) {
+				list_add(&new->list, &os->samples);
+				return;
+			}
+			sample = list_entry(p, struct sample_queue, list);
+		}
+		list_add(&new->list, &sample->list);
+	}
 }
 
+#define MAX_SAMPLE_BUFFER	(64 * 1024 / sizeof(struct sample_queue))
+
 static int queue_sample_event(event_t *event, struct sample_data *data,
 			      struct perf_session *s)
 {
+	struct ordered_samples *os = &s->ordered_samples;
+	struct list_head *sc = &os->sample_cache;
 	u64 timestamp = data->time;
 	struct sample_queue *new;
 
-
 	if (timestamp < s->ordered_samples.last_flush) {
 		printf("Warning: Timestamp below last timeslice flush\n");
 		return -EINVAL;
 	}
 
-	new = malloc(sizeof(*new));
-	if (!new)
-		return -ENOMEM;
-
-	new->timestamp = timestamp;
-
-	new->event = malloc(event->header.size);
-	if (!new->event) {
-		free(new);
-		return -ENOMEM;
+	if (!list_empty(sc)) {
+		new = list_entry(sc->next, struct sample_queue, list);
+		list_del(&new->list);
+	} else if (os->sample_buffer) {
+		new = os->sample_buffer + os->sample_buffer_idx;
+		if (++os->sample_buffer_idx == MAX_SAMPLE_BUFFER)
+			os->sample_buffer = NULL;
+	} else {
+		os->sample_buffer = malloc(MAX_SAMPLE_BUFFER * sizeof(*new));
+		if (!os->sample_buffer)
+			return -ENOMEM;
+		list_add(&os->sample_buffer->list, &os->to_free);
+		os->sample_buffer_idx = 2;
+		new = os->sample_buffer + 1;
 	}
 
-	memcpy(new->event, event, event->header.size);
+	new->timestamp = timestamp;
+	new->event = event;
 
 	__queue_sample_event(new, s);
-	s->ordered_samples.last_inserted = new;
-
-	if (new->timestamp > s->ordered_samples.max_timestamp)
-		s->ordered_samples.max_timestamp = new->timestamp;
 
 	return 0;
 }
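The rewritten __queue_sample_event() above replaces three helper functions with a single bidirectional walk from last_sample: consecutive events usually carry nearby timestamps, so starting the search at the previously queued node makes insertion close to O(1) in the common case. A self-contained sketch of the same hint-based insert on a plain doubly-linked list with a sentinel head (all names here are illustrative, not the perf ones):

#include <assert.h>

struct node {
	struct node *prev, *next;
	unsigned long long ts;
};

static void link_after(struct node *pos, struct node *new)
{
	new->prev = pos;
	new->next = pos->next;
	pos->next->prev = new;
	pos->next = new;
}

/* head is the sentinel; hint is the last node inserted (may be NULL). */
static void insert_sorted(struct node *head, struct node *hint, struct node *new)
{
	struct node *pos = hint ? hint : head->next;

	if (pos == head) {			/* empty list */
		link_after(head, new);
		return;
	}
	if (pos->ts <= new->ts) {		/* walk forward from the hint */
		while (pos != head && pos->ts <= new->ts)
			pos = pos->next;
		link_after(pos->prev, new);
	} else {				/* walk backward from the hint */
		while (pos != head && pos->ts > new->ts)
			pos = pos->prev;
		link_after(pos, new);
	}
}

int main(void)
{
	struct node head = { &head, &head, 0 };
	struct node a = { 0 }, b = { 0 }, c = { 0 };

	a.ts = 10; b.ts = 30; c.ts = 20;
	insert_sorted(&head, NULL, &a);
	insert_sorted(&head, &a, &b);
	insert_sorted(&head, &b, &c);	/* out-of-order: walks backward */

	assert(head.next == &a && a.next == &c && c.next == &b);
	return 0;
}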
@@ -586,13 +597,13 @@ static int perf_session__process_sample(event_t *event, struct perf_session *s,
 static int perf_session__process_event(struct perf_session *self,
 					event_t *event,
 					struct perf_event_ops *ops,
-					u64 offset, u64 head)
+					u64 file_offset)
 {
 	trace_event(event);
 
 	if (event->header.type < PERF_RECORD_HEADER_MAX) {
 		dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
-			    offset + head, event->header.size,
+			    file_offset, event->header.size,
 			    event__name[event->header.type]);
 		hists__inc_nr_events(&self->hists, event->header.type);
 	}
@@ -625,7 +636,7 @@ static int perf_session__process_event(struct perf_session *self,
 		return ops->event_type(event, self);
 	case PERF_RECORD_HEADER_TRACING_DATA:
 		/* setup for reading amidst mmap */
-		lseek(self->fd, offset + head, SEEK_SET);
+		lseek(self->fd, file_offset, SEEK_SET);
 		return ops->tracing_data(event, self);
 	case PERF_RECORD_HEADER_BUILD_ID:
 		return ops->build_id(event, self);
@@ -724,8 +735,7 @@ more:
 	}
 
 	if (size == 0 ||
-	    (skip = perf_session__process_event(self, &event, ops,
-						0, head)) < 0) {
+	    (skip = perf_session__process_event(self, &event, ops, head)) < 0) {
 		dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
 			    head, event.header.size, event.header.type);
 		/*
@@ -751,82 +761,93 @@ more:
 done:
 	err = 0;
 out_err:
+	perf_session_free_sample_buffers(self);
 	return err;
 }
 
-int __perf_session__process_events(struct perf_session *self,
+int __perf_session__process_events(struct perf_session *session,
 				   u64 data_offset, u64 data_size,
 				   u64 file_size, struct perf_event_ops *ops)
 {
-	int err, mmap_prot, mmap_flags;
-	u64 head, shift;
-	u64 offset = 0;
-	size_t page_size;
+	u64 head, page_offset, file_offset, file_pos, progress_next;
+	int err, mmap_prot, mmap_flags, map_idx = 0;
+	struct ui_progress *progress;
+	size_t page_size, mmap_size;
+	char *buf, *mmaps[8];
 	event_t *event;
 	uint32_t size;
-	char *buf;
-	struct ui_progress *progress = ui_progress__new("Processing events...",
-							self->size);
-	if (progress == NULL)
-		return -1;
 
 	perf_event_ops__fill_defaults(ops);
 
 	page_size = sysconf(_SC_PAGESIZE);
 
-	head = data_offset;
-	shift = page_size * (head / page_size);
-	offset += shift;
-	head -= shift;
+	page_offset = page_size * (data_offset / page_size);
+	file_offset = page_offset;
+	head = data_offset - page_offset;
+
+	if (data_offset + data_size < file_size)
+		file_size = data_offset + data_size;
+
+	progress_next = file_size / 16;
+	progress = ui_progress__new("Processing events...", file_size);
+	if (progress == NULL)
+		return -1;
+
+	mmap_size = session->mmap_window;
+	if (mmap_size > file_size)
+		mmap_size = file_size;
+
+	memset(mmaps, 0, sizeof(mmaps));
 
 	mmap_prot  = PROT_READ;
 	mmap_flags = MAP_SHARED;
 
-	if (self->header.needs_swap) {
+	if (session->header.needs_swap) {
 		mmap_prot |= PROT_WRITE;
 		mmap_flags = MAP_PRIVATE;
 	}
 remap:
-	buf = mmap(NULL, page_size * self->mmap_window, mmap_prot,
-		   mmap_flags, self->fd, offset);
+	buf = mmap(NULL, mmap_size, mmap_prot, mmap_flags, session->fd,
+		   file_offset);
 	if (buf == MAP_FAILED) {
 		pr_err("failed to mmap file\n");
 		err = -errno;
 		goto out_err;
 	}
+	mmaps[map_idx] = buf;
+	map_idx = (map_idx + 1) & (ARRAY_SIZE(mmaps) - 1);
+	file_pos = file_offset + head;
 
 more:
 	event = (event_t *)(buf + head);
-	ui_progress__update(progress, offset);
 
-	if (self->header.needs_swap)
+	if (session->header.needs_swap)
 		perf_event_header__bswap(&event->header);
 	size = event->header.size;
 	if (size == 0)
 		size = 8;
 
-	if (head + event->header.size >= page_size * self->mmap_window) {
-		int munmap_ret;
-
-		shift = page_size * (head / page_size);
-
-		munmap_ret = munmap(buf, page_size * self->mmap_window);
-		assert(munmap_ret == 0);
+	if (head + event->header.size >= mmap_size) {
+		if (mmaps[map_idx]) {
+			munmap(mmaps[map_idx], mmap_size);
+			mmaps[map_idx] = NULL;
+		}
 
-		offset += shift;
-		head -= shift;
+		page_offset = page_size * (head / page_size);
+		file_offset += page_offset;
+		head -= page_offset;
 		goto remap;
 	}
 
 	size = event->header.size;
 
 	dump_printf("\n%#Lx [%#x]: event: %d\n",
-		    offset + head, event->header.size, event->header.type);
+		    file_pos, event->header.size, event->header.type);
 
 	if (size == 0 ||
-	    perf_session__process_event(self, event, ops, offset, head) < 0) {
+	    perf_session__process_event(session, event, ops, file_pos) < 0) {
 		dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
-			    offset + head, event->header.size,
+			    file_offset + head, event->header.size,
 			    event->header.type);
 		/*
 		 * assume we lost track of the stream, check alignment, and
@@ -839,19 +860,41 @@ more:
 	}
 
 	head += size;
+	file_pos += size;
 
-	if (offset + head >= data_offset + data_size)
-		goto done;
+	if (file_pos >= progress_next) {
+		progress_next += file_size / 16;
+		ui_progress__update(progress, file_pos);
+	}
 
-	if (offset + head < file_size)
+	if (file_pos < file_size)
 		goto more;
-done:
+
 	err = 0;
 	/* do the final flush for ordered samples */
-	self->ordered_samples.next_flush = ULLONG_MAX;
-	flush_sample_queue(self, ops);
+	session->ordered_samples.next_flush = ULLONG_MAX;
+	flush_sample_queue(session, ops);
 out_err:
 	ui_progress__delete(progress);
+
+	if (ops->lost == event__process_lost &&
+	    session->hists.stats.total_lost != 0) {
+		ui__warning("Processed %Lu events and LOST %Lu!\n\n"
+			    "Check IO/CPU overload!\n\n",
+			    session->hists.stats.total_period,
+			    session->hists.stats.total_lost);
+	}
+
+	if (session->hists.stats.nr_unknown_events != 0) {
+		ui__warning("Found %u unknown events!\n\n"
+			    "Is this an older tool processing a perf.data "
+			    "file generated by a more recent tool?\n\n"
+			    "If that is not the case, consider "
+			    "reporting to linux-kernel@vger.kernel.org.\n\n",
+			    session->hists.stats.nr_unknown_events);
+	}
+
+	perf_session_free_sample_buffers(session);
 	return err;
 }
 
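The reworked reader above keeps up to eight mappings alive (mmaps[8], indexed round-robin with a power-of-two mask), so an event that was queued for ordered processing can still be referenced after the window slides. The slide itself must stay page aligned because mmap() offsets must be page aligned. A reduced sketch of just that arithmetic (standalone, illustrative values):

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: slide an mmap window to the page below the current
 * read position, keeping the remainder in 'head', exactly like the remap
 * path in the patch. The absolute file position is unchanged. */
int main(void)
{
	uint64_t page_size = 4096;	/* assume 4K pages */
	uint64_t file_offset = 0;	/* where the current map starts */
	uint64_t head = 9000;		/* read position inside the map */

	uint64_t page_offset = page_size * (head / page_size);
	file_offset += page_offset;
	head -= page_offset;

	printf("file_offset=%llu head=%llu pos=%llu\n",
	       (unsigned long long)file_offset,
	       (unsigned long long)head,
	       (unsigned long long)(file_offset + head));	/* still 9000 */
	return 0;
}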
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 9fa0fc2a863f..5bf6efa3788a 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -17,8 +17,12 @@ struct ordered_samples {
 	u64			last_flush;
 	u64			next_flush;
 	u64			max_timestamp;
-	struct list_head	samples_head;
-	struct sample_queue	*last_inserted;
+	struct list_head	samples;
+	struct list_head	sample_cache;
+	struct list_head	to_free;
+	struct sample_queue	*sample_buffer;
+	struct sample_queue	*last_sample;
+	int			sample_buffer_idx;
 };
 
 struct perf_session {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 0500895a45af..a348906b587d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -121,7 +121,7 @@ static void __map_groups__fixup_end(struct map_groups *self, enum map_type type)
 	 * We still haven't the actual symbols, so guess the
 	 * last map final address.
 	 */
-	curr->end = ~0UL;
+	curr->end = ~0ULL;
 }
 
 static void map_groups__fixup_end(struct map_groups *self)
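The one-character change above matters on 32-bit builds: there ~0UL is 0xffffffff, and assigning it to a 64-bit map end yields a bogus 4GB limit instead of "no limit". A minimal standalone demonstration (not perf code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t end_ul  = ~0UL;	/* 0xffffffff on ILP32, all ones on LP64 */
	uint64_t end_ull = ~0ULL;	/* 0xffffffffffffffff everywhere */

	/* On a 32-bit build the first line prints ffffffff, not the
	 * intended all-ones 64-bit value; the second is always correct. */
	printf("~0UL  -> %llx\n", (unsigned long long)end_ul);
	printf("~0ULL -> %llx\n", (unsigned long long)end_ull);
	return 0;
}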
diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c
index 056c69521a38..7b5a8926624e 100644
--- a/tools/perf/util/ui/util.c
+++ b/tools/perf/util/ui/util.c
@@ -104,10 +104,24 @@ out_destroy_form:
 	return rc;
 }
 
-static const char yes[] = "Yes", no[] = "No";
+static const char yes[] = "Yes", no[] = "No",
+		  warning_str[] = "Warning!", ok[] = "Ok";
 
 bool ui__dialog_yesno(const char *msg)
 {
 	/* newtWinChoice should really be accepting const char pointers... */
 	return newtWinChoice(NULL, (char *)yes, (char *)no, (char *)msg) == 1;
 }
+
+void ui__warning(const char *format, ...)
+{
+	va_list args;
+
+	va_start(args, format);
+	if (use_browser > 0)
+		newtWinMessagev((char *)warning_str, (char *)ok,
+				(char *)format, args);
+	else
+		vfprintf(stderr, format, args);
+	va_end(args);
+}
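ui__warning() gives the session code a single call that routes to a newt dialog when the TUI browser is active and to stderr otherwise, which is why the lost-event and unknown-event warnings at the end of __perf_session__process_events() can be emitted unconditionally. The varargs-forwarding pattern in isolation (emit_warning, use_gui, and the stdout path are illustrative stand-ins, not perf's API):

#include <stdarg.h>
#include <stdio.h>

static int use_gui;	/* stand-in for perf's use_browser flag */

/* Illustrative varargs forwarding: accept printf-style arguments once and
 * hand the va_list to whichever frontend is active. */
static void emit_warning(const char *format, ...)
{
	va_list args;

	va_start(args, format);
	if (use_gui)
		vfprintf(stdout, format, args);	/* a real TUI would pop a dialog */
	else
		vfprintf(stderr, format, args);
	va_end(args);
}

int main(void)
{
	emit_warning("Processed %llu events and LOST %llu!\n", 1000ULL, 3ULL);
	return 0;
}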