aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt4
-rw-r--r--Documentation/sysctl/kernel.txt14
-rw-r--r--arch/x86/Kconfig.debug19
-rw-r--r--arch/x86/include/asm/smp.h6
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/irq_32.c5
-rw-r--r--arch/x86/kernel/irq_64.c35
-rw-r--r--arch/x86/kernel/nmi_selftest.c180
-rw-r--r--arch/x86/kernel/smp.c72
-rw-r--r--arch/x86/kernel/smpboot.c1
-rw-r--r--include/linux/kernel.h1
-rw-r--r--kernel/sysctl.c9
12 files changed, 337 insertions, 10 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a8d389d72405..eb93fd0ec734 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1824,6 +1824,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1824 nomfgpt [X86-32] Disable Multi-Function General Purpose 1824 nomfgpt [X86-32] Disable Multi-Function General Purpose
1825 Timer usage (for AMD Geode machines). 1825 Timer usage (for AMD Geode machines).
1826 1826
1827 nonmi_ipi [X86] Disable using NMI IPIs during panic/reboot to
1828 shutdown the other cpus. Instead use the REBOOT_VECTOR
1829 irq.
1830
1827 nopat [X86] Disable PAT (page attribute table extension of 1831 nopat [X86] Disable PAT (page attribute table extension of
1828 pagetables) support. 1832 pagetables) support.
1829 1833
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1f2463671a1a..6d8cd8b2c30d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -49,6 +49,7 @@ show up in /proc/sys/kernel:
49- panic 49- panic
50- panic_on_oops 50- panic_on_oops
51- panic_on_unrecovered_nmi 51- panic_on_unrecovered_nmi
52- panic_on_stackoverflow
52- pid_max 53- pid_max
53- powersave-nap [ PPC only ] 54- powersave-nap [ PPC only ]
54- printk 55- printk
@@ -393,6 +394,19 @@ Controls the kernel's behaviour when an oops or BUG is encountered.
393 394
394============================================================== 395==============================================================
395 396
397panic_on_stackoverflow:
398
399Controls the kernel's behavior when detecting the overflows of
400kernel, IRQ and exception stacks except a user stack.
401This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
402
4030: try to continue operation.
404
4051: panic immediately.
406
407==============================================================
408
409
396pid_max: 410pid_max:
397 411
398PID allocation wrap value. When the kernel's next PID value 412PID allocation wrap value. When the kernel's next PID value
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..aa4158f3ce62 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW
63 bool "Check for stack overflows" 63 bool "Check for stack overflows"
64 depends on DEBUG_KERNEL 64 depends on DEBUG_KERNEL
65 ---help--- 65 ---help---
66 This option will cause messages to be printed if free stack space 66 Say Y here if you want to check the overflows of kernel, IRQ
67 drops below a certain limit. 67 and exception stacks. This option will cause messages of the
68 stacks in detail when free stack space drops below a certain
69 limit.
70 If in doubt, say "N".
68 71
69config X86_PTDUMP 72config X86_PTDUMP
70 bool "Export kernel pagetable layout to userspace via debugfs" 73 bool "Export kernel pagetable layout to userspace via debugfs"
@@ -284,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS
284 287
285 If unsure, or if you run an older (pre 4.4) gcc, say N. 288 If unsure, or if you run an older (pre 4.4) gcc, say N.
286 289
290config DEBUG_NMI_SELFTEST
291 bool "NMI Selftest"
292 depends on DEBUG_KERNEL && X86_LOCAL_APIC
293 ---help---
294 Enabling this option turns on a quick NMI selftest to verify
295 that the NMI behaves correctly.
296
297 This might help diagnose strange hangs that rely on NMI to
298 function properly.
299
300 If unsure, say N.
301
287endmenu 302endmenu
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 73b11bc0ae6f..0434c400287c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void);
225 225
226#endif /* CONFIG_X86_LOCAL_APIC */ 226#endif /* CONFIG_X86_LOCAL_APIC */
227 227
228#ifdef CONFIG_DEBUG_NMI_SELFTEST
229extern void nmi_selftest(void);
230#else
231#define nmi_selftest() do { } while (0)
232#endif
233
228#endif /* __ASSEMBLY__ */ 234#endif /* __ASSEMBLY__ */
229#endif /* _ASM_X86_SMP_H */ 235#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..02b2f05b371e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
80obj-$(CONFIG_AMD_NB) += amd_nb.o 80obj-$(CONFIG_AMD_NB) += amd_nb.o
81obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 81obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
82obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 82obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
83obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 84
84obj-$(CONFIG_KVM_GUEST) += kvm.o 85obj-$(CONFIG_KVM_GUEST) += kvm.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 86obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..40fc86161d92 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs); 28EXPORT_PER_CPU_SYMBOL(irq_regs);
29 29
30#ifdef CONFIG_DEBUG_STACKOVERFLOW 30#ifdef CONFIG_DEBUG_STACKOVERFLOW
31
32int sysctl_panic_on_stackoverflow __read_mostly;
33
31/* Debugging check for stack overflow: is there less than 1KB free? */ 34/* Debugging check for stack overflow: is there less than 1KB free? */
32static int check_stack_overflow(void) 35static int check_stack_overflow(void)
33{ 36{
@@ -43,6 +46,8 @@ static void print_stack_overflow(void)
43{ 46{
44 printk(KERN_WARNING "low stack detected by irq handler\n"); 47 printk(KERN_WARNING "low stack detected by irq handler\n");
45 dump_stack(); 48 dump_stack();
49 if (sysctl_panic_on_stackoverflow)
50 panic("low stack detected by irq handler - check messages\n");
46} 51}
47 52
48#else 53#else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 69bca468c47a..d04d3ecded62 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
26DEFINE_PER_CPU(struct pt_regs *, irq_regs); 26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs); 27EXPORT_PER_CPU_SYMBOL(irq_regs);
28 28
29int sysctl_panic_on_stackoverflow;
30
29/* 31/*
30 * Probabilistic stack overflow check: 32 * Probabilistic stack overflow check:
31 * 33 *
@@ -36,18 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
36static inline void stack_overflow_check(struct pt_regs *regs) 38static inline void stack_overflow_check(struct pt_regs *regs)
37{ 39{
38#ifdef CONFIG_DEBUG_STACKOVERFLOW 40#ifdef CONFIG_DEBUG_STACKOVERFLOW
41#define STACK_TOP_MARGIN 128
42 struct orig_ist *oist;
43 u64 irq_stack_top, irq_stack_bottom;
44 u64 estack_top, estack_bottom;
39 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
40 46
41 if (user_mode_vm(regs)) 47 if (user_mode_vm(regs))
42 return; 48 return;
43 49
44 WARN_ONCE(regs->sp >= curbase && 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
45 regs->sp <= curbase + THREAD_SIZE && 51 sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
46 regs->sp < curbase + sizeof(struct thread_info) + 52 regs->sp <= curbase + THREAD_SIZE)
47 sizeof(struct pt_regs) + 128, 53 return;
54
55 irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
56 STACK_TOP_MARGIN;
57 irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
58 if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
59 return;
60
61 oist = &__get_cpu_var(orig_ist);
62 estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
63 estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
64 if (regs->sp >= estack_top && regs->sp <= estack_bottom)
65 return;
66
67 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
68 current->comm, curbase, regs->sp,
69 irq_stack_top, irq_stack_bottom,
70 estack_top, estack_bottom);
48 71
49 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 72 if (sysctl_panic_on_stackoverflow)
50 current->comm, curbase, regs->sp); 73 panic("low stack detected by irq handler - check messages\n");
51#endif 74#endif
52} 75}
53 76
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..0d01a8ea4e11
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,180 @@
1/*
2 * arch/x86/kernel/nmi-selftest.c
3 *
4 * Testsuite for NMI: IPIs
5 *
6 * Started by Don Zickus:
7 * (using lib/locking-selftest.c as a guide)
8 *
9 * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
10 */
11
12#include <linux/smp.h>
13#include <linux/cpumask.h>
14#include <linux/delay.h>
15
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#define SUCCESS 0
20#define FAILURE 1
21#define TIMEOUT 2
22
23static int nmi_fail;
24
25/* check to see if NMI IPIs work on this machine */
26static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
27
28static int testcase_total;
29static int testcase_successes;
30static int expected_testcase_failures;
31static int unexpected_testcase_failures;
32static int unexpected_testcase_unknowns;
33
34static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
35{
36 unexpected_testcase_unknowns++;
37 return NMI_HANDLED;
38}
39
40static void init_nmi_testsuite(void)
41{
42 /* trap all the unknown NMIs we may generate */
43 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
44}
45
46static void cleanup_nmi_testsuite(void)
47{
48 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
49}
50
51static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
52{
53 int cpu = raw_smp_processor_id();
54
55 if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
56 return NMI_HANDLED;
57
58 return NMI_DONE;
59}
60
61static void test_nmi_ipi(struct cpumask *mask)
62{
63 unsigned long timeout;
64
65 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
66 NMI_FLAG_FIRST, "nmi_selftest")) {
67 nmi_fail = FAILURE;
68 return;
69 }
70
71 /* sync above data before sending NMI */
72 wmb();
73
74 apic->send_IPI_mask(mask, NMI_VECTOR);
75
76 /* Don't wait longer than a second */
77 timeout = USEC_PER_SEC;
78 while (!cpumask_empty(mask) && timeout--)
79 udelay(1);
80
81 /* What happens if we timeout, do we still unregister?? */
82 unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
83
84 if (!timeout)
85 nmi_fail = TIMEOUT;
86 return;
87}
88
89static void remote_ipi(void)
90{
91 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
92 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
93 if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
94 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
95}
96
97static void local_ipi(void)
98{
99 cpumask_clear(to_cpumask(nmi_ipi_mask));
100 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
101 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
102}
103
104static void reset_nmi(void)
105{
106 nmi_fail = 0;
107}
108
109static void dotest(void (*testcase_fn)(void), int expected)
110{
111 testcase_fn();
112 /*
113 * Filter out expected failures:
114 */
115 if (nmi_fail != expected) {
116 unexpected_testcase_failures++;
117
118 if (nmi_fail == FAILURE)
119 printk("FAILED |");
120 else if (nmi_fail == TIMEOUT)
121 printk("TIMEOUT|");
122 else
123 printk("ERROR |");
124 dump_stack();
125 } else {
126 testcase_successes++;
127 printk(" ok |");
128 }
129 testcase_total++;
130
131 reset_nmi();
132}
133
134static inline void print_testname(const char *testname)
135{
136 printk("%12s:", testname);
137}
138
139void nmi_selftest(void)
140{
141 init_nmi_testsuite();
142
143 /*
144 * Run the testsuite:
145 */
146 printk("----------------\n");
147 printk("| NMI testsuite:\n");
148 printk("--------------------\n");
149
150 print_testname("remote IPI");
151 dotest(remote_ipi, SUCCESS);
152 printk("\n");
153 print_testname("local IPI");
154 dotest(local_ipi, SUCCESS);
155 printk("\n");
156
157 cleanup_nmi_testsuite();
158
159 if (unexpected_testcase_failures) {
160 printk("--------------------\n");
161 printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
162 unexpected_testcase_failures, testcase_total);
163 printk("-----------------------------------------------------------------\n");
164 } else if (expected_testcase_failures && testcase_successes) {
165 printk("--------------------\n");
166 printk("%3d out of %3d testcases failed, as expected. |\n",
167 expected_testcase_failures, testcase_total);
168 printk("----------------------------------------------------\n");
169 } else if (expected_testcase_failures && !testcase_successes) {
170 printk("--------------------\n");
171 printk("All %3d testcases failed, as expected. |\n",
172 expected_testcase_failures);
173 printk("----------------------------------------\n");
174 } else {
175 printk("--------------------\n");
176 printk("Good, all %3d testcases passed! |\n",
177 testcase_successes);
178 printk("---------------------------------\n");
179 }
180}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc15484..66c74f481cab 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
29#include <asm/mmu_context.h> 29#include <asm/mmu_context.h>
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/nmi.h>
32/* 33/*
33 * Some notes on x86 processor bugs affecting SMP operation: 34 * Some notes on x86 processor bugs affecting SMP operation:
34 * 35 *
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
148 free_cpumask_var(allbutself); 149 free_cpumask_var(allbutself);
149} 150}
150 151
152static atomic_t stopping_cpu = ATOMIC_INIT(-1);
153
154static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
155{
156 /* We are registered on stopping cpu too, avoid spurious NMI */
157 if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
158 return NMI_HANDLED;
159
160 stop_this_cpu(NULL);
161
162 return NMI_HANDLED;
163}
164
165static void native_nmi_stop_other_cpus(int wait)
166{
167 unsigned long flags;
168 unsigned long timeout;
169
170 if (reboot_force)
171 return;
172
173 /*
174 * Use an own vector here because smp_call_function
175 * does lots of things not suitable in a panic situation.
176 */
177 if (num_online_cpus() > 1) {
178 /* did someone beat us here? */
179 if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
180 return;
181
182 if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
183 NMI_FLAG_FIRST, "smp_stop"))
184 /* Note: we ignore failures here */
185 return;
186
187 /* sync above data before sending NMI */
188 wmb();
189
190 apic->send_IPI_allbutself(NMI_VECTOR);
191
192 /*
193 * Don't wait longer than a second if the caller
194 * didn't ask us to wait.
195 */
196 timeout = USEC_PER_SEC;
197 while (num_online_cpus() > 1 && (wait || timeout--))
198 udelay(1);
199 }
200
201 local_irq_save(flags);
202 disable_local_APIC();
203 local_irq_restore(flags);
204}
205
151/* 206/*
152 * this function calls the 'stop' function on all other CPUs in the system. 207 * this function calls the 'stop' function on all other CPUs in the system.
153 */ 208 */
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
160 irq_exit(); 215 irq_exit();
161} 216}
162 217
163static void native_stop_other_cpus(int wait) 218static void native_irq_stop_other_cpus(int wait)
164{ 219{
165 unsigned long flags; 220 unsigned long flags;
166 unsigned long timeout; 221 unsigned long timeout;
@@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait)
194 local_irq_restore(flags); 249 local_irq_restore(flags);
195} 250}
196 251
252static void native_smp_disable_nmi_ipi(void)
253{
254 smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
255}
256
197/* 257/*
198 * Reschedule call back. 258 * Reschedule call back.
199 */ 259 */
@@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
225 irq_exit(); 285 irq_exit();
226} 286}
227 287
288static int __init nonmi_ipi_setup(char *str)
289{
290 native_smp_disable_nmi_ipi();
291 return 1;
292}
293
294__setup("nonmi_ipi", nonmi_ipi_setup);
295
228struct smp_ops smp_ops = { 296struct smp_ops smp_ops = {
229 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 297 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
230 .smp_prepare_cpus = native_smp_prepare_cpus, 298 .smp_prepare_cpus = native_smp_prepare_cpus,
231 .smp_cpus_done = native_smp_cpus_done, 299 .smp_cpus_done = native_smp_cpus_done,
232 300
233 .stop_other_cpus = native_stop_other_cpus, 301 .stop_other_cpus = native_nmi_stop_other_cpus,
234 .smp_send_reschedule = native_smp_send_reschedule, 302 .smp_send_reschedule = native_smp_send_reschedule,
235 303
236 .cpu_up = native_cpu_up, 304 .cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e38e21754eea..79f636bc44c6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1143,6 +1143,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1143{ 1143{
1144 pr_debug("Boot done.\n"); 1144 pr_debug("Boot done.\n");
1145 1145
1146 nmi_selftest();
1146 impress_friends(); 1147 impress_friends();
1147#ifdef CONFIG_X86_IO_APIC 1148#ifdef CONFIG_X86_IO_APIC
1148 setup_ioapic_dest(); 1149 setup_ioapic_dest();
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f48e8a528544..d0a7a0c71661 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -341,6 +341,7 @@ extern int panic_timeout;
341extern int panic_on_oops; 341extern int panic_on_oops;
342extern int panic_on_unrecovered_nmi; 342extern int panic_on_unrecovered_nmi;
343extern int panic_on_io_nmi; 343extern int panic_on_io_nmi;
344extern int sysctl_panic_on_stackoverflow;
344extern const char *print_tainted(void); 345extern const char *print_tainted(void);
345extern void add_taint(unsigned flag); 346extern void add_taint(unsigned flag);
346extern int test_taint(unsigned flag); 347extern int test_taint(unsigned flag);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
803 .mode = 0644, 803 .mode = 0644,
804 .proc_handler = proc_dointvec, 804 .proc_handler = proc_dointvec,
805 }, 805 },
806#ifdef CONFIG_DEBUG_STACKOVERFLOW
807 {
808 .procname = "panic_on_stackoverflow",
809 .data = &sysctl_panic_on_stackoverflow,
810 .maxlen = sizeof(int),
811 .mode = 0644,
812 .proc_handler = proc_dointvec,
813 },
814#endif
806 { 815 {
807 .procname = "bootloader_type", 816 .procname = "bootloader_type",
808 .data = &bootloader_type, 817 .data = &bootloader_type,