aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Don Zickus <dzickus@redhat.com> 2006-09-26 04:52:27 -0400
committer Andi Kleen <andi@basil.nowhere.org> 2006-09-26 04:52:27 -0400
commit 8da5adda91df3d2fcc5300e68da491694c9af019 (patch)
tree bae152dabd728ba2f7fead421276e3cc9a779141
parent e33e89ab1a8d295de0500b697f4f31c3ceee9aa2 (diff)
[PATCH] x86: Allow users to force a panic on NMI
To quote Alan Cox: The default Linux behaviour on an NMI of either memory or unknown is to continue operation. For many environments such as scientific computing it is preferable that the box is taken out and the error dealt with than an uncorrected parity/ECC error gets propagated. A small number of systems do generate NMIs for bizarre random reasons such as power management so the default is unchanged. In other respects the new proc/sys entry works like the existing panic controls already in that directory. This is separate to the EDAC support - EDAC allows supported chipsets to handle ECC errors well, this change allows unsupported cases to at least panic rather than cause problems further down the line. Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de>
-rw-r--r-- arch/i386/kernel/traps.c 6
-rw-r--r-- arch/x86_64/kernel/traps.c 6
-rw-r--r-- include/linux/kernel.h 1
-rw-r--r-- include/linux/sysctl.h 1
-rw-r--r-- kernel/panic.c 1
-rw-r--r-- kernel/sysctl.c 8
6 files changed, 23 insertions, 0 deletions
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7db664d0b25c..2f6cb8276480 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -635,6 +635,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
635 "to continue\n"); 635 "to continue\n");
636 printk(KERN_EMERG "You probably have a hardware problem with your RAM " 636 printk(KERN_EMERG "You probably have a hardware problem with your RAM "
637 "chips\n"); 637 "chips\n");
638 if (panic_on_unrecovered_nmi)
639 panic("NMI: Not continuing");
638 640
639 /* Clear and disable the memory parity error line. */ 641 /* Clear and disable the memory parity error line. */
640 clear_mem_error(reason); 642 clear_mem_error(reason);
@@ -670,6 +672,10 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
670 reason, smp_processor_id()); 672 reason, smp_processor_id());
671 printk("Dazed and confused, but trying to continue\n"); 673 printk("Dazed and confused, but trying to continue\n");
672 printk("Do you have a strange power saving mode enabled?\n"); 674 printk("Do you have a strange power saving mode enabled?\n");
675
676 if (panic_on_unrecovered_nmi)
677 panic("NMI: Not continuing");
678
673} 679}
674 680
675static DEFINE_SPINLOCK(nmi_print_lock); 681static DEFINE_SPINLOCK(nmi_print_lock);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 42bc070fdf11..b18829db2a6a 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -732,6 +732,8 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
732{ 732{
733 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); 733 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
734 printk("You probably have a hardware problem with your RAM chips\n"); 734 printk("You probably have a hardware problem with your RAM chips\n");
735 if (panic_on_unrecovered_nmi)
736 panic("NMI: Not continuing");
735 737
736 /* Clear and disable the memory parity error line. */ 738 /* Clear and disable the memory parity error line. */
737 reason = (reason & 0xf) | 4; 739 reason = (reason & 0xf) | 4;
@@ -757,6 +759,10 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
757{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); 759{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
758 printk("Dazed and confused, but trying to continue\n"); 760 printk("Dazed and confused, but trying to continue\n");
759 printk("Do you have a strange power saving mode enabled?\n"); 761 printk("Do you have a strange power saving mode enabled?\n");
762
763 if (panic_on_unrecovered_nmi)
764 panic("NMI: Not continuing");
765
760} 766}
761 767
762/* Runs on IST stack. This code must keep interrupts off all the time. 768/* Runs on IST stack. This code must keep interrupts off all the time.
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b2ae4fdce8b..1ff9609300b4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -186,6 +186,7 @@ extern void bust_spinlocks(int yes);
186extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ 186extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
187extern int panic_timeout; 187extern int panic_timeout;
188extern int panic_on_oops; 188extern int panic_on_oops;
189extern int panic_on_unrecovered_nmi;
189extern int tainted; 190extern int tainted;
190extern const char *print_tainted(void); 191extern const char *print_tainted(void);
191extern void add_taint(unsigned); 192extern void add_taint(unsigned);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index ecb79ba52ae1..432778446ad2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -151,6 +151,7 @@ enum
151 KERN_COMPAT_LOG=73, /* int: print compat layer messages */ 151 KERN_COMPAT_LOG=73, /* int: print compat layer messages */
152 KERN_MAX_LOCK_DEPTH=74, 152 KERN_MAX_LOCK_DEPTH=74,
153 KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ 153 KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
154 KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
154}; 155};
155 156
156 157
diff --git a/kernel/panic.c b/kernel/panic.c
index 8010b9b17aca..d2db3e2209e0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22 22
23int panic_on_oops; 23int panic_on_oops;
24int panic_on_unrecovered_nmi;
24int tainted; 25int tainted;
25static int pause_on_oops; 26static int pause_on_oops;
26static int pause_on_oops_flag; 27static int pause_on_oops_flag;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 040de6bd74dd..220e20564124 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -642,6 +642,14 @@ static ctl_table kern_table[] = {
642#endif 642#endif
643#if defined(CONFIG_X86) 643#if defined(CONFIG_X86)
644 { 644 {
645 .ctl_name = KERN_PANIC_ON_NMI,
646 .procname = "panic_on_unrecovered_nmi",
647 .data = &panic_on_unrecovered_nmi,
648 .maxlen = sizeof(int),
649 .mode = 0644,
650 .proc_handler = &proc_dointvec,
651 },
652 {
645 .ctl_name = KERN_BOOTLOADER_TYPE, 653 .ctl_name = KERN_BOOTLOADER_TYPE,
646 .procname = "bootloader_type", 654 .procname = "bootloader_type",
647 .data = &bootloader_type, 655 .data = &bootloader_type,