aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Corey Minyard <minyard@acm.org> 2007-05-08 03:23:58 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-05-08 14:14:58 -0400
commit: f64da958dfc83335de1d2bef9d3868f30feb4e53 (patch)
tree: ebf2ca43cf50ea05742b19806ca72c5027c0911a
parent: ee6cd5f8f573ad11f270a07fb201822c2862474d (diff)
ipmi: add new IPMI nmi watchdog handling
Convert over to the new NMI handling for getting IPMI watchdog timeouts via an NMI. This adds config options to know if there is the ability to receive NMIs and if it has an NMI post processing call. Then it modifies the IPMI watchdog to take advantage of this so that it can know if an NMI comes in. It also adds testing that the IPMI NMI watchdog works. Signed-off-by: Corey Minyard <minyard@acm.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- arch/i386/kernel/traps.c 5
-rw-r--r-- arch/x86_64/kernel/traps.c 2
-rw-r--r-- drivers/char/ipmi/ipmi_watchdog.c 136
-rw-r--r-- include/asm-i386/kdebug.h 1
-rw-r--r-- include/asm-x86_64/kdebug.h 1
5 files changed, 103 insertions, 42 deletions
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f21b41e7770c..58c8e015e77e 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -747,6 +747,11 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
747 */ 747 */
748 if (nmi_watchdog_tick(regs, reason)) 748 if (nmi_watchdog_tick(regs, reason))
749 return; 749 return;
750#endif
751 if (notify_die(DIE_NMI_POST, "nmi_post", regs, reason, 2, 0)
752 == NOTIFY_STOP)
753 return;
754#ifdef CONFIG_X86_LOCAL_APIC
750 if (!do_nmi_callback(regs, smp_processor_id())) 755 if (!do_nmi_callback(regs, smp_processor_id()))
751#endif 756#endif
752 unknown_nmi_error(reason, regs); 757 unknown_nmi_error(reason, regs);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index d76fc32d4599..0484a2ceac87 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -792,6 +792,8 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
792 */ 792 */
793 if (nmi_watchdog_tick(regs,reason)) 793 if (nmi_watchdog_tick(regs,reason))
794 return; 794 return;
795 if (notify_die(DIE_NMI_POST, "nmi_post", regs, reason, 2, 0)
796 == NOTIFY_STOP)
795 if (!do_nmi_callback(regs,cpu)) 797 if (!do_nmi_callback(regs,cpu))
796 unknown_nmi_error(reason, regs); 798 unknown_nmi_error(reason, regs);
797 799
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 6b634e8d9519..9e9c5de2e549 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -49,9 +49,19 @@
49#include <linux/poll.h> 49#include <linux/poll.h>
50#include <linux/string.h> 50#include <linux/string.h>
51#include <linux/ctype.h> 51#include <linux/ctype.h>
52#include <linux/delay.h>
52#include <asm/atomic.h> 53#include <asm/atomic.h>
53#ifdef CONFIG_X86_LOCAL_APIC 54
54#include <asm/apic.h> 55#ifdef CONFIG_X86
56/* This is ugly, but I've determined that x86 is the only architecture
57 that can reasonably support the IPMI NMI watchdog timeout at this
58 time. If another architecture adds this capability somehow, it
59 will have to be a somewhat different mechanism and I have no idea
60 how it will work. So in the unlikely event that another
61 architecture supports this, we can figure out a good generic
62 mechanism for it at that time. */
63#include <asm/kdebug.h>
64#define HAVE_DIE_NMI_POST
55#endif 65#endif
56 66
57#define PFX "IPMI Watchdog: " 67#define PFX "IPMI Watchdog: "
@@ -317,6 +327,11 @@ static unsigned char ipmi_version_minor;
317/* If a pretimeout occurs, this is used to allow only one panic to happen. */ 327/* If a pretimeout occurs, this is used to allow only one panic to happen. */
318static atomic_t preop_panic_excl = ATOMIC_INIT(-1); 328static atomic_t preop_panic_excl = ATOMIC_INIT(-1);
319 329
330#ifdef HAVE_DIE_NMI_POST
331static int testing_nmi;
332static int nmi_handler_registered;
333#endif
334
320static int ipmi_heartbeat(void); 335static int ipmi_heartbeat(void);
321static void panic_halt_ipmi_heartbeat(void); 336static void panic_halt_ipmi_heartbeat(void);
322 337
@@ -358,6 +373,10 @@ static int i_ipmi_set_timeout(struct ipmi_smi_msg *smi_msg,
358 int hbnow = 0; 373 int hbnow = 0;
359 374
360 375
376 /* These can be cleared as we are setting the timeout. */
377 ipmi_start_timer_on_heartbeat = 0;
378 pretimeout_since_last_heartbeat = 0;
379
361 data[0] = 0; 380 data[0] = 0;
362 WDOG_SET_TIMER_USE(data[0], WDOG_TIMER_USE_SMS_OS); 381 WDOG_SET_TIMER_USE(data[0], WDOG_TIMER_USE_SMS_OS);
363 382
@@ -432,13 +451,12 @@ static int ipmi_set_timeout(int do_heartbeat)
432 451
433 wait_for_completion(&set_timeout_wait); 452 wait_for_completion(&set_timeout_wait);
434 453
454 mutex_unlock(&set_timeout_lock);
455
435 if ((do_heartbeat == IPMI_SET_TIMEOUT_FORCE_HB) 456 if ((do_heartbeat == IPMI_SET_TIMEOUT_FORCE_HB)
436 || ((send_heartbeat_now) 457 || ((send_heartbeat_now)
437 && (do_heartbeat == IPMI_SET_TIMEOUT_HB_IF_NECESSARY))) 458 && (do_heartbeat == IPMI_SET_TIMEOUT_HB_IF_NECESSARY)))
438 {
439 rv = ipmi_heartbeat(); 459 rv = ipmi_heartbeat();
440 }
441 mutex_unlock(&set_timeout_lock);
442 460
443out: 461out:
444 return rv; 462 return rv;
@@ -518,12 +536,10 @@ static int ipmi_heartbeat(void)
518 int rv; 536 int rv;
519 struct ipmi_system_interface_addr addr; 537 struct ipmi_system_interface_addr addr;
520 538
521 if (ipmi_ignore_heartbeat) { 539 if (ipmi_ignore_heartbeat)
522 return 0; 540 return 0;
523 }
524 541
525 if (ipmi_start_timer_on_heartbeat) { 542 if (ipmi_start_timer_on_heartbeat) {
526 ipmi_start_timer_on_heartbeat = 0;
527 ipmi_watchdog_state = action_val; 543 ipmi_watchdog_state = action_val;
528 return ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB); 544 return ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
529 } else if (pretimeout_since_last_heartbeat) { 545 } else if (pretimeout_since_last_heartbeat) {
@@ -531,7 +547,6 @@ static int ipmi_heartbeat(void)
531 We don't want to set the action, though, we want to 547 We don't want to set the action, though, we want to
532 leave that alone (thus it can't be combined with the 548 leave that alone (thus it can't be combined with the
533 above operation. */ 549 above operation. */
534 pretimeout_since_last_heartbeat = 0;
535 return ipmi_set_timeout(IPMI_SET_TIMEOUT_HB_IF_NECESSARY); 550 return ipmi_set_timeout(IPMI_SET_TIMEOUT_HB_IF_NECESSARY);
536 } 551 }
537 552
@@ -919,6 +934,45 @@ static void ipmi_register_watchdog(int ipmi_intf)
919 printk(KERN_CRIT PFX "Unable to register misc device\n"); 934 printk(KERN_CRIT PFX "Unable to register misc device\n");
920 } 935 }
921 936
937#ifdef HAVE_DIE_NMI_POST
938 if (nmi_handler_registered) {
939 int old_pretimeout = pretimeout;
940 int old_timeout = timeout;
941 int old_preop_val = preop_val;
942
943 /* Set the pretimeout to go off in a second and give
944 ourselves plenty of time to stop the timer. */
945 ipmi_watchdog_state = WDOG_TIMEOUT_RESET;
946 preop_val = WDOG_PREOP_NONE; /* Make sure nothing happens */
947 pretimeout = 99;
948 timeout = 100;
949
950 testing_nmi = 1;
951
952 rv = ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
953 if (rv) {
954 printk(KERN_WARNING PFX "Error starting timer to"
955 " test NMI: 0x%x. The NMI pretimeout will"
956 " likely not work\n", rv);
957 rv = 0;
958 goto out_restore;
959 }
960
961 msleep(1500);
962
963 if (testing_nmi != 2) {
964 printk(KERN_WARNING PFX "IPMI NMI didn't seem to"
965 " occur. The NMI pretimeout will"
966 " likely not work\n");
967 }
968 out_restore:
969 testing_nmi = 0;
970 preop_val = old_preop_val;
971 pretimeout = old_pretimeout;
972 timeout = old_timeout;
973 }
974#endif
975
922 out: 976 out:
923 up_write(&register_sem); 977 up_write(&register_sem);
924 978
@@ -928,6 +982,10 @@ static void ipmi_register_watchdog(int ipmi_intf)
928 ipmi_watchdog_state = action_val; 982 ipmi_watchdog_state = action_val;
929 ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB); 983 ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
930 printk(KERN_INFO PFX "Starting now!\n"); 984 printk(KERN_INFO PFX "Starting now!\n");
985 } else {
986 /* Stop the timer now. */
987 ipmi_watchdog_state = WDOG_TIMEOUT_NONE;
988 ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB);
931 } 989 }
932} 990}
933 991
@@ -964,17 +1022,28 @@ static void ipmi_unregister_watchdog(int ipmi_intf)
964 up_write(&register_sem); 1022 up_write(&register_sem);
965} 1023}
966 1024
967#ifdef HAVE_NMI_HANDLER 1025#ifdef HAVE_DIE_NMI_POST
968static int 1026static int
969ipmi_nmi(void *dev_id, int cpu, int handled) 1027ipmi_nmi(struct notifier_block *self, unsigned long val, void *data)
970{ 1028{
1029 if (val != DIE_NMI_POST)
1030 return NOTIFY_OK;
1031
1032 if (testing_nmi) {
1033 testing_nmi = 2;
1034 return NOTIFY_STOP;
1035 }
1036
971 /* If we are not expecting a timeout, ignore it. */ 1037 /* If we are not expecting a timeout, ignore it. */
972 if (ipmi_watchdog_state == WDOG_TIMEOUT_NONE) 1038 if (ipmi_watchdog_state == WDOG_TIMEOUT_NONE)
973 return NOTIFY_DONE; 1039 return NOTIFY_OK;
1040
1041 if (preaction_val != WDOG_PRETIMEOUT_NMI)
1042 return NOTIFY_OK;
974 1043
975 /* If no one else handled the NMI, we assume it was the IPMI 1044 /* If no one else handled the NMI, we assume it was the IPMI
976 watchdog. */ 1045 watchdog. */
977 if ((!handled) && (preop_val == WDOG_PREOP_PANIC)) { 1046 if (preop_val == WDOG_PREOP_PANIC) {
978 /* On some machines, the heartbeat will give 1047 /* On some machines, the heartbeat will give
979 an error and not work unless we re-enable 1048 an error and not work unless we re-enable
980 the timer. So do so. */ 1049 the timer. So do so. */
@@ -983,18 +1052,12 @@ ipmi_nmi(void *dev_id, int cpu, int handled)
983 panic(PFX "pre-timeout"); 1052 panic(PFX "pre-timeout");
984 } 1053 }
985 1054
986 return NOTIFY_DONE; 1055 return NOTIFY_STOP;
987} 1056}
988 1057
989static struct nmi_handler ipmi_nmi_handler = 1058static struct notifier_block ipmi_nmi_handler = {
990{ 1059 .notifier_call = ipmi_nmi
991 .link = LIST_HEAD_INIT(ipmi_nmi_handler.link),
992 .dev_name = "ipmi_watchdog",
993 .dev_id = NULL,
994 .handler = ipmi_nmi,
995 .priority = 0, /* Call us last. */
996}; 1060};
997int nmi_handler_registered;
998#endif 1061#endif
999 1062
1000static int wdog_reboot_handler(struct notifier_block *this, 1063static int wdog_reboot_handler(struct notifier_block *this,
@@ -1111,7 +1174,7 @@ static int preaction_op(const char *inval, char *outval)
1111 preaction_val = WDOG_PRETIMEOUT_NONE; 1174 preaction_val = WDOG_PRETIMEOUT_NONE;
1112 else if (strcmp(inval, "pre_smi") == 0) 1175 else if (strcmp(inval, "pre_smi") == 0)
1113 preaction_val = WDOG_PRETIMEOUT_SMI; 1176 preaction_val = WDOG_PRETIMEOUT_SMI;
1114#ifdef HAVE_NMI_HANDLER 1177#ifdef HAVE_DIE_NMI_POST
1115 else if (strcmp(inval, "pre_nmi") == 0) 1178 else if (strcmp(inval, "pre_nmi") == 0)
1116 preaction_val = WDOG_PRETIMEOUT_NMI; 1179 preaction_val = WDOG_PRETIMEOUT_NMI;
1117#endif 1180#endif
@@ -1145,7 +1208,7 @@ static int preop_op(const char *inval, char *outval)
1145 1208
1146static void check_parms(void) 1209static void check_parms(void)
1147{ 1210{
1148#ifdef HAVE_NMI_HANDLER 1211#ifdef HAVE_DIE_NMI_POST
1149 int do_nmi = 0; 1212 int do_nmi = 0;
1150 int rv; 1213 int rv;
1151 1214
@@ -1158,20 +1221,9 @@ static void check_parms(void)
1158 preop_op("preop_none", NULL); 1221 preop_op("preop_none", NULL);
1159 do_nmi = 0; 1222 do_nmi = 0;
1160 } 1223 }
1161#ifdef CONFIG_X86_LOCAL_APIC
1162 if (nmi_watchdog == NMI_IO_APIC) {
1163 printk(KERN_WARNING PFX "nmi_watchdog is set to IO APIC"
1164 " mode (value is %d), that is incompatible"
1165 " with using NMI in the IPMI watchdog."
1166 " Disabling IPMI nmi pretimeout.\n",
1167 nmi_watchdog);
1168 preaction_val = WDOG_PRETIMEOUT_NONE;
1169 do_nmi = 0;
1170 }
1171#endif
1172 } 1224 }
1173 if (do_nmi && !nmi_handler_registered) { 1225 if (do_nmi && !nmi_handler_registered) {
1174 rv = request_nmi(&ipmi_nmi_handler); 1226 rv = register_die_notifier(&ipmi_nmi_handler);
1175 if (rv) { 1227 if (rv) {
1176 printk(KERN_WARNING PFX 1228 printk(KERN_WARNING PFX
1177 "Can't register nmi handler\n"); 1229 "Can't register nmi handler\n");
@@ -1179,7 +1231,7 @@ static void check_parms(void)
1179 } else 1231 } else
1180 nmi_handler_registered = 1; 1232 nmi_handler_registered = 1;
1181 } else if (!do_nmi && nmi_handler_registered) { 1233 } else if (!do_nmi && nmi_handler_registered) {
1182 release_nmi(&ipmi_nmi_handler); 1234 unregister_die_notifier(&ipmi_nmi_handler);
1183 nmi_handler_registered = 0; 1235 nmi_handler_registered = 0;
1184 } 1236 }
1185#endif 1237#endif
@@ -1215,9 +1267,9 @@ static int __init ipmi_wdog_init(void)
1215 1267
1216 rv = ipmi_smi_watcher_register(&smi_watcher); 1268 rv = ipmi_smi_watcher_register(&smi_watcher);
1217 if (rv) { 1269 if (rv) {
1218#ifdef HAVE_NMI_HANDLER 1270#ifdef HAVE_DIE_NMI_POST
1219 if (preaction_val == WDOG_PRETIMEOUT_NMI) 1271 if (nmi_handler_registered)
1220 release_nmi(&ipmi_nmi_handler); 1272 unregister_die_notifier(&ipmi_nmi_handler);
1221#endif 1273#endif
1222 atomic_notifier_chain_unregister(&panic_notifier_list, 1274 atomic_notifier_chain_unregister(&panic_notifier_list,
1223 &wdog_panic_notifier); 1275 &wdog_panic_notifier);
@@ -1236,9 +1288,9 @@ static void __exit ipmi_wdog_exit(void)
1236 ipmi_smi_watcher_unregister(&smi_watcher); 1288 ipmi_smi_watcher_unregister(&smi_watcher);
1237 ipmi_unregister_watchdog(watchdog_ifnum); 1289 ipmi_unregister_watchdog(watchdog_ifnum);
1238 1290
1239#ifdef HAVE_NMI_HANDLER 1291#ifdef HAVE_DIE_NMI_POST
1240 if (nmi_handler_registered) 1292 if (nmi_handler_registered)
1241 release_nmi(&ipmi_nmi_handler); 1293 unregister_die_notifier(&ipmi_nmi_handler);
1242#endif 1294#endif
1243 1295
1244 atomic_notifier_chain_unregister(&panic_notifier_list, 1296 atomic_notifier_chain_unregister(&panic_notifier_list,
diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h
index d18cdb9fc9a6..6e1c8e1b5e2b 100644
--- a/include/asm-i386/kdebug.h
+++ b/include/asm-i386/kdebug.h
@@ -38,6 +38,7 @@ enum die_val {
38 DIE_GPF, 38 DIE_GPF,
39 DIE_CALL, 39 DIE_CALL,
40 DIE_NMI_IPI, 40 DIE_NMI_IPI,
41 DIE_NMI_POST,
41 DIE_PAGE_FAULT, 42 DIE_PAGE_FAULT,
42}; 43};
43 44
diff --git a/include/asm-x86_64/kdebug.h b/include/asm-x86_64/kdebug.h
index 2b0c088e2957..e9ce163b1550 100644
--- a/include/asm-x86_64/kdebug.h
+++ b/include/asm-x86_64/kdebug.h
@@ -33,6 +33,7 @@ enum die_val {
33 DIE_GPF, 33 DIE_GPF,
34 DIE_CALL, 34 DIE_CALL,
35 DIE_NMI_IPI, 35 DIE_NMI_IPI,
36 DIE_NMI_POST,
36 DIE_PAGE_FAULT, 37 DIE_PAGE_FAULT,
37}; 38};
38 39