aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Mingarelli <thomas.mingarelli@hp.com>2009-06-04 15:50:45 -0400
committerWim Van Sebroeck <wim@iguana.be>2009-06-18 03:32:06 -0400
commit47bece87b14b866872b52ff04d469832e4936756 (patch)
tree812f6e1856cb322f1246a761a46ef20295b4689b
parent55e8ddecec6a9dbe35a99d03cc4189fd7c56e600 (diff)
[WATCHDOG] hpwdt: Add NMI sourcing
Add NMI sourcing functionality (Can only be active if nmi_watchdog is inactive). Signed-off-by: Thomas Mingarelli <thomas.mingarelli@hp.com> Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
-rw-r--r--Documentation/watchdog/hpwdt.txt84
-rw-r--r--drivers/watchdog/hpwdt.c59
2 files changed, 128 insertions, 15 deletions
diff --git a/Documentation/watchdog/hpwdt.txt b/Documentation/watchdog/hpwdt.txt
new file mode 100644
index 000000000000..127839e53043
--- /dev/null
+++ b/Documentation/watchdog/hpwdt.txt
@@ -0,0 +1,84 @@
1Last reviewed: 06/02/2009
2
3 HP iLO2 NMI Watchdog Driver
4 NMI sourcing for iLO2 based ProLiant Servers
5 Documentation and Driver by
6 Thomas Mingarelli <thomas.mingarelli@hp.com>
7
8 The HP iLO2 NMI Watchdog driver is a kernel module that provides basic
9 watchdog functionality and the added benefit of NMI sourcing. Both the
10 watchdog functionality and the NMI sourcing capability need to be enabled
11 by the user. Remember that the two modes are not dependent on one another.
12 A user can have the NMI sourcing without the watchdog timer and vice-versa.
13
14 Watchdog functionality is enabled like any other common watchdog driver. That
15 is, an application needs to be started that kicks off the watchdog timer. A
16 basic application exists in the Documentation/watchdog/src directory called
17 watchdog-test.c. Simply compile the C file and kick it off. If the system
18 gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will
19 not be updated in a timely fashion and a hardware system reset (also known as
20 an Automatic Server Recovery (ASR)) event will occur.
21
22 The hpwdt driver also has three (3) module parameters. They are the following:
23
24 soft_margin - allows the user to set the watchdog timer value
25 allow_kdump - allows the user to save off a kernel dump image after an NMI
26 nowayout - basic watchdog parameter that does not allow the timer to
27 be restarted or an impending ASR to be escaped.
28
29 NOTE: More information about watchdog drivers in general, including the ioctl
30 interface to /dev/watchdog, can be found in
31 Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt.
32
33 The NMI sourcing capability is disabled when the driver discovers that the
34 nmi_watchdog is turned on (nmi_watchdog = 1). This is due to the inability to
35 distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the
36 Linux kernel. What this means is that the hpwdt nmi handler code is called
37 each time the NMI signal fires off. This could amount to several thousands of
38 NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and
39 confused" message in the logs or if the system gets into a hung state, then
40 the user should reboot with nmi_watchdog=0.
41
42 1. If the kernel has not been booted with nmi_watchdog turned off then
43 edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the
44 currently booting kernel line.
45 2. Reboot the server
46
47 Now, the hpwdt can successfully receive and source the NMI and provide a log
48 message that details the reason for the NMI (as determined by the HP BIOS).
49
50 Below is a list of NMIs the HP BIOS understands along with the associated
51 code (reason):
52
53 No source found 00h
54
55 Uncorrectable Memory Error 01h
56
57 ASR NMI 1Bh
58
59 PCI Parity Error 20h
60
61 NMI Button Press 27h
62
63 SB_BUS_NMI 28h
64
65 ILO Doorbell NMI 29h
66
67 ILO IOP NMI 2Ah
68
69 ILO Watchdog NMI 2Bh
70
71 Proc Throt NMI 2Ch
72
73 Front Side Bus NMI 2Dh
74
75 PCI Express Error 2Fh
76
77 DMA controller NMI 30h
78
79 Hypertransport/CSI Error 31h
80
81
82
83 -- Tom Mingarelli
84 (thomas.mingarelli@hp.com)
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 3137361ccbfe..c0b9169ba5d5 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -19,6 +19,7 @@
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/io.h> 20#include <linux/io.h>
21#include <linux/irq.h> 21#include <linux/irq.h>
22#include <linux/nmi.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/miscdevice.h> 24#include <linux/miscdevice.h>
24#include <linux/mm.h> 25#include <linux/mm.h>
@@ -47,7 +48,7 @@
47#define PCI_BIOS32_PARAGRAPH_LEN 16 48#define PCI_BIOS32_PARAGRAPH_LEN 16
48#define PCI_ROM_BASE1 0x000F0000 49#define PCI_ROM_BASE1 0x000F0000
49#define ROM_SIZE 0x10000 50#define ROM_SIZE 0x10000
50#define HPWDT_VERSION "1.01" 51#define HPWDT_VERSION "1.1.1"
51 52
52struct bios32_service_dir { 53struct bios32_service_dir {
53 u32 signature; 54 u32 signature;
@@ -119,6 +120,7 @@ static int nowayout = WATCHDOG_NOWAYOUT;
119static char expect_release; 120static char expect_release;
120static unsigned long hpwdt_is_open; 121static unsigned long hpwdt_is_open;
121static unsigned int allow_kdump; 122static unsigned int allow_kdump;
123static int hpwdt_nmi_sourcing;
122 124
123static void __iomem *pci_mem_addr; /* the PCI-memory address */ 125static void __iomem *pci_mem_addr; /* the PCI-memory address */
124static unsigned long __iomem *hpwdt_timer_reg; 126static unsigned long __iomem *hpwdt_timer_reg;
@@ -468,21 +470,22 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason,
468 if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI) 470 if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI)
469 return NOTIFY_OK; 471 return NOTIFY_OK;
470 472
471 spin_lock_irqsave(&rom_lock, rom_pl); 473 if (hpwdt_nmi_sourcing) {
472 if (!die_nmi_called) 474 spin_lock_irqsave(&rom_lock, rom_pl);
473 asminline_call(&cmn_regs, cru_rom_addr); 475 if (!die_nmi_called)
474 die_nmi_called = 1; 476 asminline_call(&cmn_regs, cru_rom_addr);
475 spin_unlock_irqrestore(&rom_lock, rom_pl); 477 die_nmi_called = 1;
476 if (cmn_regs.u1.ral == 0) { 478 spin_unlock_irqrestore(&rom_lock, rom_pl);
477 printk(KERN_WARNING "hpwdt: An NMI occurred, " 479 if (cmn_regs.u1.ral == 0) {
478 "but unable to determine source.\n"); 480 printk(KERN_WARNING "hpwdt: An NMI occurred, "
479 } else { 481 "but unable to determine source.\n");
480 if (allow_kdump) 482 } else {
481 hpwdt_stop(); 483 if (allow_kdump)
482 panic("An NMI occurred, please see the Integrated " 484 hpwdt_stop();
483 "Management Log for details.\n"); 485 panic("An NMI occurred, please see the Integrated "
486 "Management Log for details.\n");
487 }
484 } 488 }
485
486 return NOTIFY_OK; 489 return NOTIFY_OK;
487} 490}
488 491
@@ -627,12 +630,38 @@ static struct notifier_block die_notifier = {
627 * Init & Exit 630 * Init & Exit
628 */ 631 */
629 632
633#ifdef ARCH_HAS_NMI_WATCHDOG
634static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev)
635{
636 /*
637 * If nmi_watchdog is turned off then we can turn on
638 * our nmi sourcing capability.
639 */
640 if (!nmi_watchdog_active())
641 hpwdt_nmi_sourcing = 1;
642 else
643 dev_warn(&dev->dev, "NMI sourcing is disabled. To enable this "
644 "functionality you must reboot with nmi_watchdog=0.\n");
645}
646#else
647static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev)
648{
649 dev_warn(&dev->dev, "NMI sourcing is disabled. "
650 "Your kernel does not support a NMI Watchdog.\n");
651}
652#endif
653
630static int __devinit hpwdt_init_one(struct pci_dev *dev, 654static int __devinit hpwdt_init_one(struct pci_dev *dev,
631 const struct pci_device_id *ent) 655 const struct pci_device_id *ent)
632{ 656{
633 int retval; 657 int retval;
634 658
635 /* 659 /*
660 * Check if we can do NMI sourcing or not
661 */
662 hpwdt_check_nmi_sourcing(dev);
663
664 /*
636 * First let's find out if we are on an iLO2 server. We will 665 * First let's find out if we are on an iLO2 server. We will
637 * not run on a legacy ASM box. 666 * not run on a legacy ASM box.
638 * So we only support the G5 ProLiant servers and higher. 667 * So we only support the G5 ProLiant servers and higher.