diff options
-rw-r--r-- | Documentation/watchdog/hpwdt.txt | 84 | ||||
-rw-r--r-- | drivers/watchdog/hpwdt.c | 59 |
2 files changed, 128 insertions, 15 deletions
diff --git a/Documentation/watchdog/hpwdt.txt b/Documentation/watchdog/hpwdt.txt new file mode 100644 index 000000000000..127839e53043 --- /dev/null +++ b/Documentation/watchdog/hpwdt.txt | |||
@@ -0,0 +1,84 @@ | |||
1 | Last reviewed: 06/02/2009 | ||
2 | |||
3 | HP iLO2 NMI Watchdog Driver | ||
4 | NMI sourcing for iLO2 based ProLiant Servers | ||
5 | Documentation and Driver by | ||
6 | Thomas Mingarelli <thomas.mingarelli@hp.com> | ||
7 | |||
8 | The HP iLO2 NMI Watchdog driver is a kernel module that provides basic | ||
9 | watchdog functionality and the added benefit of NMI sourcing. Both the | ||
10 | watchdog functionality and the NMI sourcing capability need to be enabled | ||
11 | by the user. Remember that the two modes are not dependent on one another. | ||
12 | A user can have the NMI sourcing without the watchdog timer and vice-versa. | ||
13 | |||
14 | Watchdog functionality is enabled like any other common watchdog driver. That | ||
15 | is, an application needs to be started that kicks off the watchdog timer. A | ||
16 | basic application exists in the Documentation/watchdog/src directory called | ||
17 | watchdog-test.c. Simply compile the C file and kick it off. If the system | ||
18 | gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will | ||
19 | not be updated in a timely fashion and a hardware system reset (also known as | ||
20 | an Automatic Server Recovery (ASR)) event will occur. | ||
21 | |||
22 | The hpwdt driver also has three (3) module parameters. They are the following: | ||
23 | |||
24 | soft_margin - allows the user to set the watchdog timer value | ||
25 | allow_kdump - allows the user to save off a kernel dump image after an NMI | ||
26 | nowayout - basic watchdog parameter that does not allow the timer to | ||
27 | be restarted or an impending ASR to be escaped. | ||
28 | |||
29 | NOTE: More information about watchdog drivers in general, including the ioctl | ||
30 | interface to /dev/watchdog, can be found in | ||
31 | Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt. | ||
32 | |||
33 | The NMI sourcing capability is disabled when the driver discovers that the | ||
34 | nmi_watchdog is turned on (nmi_watchdog = 1). This is due to the inability to | ||
35 | distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the | ||
36 | Linux kernel. What this means is that the hpwdt nmi handler code is called | ||
37 | each time the NMI signal fires off. This could amount to several thousands of | ||
38 | NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and | ||
39 | confused" message in the logs or if the system gets into a hung state, then | ||
40 | the user should reboot with nmi_watchdog=0. | ||
41 | |||
42 | 1. If the kernel has not been booted with nmi_watchdog turned off then | ||
43 | edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the | ||
44 | currently booting kernel line. | ||
45 | 2. Reboot the server. | ||
46 | |||
47 | Now, the hpwdt can successfully receive and source the NMI and provide a log | ||
48 | message that details the reason for the NMI (as determined by the HP BIOS). | ||
49 | |||
50 | Below is a list of NMIs the HP BIOS understands along with the associated | ||
51 | code (reason): | ||
52 | |||
53 | No source found 00h | ||
54 | |||
55 | Uncorrectable Memory Error 01h | ||
56 | |||
57 | ASR NMI 1Bh | ||
58 | |||
59 | PCI Parity Error 20h | ||
60 | |||
61 | NMI Button Press 27h | ||
62 | |||
63 | SB_BUS_NMI 28h | ||
64 | |||
65 | ILO Doorbell NMI 29h | ||
66 | |||
67 | ILO IOP NMI 2Ah | ||
68 | |||
69 | ILO Watchdog NMI 2Bh | ||
70 | |||
71 | Proc Throt NMI 2Ch | ||
72 | |||
73 | Front Side Bus NMI 2Dh | ||
74 | |||
75 | PCI Express Error 2Fh | ||
76 | |||
77 | DMA controller NMI 30h | ||
78 | |||
79 | Hypertransport/CSI Error 31h | ||
80 | |||
81 | |||
82 | |||
83 | -- Tom Mingarelli | ||
84 | (thomas.mingarelli@hp.com) | ||
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 3137361ccbfe..c0b9169ba5d5 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
20 | #include <linux/io.h> | 20 | #include <linux/io.h> |
21 | #include <linux/irq.h> | 21 | #include <linux/irq.h> |
22 | #include <linux/nmi.h> | ||
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
23 | #include <linux/miscdevice.h> | 24 | #include <linux/miscdevice.h> |
24 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
@@ -47,7 +48,7 @@ | |||
47 | #define PCI_BIOS32_PARAGRAPH_LEN 16 | 48 | #define PCI_BIOS32_PARAGRAPH_LEN 16 |
48 | #define PCI_ROM_BASE1 0x000F0000 | 49 | #define PCI_ROM_BASE1 0x000F0000 |
49 | #define ROM_SIZE 0x10000 | 50 | #define ROM_SIZE 0x10000 |
50 | #define HPWDT_VERSION "1.01" | 51 | #define HPWDT_VERSION "1.1.1" |
51 | 52 | ||
52 | struct bios32_service_dir { | 53 | struct bios32_service_dir { |
53 | u32 signature; | 54 | u32 signature; |
@@ -119,6 +120,7 @@ static int nowayout = WATCHDOG_NOWAYOUT; | |||
119 | static char expect_release; | 120 | static char expect_release; |
120 | static unsigned long hpwdt_is_open; | 121 | static unsigned long hpwdt_is_open; |
121 | static unsigned int allow_kdump; | 122 | static unsigned int allow_kdump; |
123 | static int hpwdt_nmi_sourcing; | ||
122 | 124 | ||
123 | static void __iomem *pci_mem_addr; /* the PCI-memory address */ | 125 | static void __iomem *pci_mem_addr; /* the PCI-memory address */ |
124 | static unsigned long __iomem *hpwdt_timer_reg; | 126 | static unsigned long __iomem *hpwdt_timer_reg; |
@@ -468,21 +470,22 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason, | |||
468 | if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI) | 470 | if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI) |
469 | return NOTIFY_OK; | 471 | return NOTIFY_OK; |
470 | 472 | ||
471 | spin_lock_irqsave(&rom_lock, rom_pl); | 473 | if (hpwdt_nmi_sourcing) { |
472 | if (!die_nmi_called) | 474 | spin_lock_irqsave(&rom_lock, rom_pl); |
473 | asminline_call(&cmn_regs, cru_rom_addr); | 475 | if (!die_nmi_called) |
474 | die_nmi_called = 1; | 476 | asminline_call(&cmn_regs, cru_rom_addr); |
475 | spin_unlock_irqrestore(&rom_lock, rom_pl); | 477 | die_nmi_called = 1; |
476 | if (cmn_regs.u1.ral == 0) { | 478 | spin_unlock_irqrestore(&rom_lock, rom_pl); |
477 | printk(KERN_WARNING "hpwdt: An NMI occurred, " | 479 | if (cmn_regs.u1.ral == 0) { |
478 | "but unable to determine source.\n"); | 480 | printk(KERN_WARNING "hpwdt: An NMI occurred, " |
479 | } else { | 481 | "but unable to determine source.\n"); |
480 | if (allow_kdump) | 482 | } else { |
481 | hpwdt_stop(); | 483 | if (allow_kdump) |
482 | panic("An NMI occurred, please see the Integrated " | 484 | hpwdt_stop(); |
483 | "Management Log for details.\n"); | 485 | panic("An NMI occurred, please see the Integrated " |
486 | "Management Log for details.\n"); | ||
487 | } | ||
484 | } | 488 | } |
485 | |||
486 | return NOTIFY_OK; | 489 | return NOTIFY_OK; |
487 | } | 490 | } |
488 | 491 | ||
@@ -627,12 +630,38 @@ static struct notifier_block die_notifier = { | |||
627 | * Init & Exit | 630 | * Init & Exit |
628 | */ | 631 | */ |
629 | 632 | ||
633 | #ifdef ARCH_HAS_NMI_WATCHDOG | ||
634 | static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev) | ||
635 | { | ||
636 | /* | ||
637 | * If nmi_watchdog is turned off then we can turn on | ||
638 | * our nmi sourcing capability. | ||
639 | */ | ||
640 | if (!nmi_watchdog_active()) | ||
641 | hpwdt_nmi_sourcing = 1; | ||
642 | else | ||
643 | dev_warn(&dev->dev, "NMI sourcing is disabled. To enable this " | ||
644 | "functionality you must reboot with nmi_watchdog=0.\n"); | ||
645 | } | ||
646 | #else | ||
647 | static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev) | ||
648 | { | ||
649 | dev_warn(&dev->dev, "NMI sourcing is disabled. " | ||
650 | "Your kernel does not support a NMI Watchdog.\n"); | ||
651 | } | ||
652 | #endif | ||
653 | |||
630 | static int __devinit hpwdt_init_one(struct pci_dev *dev, | 654 | static int __devinit hpwdt_init_one(struct pci_dev *dev, |
631 | const struct pci_device_id *ent) | 655 | const struct pci_device_id *ent) |
632 | { | 656 | { |
633 | int retval; | 657 | int retval; |
634 | 658 | ||
635 | /* | 659 | /* |
660 | * Check if we can do NMI sourcing or not | ||
661 | */ | ||
662 | hpwdt_check_nmi_sourcing(dev); | ||
663 | |||
664 | /* | ||
636 | * First let's find out if we are on an iLO2 server. We will | 665 | * First let's find out if we are on an iLO2 server. We will |
637 | * not run on a legacy ASM box. | 666 | * not run on a legacy ASM box. |
638 | * So we only support the G5 ProLiant servers and higher. | 667 | * So we only support the G5 ProLiant servers and higher. |