diff options
author | Dave Jiang <djiang@mvista.com> | 2007-07-19 04:49:46 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-19 13:04:53 -0400 |
commit | c0d121720220584bba2876b032e58a076b843fa1 (patch) | |
tree | 13ba24c6d875ded1494e1560f336b8551c663ef1 | |
parent | 28f96eeafc89643d411d54c258788a8573576127 (diff) |
drivers/edac: add new nmi rescan
Provides a way for NMI-reported errors on x86 to notify the EDAC
subsystem of pending ECC errors by writing to a software state variable.
Here's the reworked patch. I added an EDAC stub to the kernel so we can
have variables that are in the kernel even if EDAC is a module. I also
implemented the idea of using the chip driver to select the error detection
mode via a module parameter, which eliminates the kernel compile option.
Please review/test. Thx!
Also, I only made changes to some of the chipset drivers since I am
unfamiliar with the other ones. We can add similar changes as we go.
Signed-off-by: Dave Jiang <djiang@mvista.com>
Signed-off-by: Douglas Thompson <dougthompson@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | arch/i386/kernel/traps.c | 12 | ||||
-rw-r--r-- | arch/x86_64/kernel/traps.c | 11 | ||||
-rw-r--r-- | drivers/edac/Kconfig | 11 | ||||
-rw-r--r-- | drivers/edac/Makefile | 2 | ||||
-rw-r--r-- | drivers/edac/e752x_edac.c | 14 | ||||
-rw-r--r-- | drivers/edac/e7xxx_edac.c | 14 | ||||
-rw-r--r-- | drivers/edac/edac_mc.c | 3 | ||||
-rw-r--r-- | drivers/edac/edac_module.c | 24 | ||||
-rw-r--r-- | drivers/edac/edac_stub.c | 42 | ||||
-rw-r--r-- | drivers/edac/i5000_edac.c | 13 | ||||
-rw-r--r-- | include/linux/edac.h | 29 |
11 files changed, 160 insertions, 15 deletions
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 109ebbcde585..3e7753c78b9b 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c | |||
@@ -41,6 +41,10 @@ | |||
41 | #include <linux/mca.h> | 41 | #include <linux/mca.h> |
42 | #endif | 42 | #endif |
43 | 43 | ||
44 | #if defined(CONFIG_EDAC) | ||
45 | #include <linux/edac.h> | ||
46 | #endif | ||
47 | |||
44 | #include <asm/processor.h> | 48 | #include <asm/processor.h> |
45 | #include <asm/system.h> | 49 | #include <asm/system.h> |
46 | #include <asm/io.h> | 50 | #include <asm/io.h> |
@@ -638,6 +642,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) | |||
638 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " | 642 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " |
639 | "CPU %d.\n", reason, smp_processor_id()); | 643 | "CPU %d.\n", reason, smp_processor_id()); |
640 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | 644 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); |
645 | |||
646 | #if defined(CONFIG_EDAC) | ||
647 | if(edac_handler_set()) { | ||
648 | edac_atomic_assert_error(); | ||
649 | return; | ||
650 | } | ||
651 | #endif | ||
652 | |||
641 | if (panic_on_unrecovered_nmi) | 653 | if (panic_on_unrecovered_nmi) |
642 | panic("NMI: Not continuing"); | 654 | panic("NMI: Not continuing"); |
643 | 655 | ||
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 74cbeb2e99a6..8713ad4a4db1 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c | |||
@@ -34,6 +34,10 @@ | |||
34 | #include <linux/bug.h> | 34 | #include <linux/bug.h> |
35 | #include <linux/kdebug.h> | 35 | #include <linux/kdebug.h> |
36 | 36 | ||
37 | #if defined(CONFIG_EDAC) | ||
38 | #include <linux/edac.h> | ||
39 | #endif | ||
40 | |||
37 | #include <asm/system.h> | 41 | #include <asm/system.h> |
38 | #include <asm/io.h> | 42 | #include <asm/io.h> |
39 | #include <asm/atomic.h> | 43 | #include <asm/atomic.h> |
@@ -719,6 +723,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) | |||
719 | reason); | 723 | reason); |
720 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | 724 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); |
721 | 725 | ||
726 | #if defined(CONFIG_EDAC) | ||
727 | if(edac_handler_set()) { | ||
728 | edac_atomic_assert_error(); | ||
729 | return; | ||
730 | } | ||
731 | #endif | ||
732 | |||
722 | if (panic_on_unrecovered_nmi) | 733 | if (panic_on_unrecovered_nmi) |
723 | panic("NMI: Not continuing"); | 734 | panic("NMI: Not continuing"); |
724 | 735 | ||
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index e8c4a2bedaa1..3cfd9065a9b4 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig | |||
@@ -109,15 +109,4 @@ config EDAC_I5000 | |||
109 | Support for error detection and correction the Intel | 109 | Support for error detection and correction the Intel |
110 | Greekcreek/Blackford chipsets. | 110 | Greekcreek/Blackford chipsets. |
111 | 111 | ||
112 | choice | ||
113 | prompt "Error detecting method" | ||
114 | default EDAC_POLL | ||
115 | |||
116 | config EDAC_POLL | ||
117 | bool "Poll for errors" | ||
118 | help | ||
119 | Poll the chipset periodically to detect errors. | ||
120 | |||
121 | endchoice | ||
122 | |||
123 | endif # EDAC | 112 | endif # EDAC |
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 773472cef76e..19d5ac724098 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile | |||
@@ -5,9 +5,9 @@ | |||
5 | # This file may be distributed under the terms of the | 5 | # This file may be distributed under the terms of the |
6 | # GNU General Public License. | 6 | # GNU General Public License. |
7 | # | 7 | # |
8 | # $Id: Makefile,v 1.4.2.3 2005/07/08 22:05:38 dsp_llnl Exp $ | ||
9 | 8 | ||
10 | 9 | ||
10 | obj-$(CONFIG_EDAC) := edac_stub.o | ||
11 | obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o | 11 | obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o |
12 | 12 | ||
13 | edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o | 13 | edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o |
diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c index 8bcc887692ab..f51e79a6f891 100644 --- a/drivers/edac/e752x_edac.c +++ b/drivers/edac/e752x_edac.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/pci.h> | 22 | #include <linux/pci.h> |
23 | #include <linux/pci_ids.h> | 23 | #include <linux/pci_ids.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/edac.h> | ||
25 | #include "edac_mc.h" | 26 | #include "edac_mc.h" |
26 | 27 | ||
27 | #define E752X_REVISION " Ver: 2.0.1 " __DATE__ | 28 | #define E752X_REVISION " Ver: 2.0.1 " __DATE__ |
@@ -948,6 +949,16 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx) | |||
948 | debugf0("%s(): mci\n", __func__); | 949 | debugf0("%s(): mci\n", __func__); |
949 | debugf0("Starting Probe1\n"); | 950 | debugf0("Starting Probe1\n"); |
950 | 951 | ||
952 | /* make sure error reporting method is sane */ | ||
953 | switch(edac_op_state) { | ||
954 | case EDAC_OPSTATE_POLL: | ||
955 | case EDAC_OPSTATE_NMI: | ||
956 | break; | ||
957 | default: | ||
958 | edac_op_state = EDAC_OPSTATE_POLL; | ||
959 | break; | ||
960 | } | ||
961 | |||
951 | /* check to see if device 0 function 1 is enabled; if it isn't, we | 962 | /* check to see if device 0 function 1 is enabled; if it isn't, we |
952 | * assume the BIOS has reserved it for a reason and is expecting | 963 | * assume the BIOS has reserved it for a reason and is expecting |
953 | * exclusive access, we take care not to violate that assumption and | 964 | * exclusive access, we take care not to violate that assumption and |
@@ -1123,4 +1134,5 @@ MODULE_DESCRIPTION("MC support for Intel e752x memory controllers"); | |||
1123 | module_param(force_function_unhide, int, 0444); | 1134 | module_param(force_function_unhide, int, 0444); |
1124 | MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:" | 1135 | MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:" |
1125 | " 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access"); | 1136 | " 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access"); |
1126 | 1137 | module_param(edac_op_state, int, 0444); | |
1138 | MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); | ||
diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c index 310d91b41c96..0827b9a7b386 100644 --- a/drivers/edac/e7xxx_edac.c +++ b/drivers/edac/e7xxx_edac.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/pci.h> | 27 | #include <linux/pci.h> |
28 | #include <linux/pci_ids.h> | 28 | #include <linux/pci_ids.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/edac.h> | ||
30 | #include "edac_mc.h" | 31 | #include "edac_mc.h" |
31 | 32 | ||
32 | #define E7XXX_REVISION " Ver: 2.0.1 " __DATE__ | 33 | #define E7XXX_REVISION " Ver: 2.0.1 " __DATE__ |
@@ -419,6 +420,17 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx) | |||
419 | struct e7xxx_error_info discard; | 420 | struct e7xxx_error_info discard; |
420 | 421 | ||
421 | debugf0("%s(): mci\n", __func__); | 422 | debugf0("%s(): mci\n", __func__); |
423 | |||
424 | /* make sure error reporting method is sane */ | ||
425 | switch(edac_op_state) { | ||
426 | case EDAC_OPSTATE_POLL: | ||
427 | case EDAC_OPSTATE_NMI: | ||
428 | break; | ||
429 | default: | ||
430 | edac_op_state = EDAC_OPSTATE_POLL; | ||
431 | break; | ||
432 | } | ||
433 | |||
422 | pci_read_config_dword(pdev, E7XXX_DRC, &drc); | 434 | pci_read_config_dword(pdev, E7XXX_DRC, &drc); |
423 | 435 | ||
424 | drc_chan = dual_channel_active(drc, dev_idx); | 436 | drc_chan = dual_channel_active(drc, dev_idx); |
@@ -565,3 +577,5 @@ MODULE_LICENSE("GPL"); | |||
565 | MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n" | 577 | MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n" |
566 | "Based on.work by Dan Hollis et al"); | 578 | "Based on.work by Dan Hollis et al"); |
567 | MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers"); | 579 | MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers"); |
580 | module_param(edac_op_state, int, 0444); | ||
581 | MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); | ||
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index eae1ca1caebd..81a28d6662e4 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/list.h> | 27 | #include <linux/list.h> |
28 | #include <linux/sysdev.h> | 28 | #include <linux/sysdev.h> |
29 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
30 | #include <linux/edac.h> | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | #include <asm/page.h> | 32 | #include <asm/page.h> |
32 | #include <asm/edac.h> | 33 | #include <asm/edac.h> |
@@ -241,6 +242,7 @@ static int add_mc_to_global_list (struct mem_ctl_info *mci) | |||
241 | } | 242 | } |
242 | 243 | ||
243 | list_add_tail_rcu(&mci->link, insert_before); | 244 | list_add_tail_rcu(&mci->link, insert_before); |
245 | atomic_inc(&edac_handlers); | ||
244 | return 0; | 246 | return 0; |
245 | 247 | ||
246 | fail0: | 248 | fail0: |
@@ -267,6 +269,7 @@ static void complete_mc_list_del(struct rcu_head *head) | |||
267 | 269 | ||
268 | static void del_mc_from_global_list(struct mem_ctl_info *mci) | 270 | static void del_mc_from_global_list(struct mem_ctl_info *mci) |
269 | { | 271 | { |
272 | atomic_dec(&edac_handlers); | ||
270 | list_del_rcu(&mci->link); | 273 | list_del_rcu(&mci->link); |
271 | init_completion(&mci->complete); | 274 | init_completion(&mci->complete); |
272 | call_rcu(&mci->rcu, complete_mc_list_del); | 275 | call_rcu(&mci->rcu, complete_mc_list_del); |
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 3cd3a236821c..89c96ecbf04e 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c | |||
@@ -1,6 +1,7 @@ | |||
1 | 1 | ||
2 | #include <linux/freezer.h> | 2 | #include <linux/freezer.h> |
3 | #include <linux/kthread.h> | 3 | #include <linux/kthread.h> |
4 | #include <linux/edac.h> | ||
4 | 5 | ||
5 | #include "edac_mc.h" | 6 | #include "edac_mc.h" |
6 | #include "edac_module.h" | 7 | #include "edac_module.h" |
@@ -102,6 +103,25 @@ static void do_edac_check(void) | |||
102 | } | 103 | } |
103 | 104 | ||
104 | /* | 105 | /* |
106 | * handler for EDAC to check if NMI type handler has asserted interrupt | ||
107 | */ | ||
108 | static int edac_assert_error_check_and_clear(void) | ||
109 | { | ||
110 | int vreg; | ||
111 | |||
112 | if(edac_op_state == EDAC_OPSTATE_POLL) | ||
113 | return 1; | ||
114 | |||
115 | vreg = atomic_read(&edac_err_assert); | ||
116 | if(vreg) { | ||
117 | atomic_set(&edac_err_assert, 0); | ||
118 | return 1; | ||
119 | } | ||
120 | |||
121 | return 0; | ||
122 | } | ||
123 | |||
124 | /* | ||
105 | * Action thread for EDAC to perform the POLL operations | 125 | * Action thread for EDAC to perform the POLL operations |
106 | */ | 126 | */ |
107 | static int edac_kernel_thread(void *arg) | 127 | static int edac_kernel_thread(void *arg) |
@@ -109,8 +129,8 @@ static int edac_kernel_thread(void *arg) | |||
109 | int msec; | 129 | int msec; |
110 | 130 | ||
111 | while (!kthread_should_stop()) { | 131 | while (!kthread_should_stop()) { |
112 | 132 | if(edac_assert_error_check_and_clear()) | |
113 | do_edac_check(); | 133 | do_edac_check(); |
114 | 134 | ||
115 | /* goto sleep for the interval */ | 135 | /* goto sleep for the interval */ |
116 | msec = (HZ * edac_get_poll_msec()) / 1000; | 136 | msec = (HZ * edac_get_poll_msec()) / 1000; |
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c new file mode 100644 index 000000000000..91a038d2f652 --- /dev/null +++ b/drivers/edac/edac_stub.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* | ||
2 | * common EDAC components that must be in kernel | ||
3 | * | ||
4 | * Author: Dave Jiang <djiang@mvista.com> | ||
5 | * | ||
6 | * 2007 (c) MontaVista Software, Inc. This file is licensed under | ||
7 | * the terms of the GNU General Public License version 2. This program | ||
8 | * is licensed "as is" without any warranty of any kind, whether express | ||
9 | * or implied. | ||
10 | * | ||
11 | */ | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/edac.h> | ||
14 | #include <asm/atomic.h> | ||
15 | #include <asm/edac.h> | ||
16 | |||
17 | int edac_op_state = EDAC_OPSTATE_INVAL; | ||
18 | EXPORT_SYMBOL(edac_op_state); | ||
19 | |||
20 | atomic_t edac_handlers = ATOMIC_INIT(0); | ||
21 | EXPORT_SYMBOL(edac_handlers); | ||
22 | |||
23 | atomic_t edac_err_assert = ATOMIC_INIT(0); | ||
24 | EXPORT_SYMBOL(edac_err_assert); | ||
25 | |||
26 | inline int edac_handler_set(void) | ||
27 | { | ||
28 | if (edac_op_state == EDAC_OPSTATE_POLL) | ||
29 | return 0; | ||
30 | |||
31 | return atomic_read(&edac_handlers); | ||
32 | } | ||
33 | EXPORT_SYMBOL(edac_handler_set); | ||
34 | |||
35 | /* | ||
36 | * handler for NMI type of interrupts to assert error | ||
37 | */ | ||
38 | inline void edac_atomic_assert_error(void) | ||
39 | { | ||
40 | atomic_set(&edac_err_assert, 1); | ||
41 | } | ||
42 | EXPORT_SYMBOL(edac_atomic_assert_error); | ||
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c index 4d7e786065aa..8eb8b6e5b32c 100644 --- a/drivers/edac/i5000_edac.c +++ b/drivers/edac/i5000_edac.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/pci.h> | 19 | #include <linux/pci.h> |
20 | #include <linux/pci_ids.h> | 20 | #include <linux/pci_ids.h> |
21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
22 | #include <linux/edac.h> | ||
22 | #include <asm/mmzone.h> | 23 | #include <asm/mmzone.h> |
23 | 24 | ||
24 | #include "edac_mc.h" | 25 | #include "edac_mc.h" |
@@ -1285,6 +1286,16 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx) | |||
1285 | if (PCI_FUNC(pdev->devfn) != 0) | 1286 | if (PCI_FUNC(pdev->devfn) != 0) |
1286 | return -ENODEV; | 1287 | return -ENODEV; |
1287 | 1288 | ||
1289 | /* make sure error reporting method is sane */ | ||
1290 | switch(edac_op_state) { | ||
1291 | case EDAC_OPSTATE_POLL: | ||
1292 | case EDAC_OPSTATE_NMI: | ||
1293 | break; | ||
1294 | default: | ||
1295 | edac_op_state = EDAC_OPSTATE_POLL; | ||
1296 | break; | ||
1297 | } | ||
1298 | |||
1288 | /* Ask the devices for the number of CSROWS and CHANNELS so | 1299 | /* Ask the devices for the number of CSROWS and CHANNELS so |
1289 | * that we can calculate the memory resources, etc | 1300 | * that we can calculate the memory resources, etc |
1290 | * | 1301 | * |
@@ -1475,3 +1486,5 @@ MODULE_AUTHOR | |||
1475 | ("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>"); | 1486 | ("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>"); |
1476 | MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - " | 1487 | MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - " |
1477 | I5000_REVISION); | 1488 | I5000_REVISION); |
1489 | module_param(edac_op_state, int, 0444); | ||
1490 | MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); | ||
diff --git a/include/linux/edac.h b/include/linux/edac.h new file mode 100644 index 000000000000..c8b92d79f884 --- /dev/null +++ b/include/linux/edac.h | |||
@@ -0,0 +1,29 @@ | |||
1 | /* | ||
2 | * Generic EDAC defs | ||
3 | * | ||
4 | * Author: Dave Jiang <djiang@mvista.com> | ||
5 | * | ||
6 | * 2006-2007 (c) MontaVista Software, Inc. This file is licensed under | ||
7 | * the terms of the GNU General Public License version 2. This program | ||
8 | * is licensed "as is" without any warranty of any kind, whether express | ||
9 | * or implied. | ||
10 | * | ||
11 | */ | ||
12 | #ifndef _LINUX_EDAC_H_ | ||
13 | #define _LINUX_EDAC_H_ | ||
14 | |||
15 | #include <asm/atomic.h> | ||
16 | |||
17 | #define EDAC_OPSTATE_INVAL -1 | ||
18 | #define EDAC_OPSTATE_POLL 0 | ||
19 | #define EDAC_OPSTATE_NMI 1 | ||
20 | #define EDAC_OPSTATE_INT 2 | ||
21 | |||
22 | extern int edac_op_state; | ||
23 | extern atomic_t edac_handlers; | ||
24 | extern atomic_t edac_err_assert; | ||
25 | |||
26 | extern int edac_handler_set(void); | ||
27 | extern void edac_atomic_assert_error(void); | ||
28 | |||
29 | #endif | ||