aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Dave Jiang <djiang@mvista.com> 2007-07-19 04:49:46 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-07-19 13:04:53 -0400
commit c0d121720220584bba2876b032e58a076b843fa1 (patch)
tree 13ba24c6d875ded1494e1560f336b8551c663ef1
parent 28f96eeafc89643d411d54c258788a8573576127 (diff)
drivers/edac: add new nmi rescan
Provides a way for NMI-reported errors on x86 to notify the EDAC subsystem of pending ECC errors by writing to a software state variable.
Here's the reworked patch. I added an EDAC stub to the kernel so we can have variables that are in the kernel even if EDAC is a module. I also implemented the idea of using the chip driver to select the error detection mode via a module parameter, and eliminated the kernel compile option. Please review/test. Thx!
Also, I only made changes to some of the chipset drivers since I am unfamiliar with the other ones. We can add similar changes as we go.
Signed-off-by: Dave Jiang <djiang@mvista.com>
Signed-off-by: Douglas Thompson <dougthompson@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- arch/i386/kernel/traps.c 12
-rw-r--r-- arch/x86_64/kernel/traps.c 11
-rw-r--r-- drivers/edac/Kconfig 11
-rw-r--r-- drivers/edac/Makefile 2
-rw-r--r-- drivers/edac/e752x_edac.c 14
-rw-r--r-- drivers/edac/e7xxx_edac.c 14
-rw-r--r-- drivers/edac/edac_mc.c 3
-rw-r--r-- drivers/edac/edac_module.c 24
-rw-r--r-- drivers/edac/edac_stub.c 42
-rw-r--r-- drivers/edac/i5000_edac.c 13
-rw-r--r-- include/linux/edac.h 29
11 files changed, 160 insertions, 15 deletions
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 109ebbcde585..3e7753c78b9b 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -41,6 +41,10 @@
41#include <linux/mca.h> 41#include <linux/mca.h>
42#endif 42#endif
43 43
44#if defined(CONFIG_EDAC)
45#include <linux/edac.h>
46#endif
47
44#include <asm/processor.h> 48#include <asm/processor.h>
45#include <asm/system.h> 49#include <asm/system.h>
46#include <asm/io.h> 50#include <asm/io.h>
@@ -638,6 +642,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
638 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " 642 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
639 "CPU %d.\n", reason, smp_processor_id()); 643 "CPU %d.\n", reason, smp_processor_id());
640 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); 644 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
645
646#if defined(CONFIG_EDAC)
647 if(edac_handler_set()) {
648 edac_atomic_assert_error();
649 return;
650 }
651#endif
652
641 if (panic_on_unrecovered_nmi) 653 if (panic_on_unrecovered_nmi)
642 panic("NMI: Not continuing"); 654 panic("NMI: Not continuing");
643 655
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 74cbeb2e99a6..8713ad4a4db1 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -34,6 +34,10 @@
34#include <linux/bug.h> 34#include <linux/bug.h>
35#include <linux/kdebug.h> 35#include <linux/kdebug.h>
36 36
37#if defined(CONFIG_EDAC)
38#include <linux/edac.h>
39#endif
40
37#include <asm/system.h> 41#include <asm/system.h>
38#include <asm/io.h> 42#include <asm/io.h>
39#include <asm/atomic.h> 43#include <asm/atomic.h>
@@ -719,6 +723,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
719 reason); 723 reason);
720 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); 724 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
721 725
726#if defined(CONFIG_EDAC)
727 if(edac_handler_set()) {
728 edac_atomic_assert_error();
729 return;
730 }
731#endif
732
722 if (panic_on_unrecovered_nmi) 733 if (panic_on_unrecovered_nmi)
723 panic("NMI: Not continuing"); 734 panic("NMI: Not continuing");
724 735
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index e8c4a2bedaa1..3cfd9065a9b4 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -109,15 +109,4 @@ config EDAC_I5000
109 Support for error detection and correction the Intel 109 Support for error detection and correction the Intel
110 Greekcreek/Blackford chipsets. 110 Greekcreek/Blackford chipsets.
111 111
112choice
113 prompt "Error detecting method"
114 default EDAC_POLL
115
116config EDAC_POLL
117 bool "Poll for errors"
118 help
119 Poll the chipset periodically to detect errors.
120
121endchoice
122
123endif # EDAC 112endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 773472cef76e..19d5ac724098 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -5,9 +5,9 @@
5# This file may be distributed under the terms of the 5# This file may be distributed under the terms of the
6# GNU General Public License. 6# GNU General Public License.
7# 7#
8# $Id: Makefile,v 1.4.2.3 2005/07/08 22:05:38 dsp_llnl Exp $
9 8
10 9
10obj-$(CONFIG_EDAC) := edac_stub.o
11obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o 11obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o
12 12
13edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o 13edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o
diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c
index 8bcc887692ab..f51e79a6f891 100644
--- a/drivers/edac/e752x_edac.c
+++ b/drivers/edac/e752x_edac.c
@@ -22,6 +22,7 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/pci_ids.h> 23#include <linux/pci_ids.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/edac.h>
25#include "edac_mc.h" 26#include "edac_mc.h"
26 27
27#define E752X_REVISION " Ver: 2.0.1 " __DATE__ 28#define E752X_REVISION " Ver: 2.0.1 " __DATE__
@@ -948,6 +949,16 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx)
948 debugf0("%s(): mci\n", __func__); 949 debugf0("%s(): mci\n", __func__);
949 debugf0("Starting Probe1\n"); 950 debugf0("Starting Probe1\n");
950 951
952 /* make sure error reporting method is sane */
953 switch(edac_op_state) {
954 case EDAC_OPSTATE_POLL:
955 case EDAC_OPSTATE_NMI:
956 break;
957 default:
958 edac_op_state = EDAC_OPSTATE_POLL;
959 break;
960 }
961
951 /* check to see if device 0 function 1 is enabled; if it isn't, we 962 /* check to see if device 0 function 1 is enabled; if it isn't, we
952 * assume the BIOS has reserved it for a reason and is expecting 963 * assume the BIOS has reserved it for a reason and is expecting
953 * exclusive access, we take care not to violate that assumption and 964 * exclusive access, we take care not to violate that assumption and
@@ -1123,4 +1134,5 @@ MODULE_DESCRIPTION("MC support for Intel e752x memory controllers");
1123module_param(force_function_unhide, int, 0444); 1134module_param(force_function_unhide, int, 0444);
1124MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:" 1135MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:"
1125" 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access"); 1136" 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access");
1126 1137module_param(edac_op_state, int, 0444);
1138MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c
index 310d91b41c96..0827b9a7b386 100644
--- a/drivers/edac/e7xxx_edac.c
+++ b/drivers/edac/e7xxx_edac.c
@@ -27,6 +27,7 @@
27#include <linux/pci.h> 27#include <linux/pci.h>
28#include <linux/pci_ids.h> 28#include <linux/pci_ids.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/edac.h>
30#include "edac_mc.h" 31#include "edac_mc.h"
31 32
32#define E7XXX_REVISION " Ver: 2.0.1 " __DATE__ 33#define E7XXX_REVISION " Ver: 2.0.1 " __DATE__
@@ -419,6 +420,17 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx)
419 struct e7xxx_error_info discard; 420 struct e7xxx_error_info discard;
420 421
421 debugf0("%s(): mci\n", __func__); 422 debugf0("%s(): mci\n", __func__);
423
424 /* make sure error reporting method is sane */
425 switch(edac_op_state) {
426 case EDAC_OPSTATE_POLL:
427 case EDAC_OPSTATE_NMI:
428 break;
429 default:
430 edac_op_state = EDAC_OPSTATE_POLL;
431 break;
432 }
433
422 pci_read_config_dword(pdev, E7XXX_DRC, &drc); 434 pci_read_config_dword(pdev, E7XXX_DRC, &drc);
423 435
424 drc_chan = dual_channel_active(drc, dev_idx); 436 drc_chan = dual_channel_active(drc, dev_idx);
@@ -565,3 +577,5 @@ MODULE_LICENSE("GPL");
565MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n" 577MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n"
566 "Based on.work by Dan Hollis et al"); 578 "Based on.work by Dan Hollis et al");
567MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers"); 579MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers");
580module_param(edac_op_state, int, 0444);
581MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index eae1ca1caebd..81a28d6662e4 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -27,6 +27,7 @@
27#include <linux/list.h> 27#include <linux/list.h>
28#include <linux/sysdev.h> 28#include <linux/sysdev.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/edac.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/page.h> 32#include <asm/page.h>
32#include <asm/edac.h> 33#include <asm/edac.h>
@@ -241,6 +242,7 @@ static int add_mc_to_global_list (struct mem_ctl_info *mci)
241 } 242 }
242 243
243 list_add_tail_rcu(&mci->link, insert_before); 244 list_add_tail_rcu(&mci->link, insert_before);
245 atomic_inc(&edac_handlers);
244 return 0; 246 return 0;
245 247
246fail0: 248fail0:
@@ -267,6 +269,7 @@ static void complete_mc_list_del(struct rcu_head *head)
267 269
268static void del_mc_from_global_list(struct mem_ctl_info *mci) 270static void del_mc_from_global_list(struct mem_ctl_info *mci)
269{ 271{
272 atomic_dec(&edac_handlers);
270 list_del_rcu(&mci->link); 273 list_del_rcu(&mci->link);
271 init_completion(&mci->complete); 274 init_completion(&mci->complete);
272 call_rcu(&mci->rcu, complete_mc_list_del); 275 call_rcu(&mci->rcu, complete_mc_list_del);
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 3cd3a236821c..89c96ecbf04e 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -1,6 +1,7 @@
1 1
2#include <linux/freezer.h> 2#include <linux/freezer.h>
3#include <linux/kthread.h> 3#include <linux/kthread.h>
4#include <linux/edac.h>
4 5
5#include "edac_mc.h" 6#include "edac_mc.h"
6#include "edac_module.h" 7#include "edac_module.h"
@@ -102,6 +103,25 @@ static void do_edac_check(void)
102} 103}
103 104
104/* 105/*
106 * handler for EDAC to check if NMI type handler has asserted interrupt
107 */
108static int edac_assert_error_check_and_clear(void)
109{
110 int vreg;
111
112 if(edac_op_state == EDAC_OPSTATE_POLL)
113 return 1;
114
115 vreg = atomic_read(&edac_err_assert);
116 if(vreg) {
117 atomic_set(&edac_err_assert, 0);
118 return 1;
119 }
120
121 return 0;
122}
123
124/*
105 * Action thread for EDAC to perform the POLL operations 125 * Action thread for EDAC to perform the POLL operations
106 */ 126 */
107static int edac_kernel_thread(void *arg) 127static int edac_kernel_thread(void *arg)
@@ -109,8 +129,8 @@ static int edac_kernel_thread(void *arg)
109 int msec; 129 int msec;
110 130
111 while (!kthread_should_stop()) { 131 while (!kthread_should_stop()) {
112 132 if(edac_assert_error_check_and_clear())
113 do_edac_check(); 133 do_edac_check();
114 134
115 /* goto sleep for the interval */ 135 /* goto sleep for the interval */
116 msec = (HZ * edac_get_poll_msec()) / 1000; 136 msec = (HZ * edac_get_poll_msec()) / 1000;
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
new file mode 100644
index 000000000000..91a038d2f652
--- /dev/null
+++ b/drivers/edac/edac_stub.c
@@ -0,0 +1,42 @@
1/*
2 * common EDAC components that must be in kernel
3 *
4 * Author: Dave Jiang <djiang@mvista.com>
5 *
6 * 2007 (c) MontaVista Software, Inc. This file is licensed under
7 * the terms of the GNU General Public License version 2. This program
8 * is licensed "as is" without any warranty of any kind, whether express
9 * or implied.
10 *
11 */
12#include <linux/module.h>
13#include <linux/edac.h>
14#include <asm/atomic.h>
15#include <asm/edac.h>
16
17int edac_op_state = EDAC_OPSTATE_INVAL;
18EXPORT_SYMBOL(edac_op_state);
19
20atomic_t edac_handlers = ATOMIC_INIT(0);
21EXPORT_SYMBOL(edac_handlers);
22
23atomic_t edac_err_assert = ATOMIC_INIT(0);
24EXPORT_SYMBOL(edac_err_assert);
25
26inline int edac_handler_set(void)
27{
28 if (edac_op_state == EDAC_OPSTATE_POLL)
29 return 0;
30
31 return atomic_read(&edac_handlers);
32}
33EXPORT_SYMBOL(edac_handler_set);
34
35/*
36 * handler for NMI type of interrupts to assert error
37 */
38inline void edac_atomic_assert_error(void)
39{
40 atomic_set(&edac_err_assert, 1);
41}
42EXPORT_SYMBOL(edac_atomic_assert_error);
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index 4d7e786065aa..8eb8b6e5b32c 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -19,6 +19,7 @@
19#include <linux/pci.h> 19#include <linux/pci.h>
20#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/edac.h>
22#include <asm/mmzone.h> 23#include <asm/mmzone.h>
23 24
24#include "edac_mc.h" 25#include "edac_mc.h"
@@ -1285,6 +1286,16 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
1285 if (PCI_FUNC(pdev->devfn) != 0) 1286 if (PCI_FUNC(pdev->devfn) != 0)
1286 return -ENODEV; 1287 return -ENODEV;
1287 1288
1289 /* make sure error reporting method is sane */
1290 switch(edac_op_state) {
1291 case EDAC_OPSTATE_POLL:
1292 case EDAC_OPSTATE_NMI:
1293 break;
1294 default:
1295 edac_op_state = EDAC_OPSTATE_POLL;
1296 break;
1297 }
1298
1288 /* Ask the devices for the number of CSROWS and CHANNELS so 1299 /* Ask the devices for the number of CSROWS and CHANNELS so
1289 * that we can calculate the memory resources, etc 1300 * that we can calculate the memory resources, etc
1290 * 1301 *
@@ -1475,3 +1486,5 @@ MODULE_AUTHOR
1475 ("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>"); 1486 ("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>");
1476MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - " 1487MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - "
1477 I5000_REVISION); 1488 I5000_REVISION);
1489module_param(edac_op_state, int, 0444);
1490MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/include/linux/edac.h b/include/linux/edac.h
new file mode 100644
index 000000000000..c8b92d79f884
--- /dev/null
+++ b/include/linux/edac.h
@@ -0,0 +1,29 @@
1/*
2 * Generic EDAC defs
3 *
4 * Author: Dave Jiang <djiang@mvista.com>
5 *
6 * 2006-2007 (c) MontaVista Software, Inc. This file is licensed under
7 * the terms of the GNU General Public License version 2. This program
8 * is licensed "as is" without any warranty of any kind, whether express
9 * or implied.
10 *
11 */
12#ifndef _LINUX_EDAC_H_
13#define _LINUX_EDAC_H_
14
15#include <asm/atomic.h>
16
17#define EDAC_OPSTATE_INVAL -1
18#define EDAC_OPSTATE_POLL 0
19#define EDAC_OPSTATE_NMI 1
20#define EDAC_OPSTATE_INT 2
21
22extern int edac_op_state;
23extern atomic_t edac_handlers;
24extern atomic_t edac_err_assert;
25
26extern int edac_handler_set(void);
27extern void edac_atomic_assert_error(void);
28
29#endif