diff options
author | Qiuxu Zhuo <qiuxu.zhuo@intel.com> | 2017-09-13 06:42:14 -0400 |
---|---|---|
committer | Borislav Petkov <bp@suse.de> | 2017-09-27 06:15:43 -0400 |
commit | 15cc3ae001873845b5d842e212478a6570c7d938 (patch) | |
tree | 42bf230985ce374df078317918e68c804f8f12d1 | |
parent | 301375e764324b8048704eaf2c46fe1ee290830e (diff) |
EDAC, sb_edac: Don't create a second memory controller if HA1 is not present
Yi Zhang reported the following failure on a 2-socket Haswell (E5-2603v3)
server (DELL PowerEdge 730xd):
EDAC sbridge: Some needed devices are missing
EDAC MC: Removed device 0 for sb_edac.c Haswell SrcID#0_Ha#0: DEV 0000:7f:12.0
EDAC MC: Removed device 1 for sb_edac.c Haswell SrcID#1_Ha#0: DEV 0000:ff:12.0
EDAC sbridge: Couldn't find mci handler
EDAC sbridge: Couldn't find mci handler
EDAC sbridge: Failed to register device with error -19.
The refactored sb_edac driver creates the IMC1 (the 2nd memory
controller) if any IMC1 device is present. In this case only
HA1_TA of IMC1 was present, but the driver expected to find
HA1/HA1_TM/HA1_TAD[0-3] devices too, leading to the above failure.
The document [1] says the 'E5-2603 v3' CPU has 4 memory channels max. Yi
Zhang inserted one DIMM per channel for each CPU, and did random error
address injection test with this patch:
4024 addresses fell in TOLM hole area
12715 addresses fell in CPU_SrcID#0_Ha#0_Chan#0_DIMM#0
12774 addresses fell in CPU_SrcID#0_Ha#0_Chan#1_DIMM#0
12798 addresses fell in CPU_SrcID#0_Ha#0_Chan#2_DIMM#0
12913 addresses fell in CPU_SrcID#0_Ha#0_Chan#3_DIMM#0
12674 addresses fell in CPU_SrcID#1_Ha#0_Chan#0_DIMM#0
12686 addresses fell in CPU_SrcID#1_Ha#0_Chan#1_DIMM#0
12882 addresses fell in CPU_SrcID#1_Ha#0_Chan#2_DIMM#0
12934 addresses fell in CPU_SrcID#1_Ha#0_Chan#3_DIMM#0
106400 addresses were injected totally.
The test result shows that all the 4 channels belong to IMC0 per CPU, so
the server really only has one IMC per CPU.
In the 1st page of chapter 2 in datasheet [2], it also says 'E5-2600 v3'
implements either one or two IMCs. For CPUs with one IMC, IMC1 is not
used and should be ignored.
Thus, do not create a second memory controller if the key HA1 is absent.
[1] http://ark.intel.com/products/83349/Intel-Xeon-Processor-E5-2603-v3-15M-Cache-1_60-GHz
[2] https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf
Reported-and-tested-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Fixes: e2f747b1f42a ("EDAC, sb_edac: Assign EDAC memory controller per h/w controller")
Link: http://lkml.kernel.org/r/20170913104214.7325-1-qiuxu.zhuo@intel.com
[ Massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
-rw-r--r-- | drivers/edac/sb_edac.c | 9 |
1 files changed, 8 insertions, 1 deletions
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 3ea90fc5978b..54d9d179cb77 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c | |||
@@ -462,6 +462,7 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = { | |||
462 | static const struct pci_id_descr pci_dev_descr_ibridge[] = { | 462 | static const struct pci_id_descr pci_dev_descr_ibridge[] = { |
463 | /* Processor Home Agent */ | 463 | /* Processor Home Agent */ |
464 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0, 0, IMC0) }, | 464 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0, 0, IMC0) }, |
465 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) }, | ||
465 | 466 | ||
466 | /* Memory controller */ | 467 | /* Memory controller */ |
467 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA, 0, IMC0) }, | 468 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA, 0, IMC0) }, |
@@ -472,7 +473,6 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = { | |||
472 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3, 0, IMC0) }, | 473 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3, 0, IMC0) }, |
473 | 474 | ||
474 | /* Optional, mode 2HA */ | 475 | /* Optional, mode 2HA */ |
475 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) }, | ||
476 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA, 1, IMC1) }, | 476 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA, 1, IMC1) }, |
477 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS, 1, IMC1) }, | 477 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS, 1, IMC1) }, |
478 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1, IMC1) }, | 478 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1, IMC1) }, |
@@ -2291,6 +2291,13 @@ static int sbridge_get_onedevice(struct pci_dev **prev, | |||
2291 | next_imc: | 2291 | next_imc: |
2292 | sbridge_dev = get_sbridge_dev(bus, dev_descr->dom, multi_bus, sbridge_dev); | 2292 | sbridge_dev = get_sbridge_dev(bus, dev_descr->dom, multi_bus, sbridge_dev); |
2293 | if (!sbridge_dev) { | 2293 | if (!sbridge_dev) { |
2294 | /* If the HA1 wasn't found, don't create EDAC second memory controller */ | ||
2295 | if (dev_descr->dom == IMC1 && devno != 1) { | ||
2296 | edac_dbg(0, "Skip IMC1: %04x:%04x (since HA1 was absent)\n", | ||
2297 | PCI_VENDOR_ID_INTEL, dev_descr->dev_id); | ||
2298 | pci_dev_put(pdev); | ||
2299 | return 0; | ||
2300 | } | ||
2294 | 2301 | ||
2295 | if (dev_descr->dom == SOCK) | 2302 | if (dev_descr->dom == SOCK) |
2296 | goto out_imc; | 2303 | goto out_imc; |