aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Qiuxu Zhuo <qiuxu.zhuo@intel.com> 2017-07-30 14:06:51 -0400
committer: Borislav Petkov <bp@suse.de> 2017-08-01 23:40:11 -0400
commit: 039d7af651ba414b468bbb22d5d1a76169f81c0d (patch)
tree: 5edb8e2e98131ad1c0eba63ffac9e4a245cf50f5
parent: 2efdda4a41086aaf62a1a89e898514d674a8b727 (diff)
EDAC, sb_edac: Classify memory mirroring modes
There are two memory mirroring modes: full memory mirroring and address range partial memory mirroring (the latter is supported by Haswell EX and Broadwell EX).

a) In full memory mirroring, the memory behind each memory controller is mirrored, i.e. the memory is split into two identical mirrors (primary and secondary), so half of the memory is reserved for redundancy.

b) In address range partial memory mirroring, the memory size (range) of the primary and secondary mirrors behind each memory controller can be user-defined through the TAD0 register. The remaining memory ranges, defined by TAD1/TAD2/... in that memory controller, are non-mirrored.

For more detail on memory mirroring, see the following article written by Tony Luck:

https://01.org/lkp/blogs/tonyluck/2016/address-range-partial-memory-mirroring-linux

Currently the sb_edac driver only supports address decoding in full memory mirroring and non-mirroring modes. In address range partial memory mirroring mode, it may fail to decode an address that falls in a non-mirrored area. The following log shows one such failure:

mce: Uncorrected hardware memory error in user-access at 566d53a400
Memory failure: 0x566d53a: Killing einj_mem_uc:4647 due to hardware memory corruption
Memory failure: 0x566d53a: recovery action for dirty LRU page: Recovered
mce: [Hardware Error]: Machine check events logged
EDAC sbridge MC1: HANDLING MCE MEMORY ERROR
EDAC sbridge MC1: CPU 48: Machine Check Event: 0 Bank 7: ec00000000010090
EDAC sbridge MC1: TSC 4b914aa5a99dab
EDAC sbridge MC1: ADDR 566d53a400
EDAC sbridge MC1: MISC 1443a0c86
EDAC sbridge MC1: PROCESSOR 0:406f1 TIME 1499712764 SOCKET 2 APIC 80
EDAC MC1: 0 UE Can't discover the memory rank for ch addr 0x7fb54e900 on any memory ( page:0x0 offset:0x0 grain:32)
mce: [Hardware Error]: Machine check events logged

Therefore, classify the memory mirroring modes and make address decoding correct in address range partial memory mirroring mode.
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170730180651.30060-1-qiuxu.zhuo@intel.com
Signed-off-by: Borislav Petkov <bp@suse.de>
-rw-r--r-- drivers/edac/sb_edac.c 63
1 files changed, 47 insertions, 16 deletions
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 687d0f23b9cc..dc0591654011 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -300,6 +300,12 @@ enum domain {
300 SOCK, 300 SOCK,
301}; 301};
302 302
303enum mirroring_mode {
304 NON_MIRRORING,
305 ADDR_RANGE_MIRRORING,
306 FULL_MIRRORING,
307};
308
303struct sbridge_pvt; 309struct sbridge_pvt;
304struct sbridge_info { 310struct sbridge_info {
305 enum type type; 311 enum type type;
@@ -377,8 +383,9 @@ struct sbridge_pvt {
377 struct sbridge_channel channel[NUM_CHANNELS]; 383 struct sbridge_channel channel[NUM_CHANNELS];
378 384
379 /* Memory type detection */ 385 /* Memory type detection */
380 bool is_mirrored, is_lockstep, is_close_pg; 386 bool is_cur_addr_mirrored, is_lockstep, is_close_pg;
381 bool is_chan_hash; 387 bool is_chan_hash;
388 enum mirroring_mode mirror_mode;
382 389
383 /* Memory description */ 390 /* Memory description */
384 u64 tolm, tohm; 391 u64 tolm, tohm;
@@ -1648,10 +1655,6 @@ static int get_dimm_config(struct mem_ctl_info *mci)
1648 enum edac_type mode; 1655 enum edac_type mode;
1649 u32 reg; 1656 u32 reg;
1650 1657
1651 if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) {
1652 pci_read_config_dword(pvt->pci_ha, HASWELL_HASYSDEFEATURE2, &reg);
1653 pvt->is_chan_hash = GET_BITFIELD(reg, 21, 21);
1654 }
1655 pvt->sbridge_dev->node_id = pvt->info.get_node_id(pvt); 1658 pvt->sbridge_dev->node_id = pvt->info.get_node_id(pvt);
1656 edac_dbg(0, "mc#%d: Node ID: %d, source ID: %d\n", 1659 edac_dbg(0, "mc#%d: Node ID: %d, source ID: %d\n",
1657 pvt->sbridge_dev->mc, 1660 pvt->sbridge_dev->mc,
@@ -1663,22 +1666,45 @@ static int get_dimm_config(struct mem_ctl_info *mci)
1663 */ 1666 */
1664 if (pvt->info.type == KNIGHTS_LANDING) { 1667 if (pvt->info.type == KNIGHTS_LANDING) {
1665 mode = EDAC_S4ECD4ED; 1668 mode = EDAC_S4ECD4ED;
1666 pvt->is_mirrored = false; 1669 pvt->mirror_mode = NON_MIRRORING;
1670 pvt->is_cur_addr_mirrored = false;
1667 1671
1668 if (knl_get_dimm_capacity(pvt, knl_mc_sizes) != 0) 1672 if (knl_get_dimm_capacity(pvt, knl_mc_sizes) != 0)
1669 return -1; 1673 return -1;
1670 pci_read_config_dword(pvt->pci_ta, KNL_MCMTR, &pvt->info.mcmtr); 1674 if (pci_read_config_dword(pvt->pci_ta, KNL_MCMTR, &pvt->info.mcmtr)) {
1675 edac_dbg(0, "Failed to read KNL_MCMTR register\n");
1676 return -ENODEV;
1677 }
1671 } else { 1678 } else {
1672 pci_read_config_dword(pvt->pci_ras, RASENABLES, &reg); 1679 if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) {
1680 if (pci_read_config_dword(pvt->pci_ha, HASWELL_HASYSDEFEATURE2, &reg)) {
1681 edac_dbg(0, "Failed to read HASWELL_HASYSDEFEATURE2 register\n");
1682 return -ENODEV;
1683 }
1684 pvt->is_chan_hash = GET_BITFIELD(reg, 21, 21);
1685 if (GET_BITFIELD(reg, 28, 28)) {
1686 pvt->mirror_mode = ADDR_RANGE_MIRRORING;
1687 edac_dbg(0, "Address range partial memory mirroring is enabled\n");
1688 goto next;
1689 }
1690 }
1691 if (pci_read_config_dword(pvt->pci_ras, RASENABLES, &reg)) {
1692 edac_dbg(0, "Failed to read RASENABLES register\n");
1693 return -ENODEV;
1694 }
1673 if (IS_MIRROR_ENABLED(reg)) { 1695 if (IS_MIRROR_ENABLED(reg)) {
1674 edac_dbg(0, "Memory mirror is enabled\n"); 1696 pvt->mirror_mode = FULL_MIRRORING;
1675 pvt->is_mirrored = true; 1697 edac_dbg(0, "Full memory mirroring is enabled\n");
1676 } else { 1698 } else {
1677 edac_dbg(0, "Memory mirror is disabled\n"); 1699 pvt->mirror_mode = NON_MIRRORING;
1678 pvt->is_mirrored = false; 1700 edac_dbg(0, "Memory mirroring is disabled\n");
1679 } 1701 }
1680 1702
1681 pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr); 1703next:
1704 if (pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr)) {
1705 edac_dbg(0, "Failed to read MCMTR register\n");
1706 return -ENODEV;
1707 }
1682 if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) { 1708 if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) {
1683 edac_dbg(0, "Lockstep is enabled\n"); 1709 edac_dbg(0, "Lockstep is enabled\n");
1684 mode = EDAC_S8ECD8ED; 1710 mode = EDAC_S8ECD8ED;
@@ -2092,7 +2118,8 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
2092 2118
2093 pci_read_config_dword(pvt->pci_tad[base_ch], tad_ch_nilv_offset[n_tads], &tad_offset); 2119 pci_read_config_dword(pvt->pci_tad[base_ch], tad_ch_nilv_offset[n_tads], &tad_offset);
2094 2120
2095 if (pvt->is_mirrored) { 2121 if (pvt->mirror_mode == FULL_MIRRORING ||
2122 (pvt->mirror_mode == ADDR_RANGE_MIRRORING && n_tads == 0)) {
2096 *channel_mask |= 1 << ((base_ch + 2) % 4); 2123 *channel_mask |= 1 << ((base_ch + 2) % 4);
2097 switch(ch_way) { 2124 switch(ch_way) {
2098 case 2: 2125 case 2:
@@ -2103,8 +2130,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
2103 sprintf(msg, "Invalid mirror set. Can't decode addr"); 2130 sprintf(msg, "Invalid mirror set. Can't decode addr");
2104 return -EINVAL; 2131 return -EINVAL;
2105 } 2132 }
2106 } else 2133
2134 pvt->is_cur_addr_mirrored = true;
2135 } else {
2107 sck_xch = (1 << sck_way) * ch_way; 2136 sck_xch = (1 << sck_way) * ch_way;
2137 pvt->is_cur_addr_mirrored = false;
2138 }
2108 2139
2109 if (pvt->is_lockstep) 2140 if (pvt->is_lockstep)
2110 *channel_mask |= 1 << ((base_ch + 1) % 4); 2141 *channel_mask |= 1 << ((base_ch + 1) % 4);
@@ -2967,7 +2998,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
2967 * EDAC core should be handling the channel mask, in order to point 2998 * EDAC core should be handling the channel mask, in order to point
2968 * to the group of dimm's where the error may be happening. 2999 * to the group of dimm's where the error may be happening.
2969 */ 3000 */
2970 if (!pvt->is_lockstep && !pvt->is_mirrored && !pvt->is_close_pg) 3001 if (!pvt->is_lockstep && !pvt->is_cur_addr_mirrored && !pvt->is_close_pg)
2971 channel = first_channel; 3002 channel = first_channel;
2972 3003
2973 snprintf(msg, sizeof(msg), 3004 snprintf(msg, sizeof(msg),