aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Qiuxu Zhuo <qiuxu.zhuo@intel.com>, 2018-10-21 03:18:56 -0400
committer: Borislav Petkov <bp@suse.de>, 2018-10-25 10:59:18 -0400
commit: ad6e16059d8e00de0887885db11d87cba0bd1512 (patch)
tree: d31d3eb322c81a1b231c7e112f912926919a69b3
parent: 36168d7123311d52e085c116f6c66e16f0b84615 (diff)
EDAC, skx_edac: Add address translation for non-volatile DIMMs
Currently, this driver doesn't support address translation for non-volatile DIMMs. The ACPI ADXL DSM method provides address translation for both volatile and non-volatile DIMMs. Enable the driver to use the ACPI DSM methods if they are supported and there are non-volatile DIMMs populated on the system.

Co-developed-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Mauro Carvalho Chehab <mchehab@kernel.org>
CC: arozansk@redhat.com
CC: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1540106336-5212-1-git-send-email-qiuxu.zhuo@intel.com
-rw-r--r-- drivers/edac/Kconfig 1
-rw-r--r-- drivers/edac/skx_edac.c 193
2 files changed, 181 insertions, 13 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 57304b2e989f..ffd349c12479 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -234,6 +234,7 @@ config EDAC_SKX
234 depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG 234 depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
235 depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y 235 depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y
236 select DMI 236 select DMI
237 select ACPI_ADXL
237 help 238 help
238 Support for error detection and correction the Intel 239 Support for error detection and correction the Intel
239 Skylake server Integrated Memory Controllers. If your 240 Skylake server Integrated Memory Controllers. If your
diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c
index dd209e0dd9ab..a99ea61dad32 100644
--- a/drivers/edac/skx_edac.c
+++ b/drivers/edac/skx_edac.c
@@ -26,6 +26,7 @@
26#include <linux/bitmap.h> 26#include <linux/bitmap.h>
27#include <linux/math64.h> 27#include <linux/math64.h>
28#include <linux/mod_devicetable.h> 28#include <linux/mod_devicetable.h>
29#include <linux/adxl.h>
29#include <acpi/nfit.h> 30#include <acpi/nfit.h>
30#include <asm/cpu_device_id.h> 31#include <asm/cpu_device_id.h>
31#include <asm/intel-family.h> 32#include <asm/intel-family.h>
@@ -35,6 +36,7 @@
35#include "edac_module.h" 36#include "edac_module.h"
36 37
37#define EDAC_MOD_STR "skx_edac" 38#define EDAC_MOD_STR "skx_edac"
39#define MSG_SIZE 1024
38 40
39/* 41/*
40 * Debug macros 42 * Debug macros
@@ -54,6 +56,29 @@
54static LIST_HEAD(skx_edac_list); 56static LIST_HEAD(skx_edac_list);
55 57
56static u64 skx_tolm, skx_tohm; 58static u64 skx_tolm, skx_tohm;
59static char *skx_msg;
60static unsigned int nvdimm_count;
61
62enum {
63 INDEX_SOCKET,
64 INDEX_MEMCTRL,
65 INDEX_CHANNEL,
66 INDEX_DIMM,
67 INDEX_MAX
68};
69
70static const char * const component_names[] = {
71 [INDEX_SOCKET] = "ProcessorSocketId",
72 [INDEX_MEMCTRL] = "MemoryControllerId",
73 [INDEX_CHANNEL] = "ChannelId",
74 [INDEX_DIMM] = "DimmSlotId",
75};
76
77static int component_indices[ARRAY_SIZE(component_names)];
78static int adxl_component_count;
79static const char * const *adxl_component_names;
80static u64 *adxl_values;
81static char *adxl_msg;
57 82
58#define NUM_IMC 2 /* memory controllers per socket */ 83#define NUM_IMC 2 /* memory controllers per socket */
59#define NUM_CHANNELS 3 /* channels per memory controller */ 84#define NUM_CHANNELS 3 /* channels per memory controller */
@@ -393,6 +418,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
393 u16 flags; 418 u16 flags;
394 u64 size = 0; 419 u64 size = 0;
395 420
421 nvdimm_count++;
422
396 dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc, 423 dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
397 imc->src_id, 0); 424 imc->src_id, 0);
398 425
@@ -941,12 +968,46 @@ static void teardown_skx_debug(void)
941} 968}
942#endif /*CONFIG_EDAC_DEBUG*/ 969#endif /*CONFIG_EDAC_DEBUG*/
943 970
971static bool skx_adxl_decode(struct decoded_addr *res)
972
973{
974 int i, len = 0;
975
976 if (res->addr >= skx_tohm || (res->addr >= skx_tolm &&
977 res->addr < BIT_ULL(32))) {
978 edac_dbg(0, "Address 0x%llx out of range\n", res->addr);
979 return false;
980 }
981
982 if (adxl_decode(res->addr, adxl_values)) {
983 edac_dbg(0, "Failed to decode 0x%llx\n", res->addr);
984 return false;
985 }
986
987 res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
988 res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
989 res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
990 res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]];
991
992 for (i = 0; i < adxl_component_count; i++) {
993 if (adxl_values[i] == ~0x0ull)
994 continue;
995
996 len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx",
997 adxl_component_names[i], adxl_values[i]);
998 if (MSG_SIZE - len <= 0)
999 break;
1000 }
1001
1002 return true;
1003}
1004
944static void skx_mce_output_error(struct mem_ctl_info *mci, 1005static void skx_mce_output_error(struct mem_ctl_info *mci,
945 const struct mce *m, 1006 const struct mce *m,
946 struct decoded_addr *res) 1007 struct decoded_addr *res)
947{ 1008{
948 enum hw_event_mc_err_type tp_event; 1009 enum hw_event_mc_err_type tp_event;
949 char *type, *optype, msg[256]; 1010 char *type, *optype;
950 bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); 1011 bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
951 bool overflow = GET_BITFIELD(m->status, 62, 62); 1012 bool overflow = GET_BITFIELD(m->status, 62, 62);
952 bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); 1013 bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@@ -1007,22 +1068,47 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
1007 break; 1068 break;
1008 } 1069 }
1009 } 1070 }
1071 if (adxl_component_count) {
1072 snprintf(skx_msg, MSG_SIZE, "%s%s err_code:%04x:%04x %s",
1073 overflow ? " OVERFLOW" : "",
1074 (uncorrected_error && recoverable) ? " recoverable" : "",
1075 mscod, errcode, adxl_msg);
1076 } else {
1077 snprintf(skx_msg, MSG_SIZE,
1078 "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
1079 overflow ? " OVERFLOW" : "",
1080 (uncorrected_error && recoverable) ? " recoverable" : "",
1081 mscod, errcode,
1082 res->socket, res->imc, res->rank,
1083 res->bank_group, res->bank_address, res->row, res->column);
1084 }
1010 1085
1011 snprintf(msg, sizeof(msg), 1086 edac_dbg(0, "%s\n", skx_msg);
1012 "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
1013 overflow ? " OVERFLOW" : "",
1014 (uncorrected_error && recoverable) ? " recoverable" : "",
1015 mscod, errcode,
1016 res->socket, res->imc, res->rank,
1017 res->bank_group, res->bank_address, res->row, res->column);
1018
1019 edac_dbg(0, "%s\n", msg);
1020 1087
1021 /* Call the helper to output message */ 1088 /* Call the helper to output message */
1022 edac_mc_handle_error(tp_event, mci, core_err_cnt, 1089 edac_mc_handle_error(tp_event, mci, core_err_cnt,
1023 m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, 1090 m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
1024 res->channel, res->dimm, -1, 1091 res->channel, res->dimm, -1,
1025 optype, msg); 1092 optype, skx_msg);
1093}
1094
1095static struct mem_ctl_info *get_mci(int src_id, int lmc)
1096{
1097 struct skx_dev *d;
1098
1099 if (lmc > NUM_IMC - 1) {
1100 skx_printk(KERN_ERR, "Bad lmc %d\n", lmc);
1101 return NULL;
1102 }
1103
1104 list_for_each_entry(d, &skx_edac_list, list) {
1105 if (d->imc[0].src_id == src_id)
1106 return d->imc[lmc].mci;
1107 }
1108
1109 skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc);
1110
1111 return NULL;
1026} 1112}
1027 1113
1028static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, 1114static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
@@ -1040,10 +1126,23 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
1040 if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) 1126 if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
1041 return NOTIFY_DONE; 1127 return NOTIFY_DONE;
1042 1128
1129 memset(&res, 0, sizeof(res));
1043 res.addr = mce->addr; 1130 res.addr = mce->addr;
1044 if (!skx_decode(&res)) 1131
1132 if (adxl_component_count) {
1133 if (!skx_adxl_decode(&res))
1134 return NOTIFY_DONE;
1135
1136 mci = get_mci(res.socket, res.imc);
1137 } else {
1138 if (!skx_decode(&res))
1139 return NOTIFY_DONE;
1140
1141 mci = res.dev->imc[res.imc].mci;
1142 }
1143
1144 if (!mci)
1045 return NOTIFY_DONE; 1145 return NOTIFY_DONE;
1046 mci = res.dev->imc[res.imc].mci;
1047 1146
1048 if (mce->mcgstatus & MCG_STATUS_MCIP) 1147 if (mce->mcgstatus & MCG_STATUS_MCIP)
1049 type = "Exception"; 1148 type = "Exception";
@@ -1094,6 +1193,62 @@ static void skx_remove(void)
1094 } 1193 }
1095} 1194}
1096 1195
1196static void __init skx_adxl_get(void)
1197{
1198 const char * const *names;
1199 int i, j;
1200
1201 names = adxl_get_component_names();
1202 if (!names) {
1203 skx_printk(KERN_NOTICE, "No firmware support for address translation.");
1204 skx_printk(KERN_CONT, " Only decoding DDR4 address!\n");
1205 return;
1206 }
1207
1208 for (i = 0; i < INDEX_MAX; i++) {
1209 for (j = 0; names[j]; j++) {
1210 if (!strcmp(component_names[i], names[j])) {
1211 component_indices[i] = j;
1212 break;
1213 }
1214 }
1215
1216 if (!names[j])
1217 goto err;
1218 }
1219
1220 adxl_component_names = names;
1221 while (*names++)
1222 adxl_component_count++;
1223
1224 adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values),
1225 GFP_KERNEL);
1226 if (!adxl_values) {
1227 adxl_component_count = 0;
1228 return;
1229 }
1230
1231 adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
1232 if (!adxl_msg) {
1233 adxl_component_count = 0;
1234 kfree(adxl_values);
1235 }
1236
1237 return;
1238err:
1239 skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
1240 component_names[i]);
1241 for (j = 0; names[j]; j++)
1242 skx_printk(KERN_CONT, "%s ", names[j]);
1243 skx_printk(KERN_CONT, "\n");
1244}
1245
1246static void __exit skx_adxl_put(void)
1247{
1248 kfree(adxl_values);
1249 kfree(adxl_msg);
1250}
1251
1097/* 1252/*
1098 * skx_init: 1253 * skx_init:
1099 * make sure we are running on the correct cpu model 1254 * make sure we are running on the correct cpu model
@@ -1158,6 +1313,15 @@ static int __init skx_init(void)
1158 } 1313 }
1159 } 1314 }
1160 1315
1316 skx_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
1317 if (!skx_msg) {
1318 rc = -ENOMEM;
1319 goto fail;
1320 }
1321
1322 if (nvdimm_count)
1323 skx_adxl_get();
1324
1161 /* Ensure that the OPSTATE is set correctly for POLL or NMI */ 1325 /* Ensure that the OPSTATE is set correctly for POLL or NMI */
1162 opstate_init(); 1326 opstate_init();
1163 1327
@@ -1176,6 +1340,9 @@ static void __exit skx_exit(void)
1176 edac_dbg(2, "\n"); 1340 edac_dbg(2, "\n");
1177 mce_unregister_decode_chain(&skx_mce_dec); 1341 mce_unregister_decode_chain(&skx_mce_dec);
1178 skx_remove(); 1342 skx_remove();
1343 if (nvdimm_count)
1344 skx_adxl_put();
1345 kfree(skx_msg);
1179 teardown_skx_debug(); 1346 teardown_skx_debug();
1180} 1347}
1181 1348