about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/edac
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-03-08 12:11:39 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-03-08 12:11:39 -0500
commit: e13284da944df29ab08e8a9d2a50fc0ad1d858ab (patch)
tree: 8e6e2580d27cf4fe5f712e0857dc495aa52fd27e /drivers/edac
parent: 1b37b8c48d2c2d8553f116ec2a75d21056f1fb35 (diff)
parent: 41f035a86b5b72a4f947c38e94239d20d595352a (diff)
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Borislav Petkov:
 "This time around we have in store:

   - Disable MC4_MISC thresholding banks on all AMD family 0x15 models
     (Shirish S)

   - AMD MCE error descriptions update and error decode improvements
     (Yazen Ghannam)

   - The usual smaller conversions and fixes"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Improve error message when kernel cannot recover, p2
  EDAC/mce_amd: Decode MCA_STATUS in bit definition order
  EDAC/mce_amd: Decode MCA_STATUS[Scrub] bit
  EDAC, mce_amd: Print ExtErrorCode and description on a single line
  EDAC, mce_amd: Match error descriptions to latest documentation
  x86/MCE/AMD, EDAC/mce_amd: Add new error descriptions for some SMCA bank types
  x86/MCE/AMD, EDAC/mce_amd: Add new McaTypes for CS, PSP, and SMU units
  x86/MCE/AMD, EDAC/mce_amd: Add new MP5, NBIO, and PCIE SMCA bank types
  RAS: Add a MAINTAINERS entry
  RAS: Use consistent types for UUIDs
  x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
  x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
  x86/MCE: Switch to use the new generic UUID API
Diffstat (limited to 'drivers/edac')
-rw-r--r--  drivers/edac/mce_amd.c  291
1 files changed, 192 insertions, 99 deletions
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index c605089d899f..0a1814dad6cf 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -151,138 +151,223 @@ static const char * const mc6_mce_desc[] = {
151 151
152/* Scalable MCA error strings */ 152/* Scalable MCA error strings */
153static const char * const smca_ls_mce_desc[] = { 153static const char * const smca_ls_mce_desc[] = {
154 "Load queue parity", 154 "Load queue parity error",
155 "Store queue parity", 155 "Store queue parity error",
156 "Miss address buffer payload parity", 156 "Miss address buffer payload parity error",
157 "L1 TLB parity", 157 "Level 1 TLB parity error",
158 "Reserved", 158 "DC Tag error type 5",
159 "DC tag error type 6", 159 "DC Tag error type 6",
160 "DC tag error type 1", 160 "DC Tag error type 1",
161 "Internal error type 1", 161 "Internal error type 1",
162 "Internal error type 2", 162 "Internal error type 2",
163 "Sys Read data error thread 0", 163 "System Read Data Error Thread 0",
164 "Sys read data error thread 1", 164 "System Read Data Error Thread 1",
165 "DC tag error type 2", 165 "DC Tag error type 2",
166 "DC data error type 1 (poison consumption)", 166 "DC Data error type 1 and poison consumption",
167 "DC data error type 2", 167 "DC Data error type 2",
168 "DC data error type 3", 168 "DC Data error type 3",
169 "DC tag error type 4", 169 "DC Tag error type 4",
170 "L2 TLB parity", 170 "Level 2 TLB parity error",
171 "PDC parity error", 171 "PDC parity error",
172 "DC tag error type 3", 172 "DC Tag error type 3",
173 "DC tag error type 5", 173 "DC Tag error type 5",
174 "L2 fill data error", 174 "L2 Fill Data error",
175}; 175};
176 176
177static const char * const smca_if_mce_desc[] = { 177static const char * const smca_if_mce_desc[] = {
178 "microtag probe port parity error", 178 "Op Cache Microtag Probe Port Parity Error",
179 "IC microtag or full tag multi-hit error", 179 "IC Microtag or Full Tag Multi-hit Error",
180 "IC full tag parity", 180 "IC Full Tag Parity Error",
181 "IC data array parity", 181 "IC Data Array Parity Error",
182 "Decoupling queue phys addr parity error", 182 "Decoupling Queue PhysAddr Parity Error",
183 "L0 ITLB parity error", 183 "L0 ITLB Parity Error",
184 "L1 ITLB parity error", 184 "L1 ITLB Parity Error",
185 "L2 ITLB parity error", 185 "L2 ITLB Parity Error",
186 "BPQ snoop parity on Thread 0", 186 "BPQ Thread 0 Snoop Parity Error",
187 "BPQ snoop parity on Thread 1", 187 "BPQ Thread 1 Snoop Parity Error",
188 "L1 BTB multi-match error", 188 "L1 BTB Multi-Match Error",
189 "L2 BTB multi-match error", 189 "L2 BTB Multi-Match Error",
190 "L2 Cache Response Poison error", 190 "L2 Cache Response Poison Error",
191 "System Read Data error", 191 "System Read Data Error",
192}; 192};
193 193
194static const char * const smca_l2_mce_desc[] = { 194static const char * const smca_l2_mce_desc[] = {
195 "L2M tag multi-way-hit error", 195 "L2M Tag Multiple-Way-Hit error",
196 "L2M tag ECC error", 196 "L2M Tag or State Array ECC Error",
197 "L2M data ECC error", 197 "L2M Data Array ECC Error",
198 "HW assert", 198 "Hardware Assert Error",
199}; 199};
200 200
201static const char * const smca_de_mce_desc[] = { 201static const char * const smca_de_mce_desc[] = {
202 "uop cache tag parity error", 202 "Micro-op cache tag parity error",
203 "uop cache data parity error", 203 "Micro-op cache data parity error",
204 "Insn buffer parity error", 204 "Instruction buffer parity error",
205 "uop queue parity error", 205 "Micro-op queue parity error",
206 "Insn dispatch queue parity error", 206 "Instruction dispatch queue parity error",
207 "Fetch address FIFO parity", 207 "Fetch address FIFO parity error",
208 "Patch RAM data parity", 208 "Patch RAM data parity error",
209 "Patch RAM sequencer parity", 209 "Patch RAM sequencer parity error",
210 "uop buffer parity" 210 "Micro-op buffer parity error"
211}; 211};
212 212
213static const char * const smca_ex_mce_desc[] = { 213static const char * const smca_ex_mce_desc[] = {
214 "Watchdog timeout error", 214 "Watchdog Timeout error",
215 "Phy register file parity", 215 "Physical register file parity error",
216 "Flag register file parity", 216 "Flag register file parity error",
217 "Immediate displacement register file parity", 217 "Immediate displacement register file parity error",
218 "Address generator payload parity", 218 "Address generator payload parity error",
219 "EX payload parity", 219 "EX payload parity error",
220 "Checkpoint queue parity", 220 "Checkpoint queue parity error",
221 "Retire dispatch queue parity", 221 "Retire dispatch queue parity error",
222 "Retire status queue parity error", 222 "Retire status queue parity error",
223 "Scheduling queue parity error", 223 "Scheduling queue parity error",
224 "Branch buffer queue parity error", 224 "Branch buffer queue parity error",
225 "Hardware Assertion error",
225}; 226};
226 227
227static const char * const smca_fp_mce_desc[] = { 228static const char * const smca_fp_mce_desc[] = {
228 "Physical register file parity", 229 "Physical register file (PRF) parity error",
229 "Freelist parity error", 230 "Freelist (FL) parity error",
230 "Schedule queue parity", 231 "Schedule queue parity error",
231 "NSQ parity error", 232 "NSQ parity error",
232 "Retire queue parity", 233 "Retire queue (RQ) parity error",
233 "Status register file parity", 234 "Status register file (SRF) parity error",
234 "Hardware assertion", 235 "Hardware assertion",
235}; 236};
236 237
237static const char * const smca_l3_mce_desc[] = { 238static const char * const smca_l3_mce_desc[] = {
238 "Shadow tag macro ECC error", 239 "Shadow Tag Macro ECC Error",
239 "Shadow tag macro multi-way-hit error", 240 "Shadow Tag Macro Multi-way-hit Error",
240 "L3M tag ECC error", 241 "L3M Tag ECC Error",
241 "L3M tag multi-way-hit error", 242 "L3M Tag Multi-way-hit Error",
242 "L3M data ECC error", 243 "L3M Data ECC Error",
243 "XI parity, L3 fill done channel error", 244 "SDP Parity Error or SystemReadDataError from XI",
244 "L3 victim queue parity", 245 "L3 Victim Queue Parity Error",
245 "L3 HW assert", 246 "L3 Hardware Assertion",
246}; 247};
247 248
248static const char * const smca_cs_mce_desc[] = { 249static const char * const smca_cs_mce_desc[] = {
249 "Illegal request from transport layer", 250 "Illegal Request",
250 "Address violation", 251 "Address Violation",
251 "Security violation", 252 "Security Violation",
252 "Illegal response from transport layer", 253 "Illegal Response",
253 "Unexpected response", 254 "Unexpected Response",
254 "Parity error on incoming request or probe response data", 255 "Request or Probe Parity Error",
255 "Parity error on incoming read response data", 256 "Read Response Parity Error",
256 "Atomic request parity", 257 "Atomic Request Parity Error",
257 "ECC error on probe filter access", 258 "Probe Filter ECC Error",
259};
260
261static const char * const smca_cs2_mce_desc[] = {
262 "Illegal Request",
263 "Address Violation",
264 "Security Violation",
265 "Illegal Response",
266 "Unexpected Response",
267 "Request or Probe Parity Error",
268 "Read Response Parity Error",
269 "Atomic Request Parity Error",
270 "SDP read response had no match in the CS queue",
271 "Probe Filter Protocol Error",
272 "Probe Filter ECC Error",
273 "SDP read response had an unexpected RETRY error",
274 "Counter overflow error",
275 "Counter underflow error",
258}; 276};
259 277
260static const char * const smca_pie_mce_desc[] = { 278static const char * const smca_pie_mce_desc[] = {
261 "HW assert", 279 "Hardware Assert",
262 "Internal PIE register security violation", 280 "Register security violation",
263 "Error on GMI link", 281 "Link Error",
264 "Poison data written to internal PIE register", 282 "Poison data consumption",
283 "A deferred error was detected in the DF"
265}; 284};
266 285
267static const char * const smca_umc_mce_desc[] = { 286static const char * const smca_umc_mce_desc[] = {
268 "DRAM ECC error", 287 "DRAM ECC error",
269 "Data poison error on DRAM", 288 "Data poison error",
270 "SDP parity error", 289 "SDP parity error",
271 "Advanced peripheral bus error", 290 "Advanced peripheral bus error",
272 "Command/address parity error", 291 "Address/Command parity error",
273 "Write data CRC error", 292 "Write data CRC error",
293 "DCQ SRAM ECC error",
294 "AES SRAM ECC error",
274}; 295};
275 296
276static const char * const smca_pb_mce_desc[] = { 297static const char * const smca_pb_mce_desc[] = {
277 "Parameter Block RAM ECC error", 298 "An ECC error in the Parameter Block RAM array",
278}; 299};
279 300
280static const char * const smca_psp_mce_desc[] = { 301static const char * const smca_psp_mce_desc[] = {
281 "PSP RAM ECC or parity error", 302 "An ECC or parity error in a PSP RAM instance",
303};
304
305static const char * const smca_psp2_mce_desc[] = {
306 "High SRAM ECC or parity error",
307 "Low SRAM ECC or parity error",
308 "Instruction Cache Bank 0 ECC or parity error",
309 "Instruction Cache Bank 1 ECC or parity error",
310 "Instruction Tag Ram 0 parity error",
311 "Instruction Tag Ram 1 parity error",
312 "Data Cache Bank 0 ECC or parity error",
313 "Data Cache Bank 1 ECC or parity error",
314 "Data Cache Bank 2 ECC or parity error",
315 "Data Cache Bank 3 ECC or parity error",
316 "Data Tag Bank 0 parity error",
317 "Data Tag Bank 1 parity error",
318 "Data Tag Bank 2 parity error",
319 "Data Tag Bank 3 parity error",
320 "Dirty Data Ram parity error",
321 "TLB Bank 0 parity error",
322 "TLB Bank 1 parity error",
323 "System Hub Read Buffer ECC or parity error",
282}; 324};
283 325
284static const char * const smca_smu_mce_desc[] = { 326static const char * const smca_smu_mce_desc[] = {
285 "SMU RAM ECC or parity error", 327 "An ECC or parity error in an SMU RAM instance",
328};
329
330static const char * const smca_smu2_mce_desc[] = {
331 "High SRAM ECC or parity error",
332 "Low SRAM ECC or parity error",
333 "Data Cache Bank A ECC or parity error",
334 "Data Cache Bank B ECC or parity error",
335 "Data Tag Cache Bank A ECC or parity error",
336 "Data Tag Cache Bank B ECC or parity error",
337 "Instruction Cache Bank A ECC or parity error",
338 "Instruction Cache Bank B ECC or parity error",
339 "Instruction Tag Cache Bank A ECC or parity error",
340 "Instruction Tag Cache Bank B ECC or parity error",
341 "System Hub Read Buffer ECC or parity error",
342};
343
344static const char * const smca_mp5_mce_desc[] = {
345 "High SRAM ECC or parity error",
346 "Low SRAM ECC or parity error",
347 "Data Cache Bank A ECC or parity error",
348 "Data Cache Bank B ECC or parity error",
349 "Data Tag Cache Bank A ECC or parity error",
350 "Data Tag Cache Bank B ECC or parity error",
351 "Instruction Cache Bank A ECC or parity error",
352 "Instruction Cache Bank B ECC or parity error",
353 "Instruction Tag Cache Bank A ECC or parity error",
354 "Instruction Tag Cache Bank B ECC or parity error",
355};
356
357static const char * const smca_nbio_mce_desc[] = {
358 "ECC or Parity error",
359 "PCIE error",
360 "SDP ErrEvent error",
361 "SDP Egress Poison Error",
362 "IOHC Internal Poison Error",
363};
364
365static const char * const smca_pcie_mce_desc[] = {
366 "CCIX PER Message logging",
367 "CCIX Read Response with Status: Non-Data Error",
368 "CCIX Write Response with Status: Non-Data Error",
369 "CCIX Read Response with Status: Data Error",
370 "CCIX Non-okay write response with data error",
286}; 371};
287 372
288struct smca_mce_desc { 373struct smca_mce_desc {
@@ -299,11 +384,17 @@ static struct smca_mce_desc smca_mce_descs[] = {
299 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, 384 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
300 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, 385 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
301 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, 386 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
387 [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
302 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, 388 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
303 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, 389 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
304 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, 390 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
305 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, 391 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
392 [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
306 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, 393 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
394 [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
395 [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
396 [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
397 [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
307}; 398};
308 399
309static bool f12h_mc0_mce(u16 ec, u8 xec) 400static bool f12h_mc0_mce(u16 ec, u8 xec)
@@ -874,13 +965,12 @@ static void decode_smca_error(struct mce *m)
874 965
875 ip_name = smca_get_long_name(bank_type); 966 ip_name = smca_get_long_name(bank_type);
876 967
877 pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec); 968 pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
878 969
879 /* Only print the decode of valid error codes */ 970 /* Only print the decode of valid error codes */
880 if (xec < smca_mce_descs[bank_type].num_descs && 971 if (xec < smca_mce_descs[bank_type].num_descs &&
881 (hwid->xec_bitmap & BIT_ULL(xec))) { 972 (hwid->xec_bitmap & BIT_ULL(xec))) {
882 pr_emerg(HW_ERR "%s Error: ", ip_name); 973 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
883 pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
884 } 974 }
885 975
886 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) 976 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
@@ -961,26 +1051,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
961 ((m->status & MCI_STATUS_UC) ? "UE" : 1051 ((m->status & MCI_STATUS_UC) ? "UE" :
962 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), 1052 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
963 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), 1053 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
964 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), 1054 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
965 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); 1055 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
966
967 if (fam >= 0x15) {
968 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
969
970 /* F15h, bank4, bit 43 is part of McaStatSubCache. */
971 if (fam != 0x15 || m->bank != 4)
972 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
973 }
974 1056
975 if (boot_cpu_has(X86_FEATURE_SMCA)) { 1057 if (boot_cpu_has(X86_FEATURE_SMCA)) {
976 u32 low, high; 1058 u32 low, high;
977 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); 1059 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
978 1060
979 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
980
981 if (!rdmsr_safe(addr, &low, &high) && 1061 if (!rdmsr_safe(addr, &low, &high) &&
982 (low & MCI_CONFIG_MCAX)) 1062 (low & MCI_CONFIG_MCAX))
983 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); 1063 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1064
1065 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
984 } 1066 }
985 1067
986 /* do the two bits[14:13] together */ 1068 /* do the two bits[14:13] together */
@@ -988,6 +1070,17 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
988 if (ecc) 1070 if (ecc)
989 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); 1071 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
990 1072
1073 if (fam >= 0x15) {
1074 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1075
1076 /* F15h, bank4, bit 43 is part of McaStatSubCache. */
1077 if (fam != 0x15 || m->bank != 4)
1078 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1079 }
1080
1081 if (fam >= 0x17)
1082 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1083
991 pr_cont("]: 0x%016llx\n", m->status); 1084 pr_cont("]: 0x%016llx\n", m->status);
992 1085
993 if (m->status & MCI_STATUS_ADDRV) 1086 if (m->status & MCI_STATUS_ADDRV)