about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Borislav Petkov <borislav.petkov@amd.com> 2010-08-31 12:28:08 -0400
committer: Borislav Petkov <bp@amd64.org> 2010-10-21 08:48:02 -0400
commit: 5ce88f6ea6bef929f59f9468413f922c9a486fa4 (patch)
tree: e4a3b7fa9f3e782424453da68bb3aeff78647796
parent: ded506232865e8e932bc21c87f48170d50db4d97 (diff)
EDAC, MCE: Complete NB MCE decoders
Add support for decoding F14h BU MCEs and improve decoding of the remaining families.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
-rw-r--r--  drivers/edac/amd64_edac.h    1
-rw-r--r--  drivers/edac/mce_amd.c     210
-rw-r--r--  drivers/edac/mce_amd.h       3
3 files changed, 158 insertions, 56 deletions
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 13e1d6f25bd1..044aee4f944d 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -482,7 +482,6 @@ extern const char *rrrr_msgs[16];
482extern const char *to_msgs[2]; 482extern const char *to_msgs[2];
483extern const char *pp_msgs[4]; 483extern const char *pp_msgs[4];
484extern const char *ii_msgs[4]; 484extern const char *ii_msgs[4];
485extern const char *ext_msgs[32];
486extern const char *htlink_msgs[8]; 485extern const char *htlink_msgs[8];
487 486
488#ifdef CONFIG_EDAC_DEBUG 487#ifdef CONFIG_EDAC_DEBUG
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 3c161672a84b..d8d1c9de1ed6 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -5,6 +5,8 @@
5 5
6static struct amd_decoder_ops *fam_ops; 6static struct amd_decoder_ops *fam_ops;
7 7
8static u8 nb_err_cpumask = 0xf;
9
8static bool report_gart_errors; 10static bool report_gart_errors;
9static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); 11static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
10 12
@@ -61,45 +63,16 @@ EXPORT_SYMBOL_GPL(to_msgs);
61const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; 63const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
62EXPORT_SYMBOL_GPL(ii_msgs); 64EXPORT_SYMBOL_GPL(ii_msgs);
63 65
64/* 66static const char *f10h_nb_mce_desc[] = {
65 * Map the 4 or 5 (family-specific) bits of Extended Error code to the 67 "HT link data error",
66 * string table. 68 "Protocol error (link, L3, probe filter, etc.)",
67 */ 69 "Parity error in NB-internal arrays",
68const char *ext_msgs[] = { 70 "Link Retry due to IO link transmission error",
69 "K8 ECC error", /* 0_0000b */ 71 "L3 ECC data cache error",
70 "CRC error on link", /* 0_0001b */ 72 "ECC error in L3 cache tag",
71 "Sync error packets on link", /* 0_0010b */ 73 "L3 LRU parity bits error",
72 "Master Abort during link operation", /* 0_0011b */ 74 "ECC Error in the Probe Filter directory"
73 "Target Abort during link operation", /* 0_0100b */
74 "Invalid GART PTE entry during table walk", /* 0_0101b */
75 "Unsupported atomic RMW command received", /* 0_0110b */
76 "WDT error: NB transaction timeout", /* 0_0111b */
77 "ECC/ChipKill ECC error", /* 0_1000b */
78 "SVM DEV Error", /* 0_1001b */
79 "Link Data error", /* 0_1010b */
80 "Link/L3/Probe Filter Protocol error", /* 0_1011b */
81 "NB Internal Arrays Parity error", /* 0_1100b */
82 "DRAM Address/Control Parity error", /* 0_1101b */
83 "Link Transmission error", /* 0_1110b */
84 "GART/DEV Table Walk Data error" /* 0_1111b */
85 "Res 0x100 error", /* 1_0000b */
86 "Res 0x101 error", /* 1_0001b */
87 "Res 0x102 error", /* 1_0010b */
88 "Res 0x103 error", /* 1_0011b */
89 "Res 0x104 error", /* 1_0100b */
90 "Res 0x105 error", /* 1_0101b */
91 "Res 0x106 error", /* 1_0110b */
92 "Res 0x107 error", /* 1_0111b */
93 "Res 0x108 error", /* 1_1000b */
94 "Res 0x109 error", /* 1_1001b */
95 "Res 0x10A error", /* 1_1010b */
96 "Res 0x10B error", /* 1_1011b */
97 "ECC error in L3 Cache Data", /* 1_1100b */
98 "L3 Cache Tag error", /* 1_1101b */
99 "L3 Cache LRU Parity error", /* 1_1110b */
100 "Probe Filter error" /* 1_1111b */
101}; 75};
102EXPORT_SYMBOL_GPL(ext_msgs);
103 76
104static bool f10h_dc_mce(u16 ec) 77static bool f10h_dc_mce(u16 ec)
105{ 78{
@@ -366,19 +339,97 @@ wrong_ls_mce:
366 pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); 339 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
367} 340}
368 341
342static bool k8_nb_mce(u16 ec, u8 xec)
343{
344 bool ret = true;
345
346 switch (xec) {
347 case 0x1:
348 pr_cont("CRC error detected on HT link.\n");
349 break;
350
351 case 0x5:
352 pr_cont("Invalid GART PTE entry during GART table walk.\n");
353 break;
354
355 case 0x6:
356 pr_cont("Unsupported atomic RMW received from an IO link.\n");
357 break;
358
359 case 0x0:
360 case 0x8:
361 pr_cont("DRAM ECC error detected on the NB.\n");
362 break;
363
364 case 0xd:
365 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
366 break;
367
368 default:
369 ret = false;
370 break;
371 }
372
373 return ret;
374}
375
376static bool f10h_nb_mce(u16 ec, u8 xec)
377{
378 bool ret = true;
379 u8 offset = 0;
380
381 if (k8_nb_mce(ec, xec))
382 return true;
383
384 switch(xec) {
385 case 0xa ... 0xc:
386 offset = 10;
387 break;
388
389 case 0xe:
390 offset = 11;
391 break;
392
393 case 0xf:
394 if (TLB_ERROR(ec))
395 pr_cont("GART Table Walk data error.\n");
396 else if (BUS_ERROR(ec))
397 pr_cont("DMA Exclusion Vector Table Walk error.\n");
398 else
399 ret = false;
400
401 goto out;
402 break;
403
404 case 0x1c ... 0x1f:
405 offset = 24;
406 break;
407
408 default:
409 ret = false;
410
411 goto out;
412 break;
413 }
414
415 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
416
417out:
418 return ret;
419}
420
421static bool f14h_nb_mce(u16 ec, u8 xec)
422{
423 return false;
424}
425
369void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) 426void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
370{ 427{
371 u32 ec = m->status & 0xffff; 428 u8 xec = (m->status >> 16) & 0x1f;
429 u16 ec = m->status & 0xffff;
372 u32 nbsh = (u32)(m->status >> 32); 430 u32 nbsh = (u32)(m->status >> 32);
373 u32 nbsl = (u32)m->status;
374
375 /*
376 * GART TLB error reporting is disabled by default. Bail out early.
377 */
378 if (TLB_ERROR(ec) && !report_gart_errors)
379 return;
380 431
381 pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); 432 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
382 433
383 /* 434 /*
384 * F10h, revD can disable ErrCpu[3:0] so check that first and also the 435 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
@@ -387,20 +438,50 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
387 if ((boot_cpu_data.x86 == 0x10) && 438 if ((boot_cpu_data.x86 == 0x10) &&
388 (boot_cpu_data.x86_model > 7)) { 439 (boot_cpu_data.x86_model > 7)) {
389 if (nbsh & K8_NBSH_ERR_CPU_VAL) 440 if (nbsh & K8_NBSH_ERR_CPU_VAL)
390 pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); 441 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
391 } else { 442 } else {
392 u8 assoc_cpus = nbsh & 0xf; 443 u8 assoc_cpus = nbsh & nb_err_cpumask;
393 444
394 if (assoc_cpus > 0) 445 if (assoc_cpus > 0)
395 pr_cont(", core: %d", fls(assoc_cpus) - 1); 446 pr_cont(", core: %d", fls(assoc_cpus) - 1);
447 }
396 448
397 pr_cont("\n"); 449 switch (xec) {
450 case 0x2:
451 pr_cont("Sync error (sync packets on HT link detected).\n");
452 return;
453
454 case 0x3:
455 pr_cont("HT Master abort.\n");
456 return;
457
458 case 0x4:
459 pr_cont("HT Target abort.\n");
460 return;
461
462 case 0x7:
463 pr_cont("NB Watchdog timeout.\n");
464 return;
465
466 case 0x9:
467 pr_cont("SVM DMA Exclusion Vector error.\n");
468 return;
469
470 default:
471 break;
398 } 472 }
399 473
400 pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); 474 if (!fam_ops->nb_mce(ec, xec))
475 goto wrong_nb_mce;
476
477 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
478 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
479 nb_bus_decoder(node_id, m, nbcfg);
401 480
402 if (BUS_ERROR(ec) && nb_bus_decoder) 481 return;
403 nb_bus_decoder(node_id, m, nbcfg); 482
483wrong_nb_mce:
484 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
404} 485}
405EXPORT_SYMBOL_GPL(amd_decode_nb_mce); 486EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
406 487
@@ -430,11 +511,30 @@ static inline void amd_decode_err_code(u16 ec)
430 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); 511 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
431} 512}
432 513
514/*
515 * Filter out unwanted MCE signatures here.
516 */
517static bool amd_filter_mce(struct mce *m)
518{
519 u8 xec = (m->status >> 16) & 0x1f;
520
521 /*
522 * NB GART TLB error reporting is disabled by default.
523 */
524 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
525 return true;
526
527 return false;
528}
529
433int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 530int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
434{ 531{
435 struct mce *m = (struct mce *)data; 532 struct mce *m = (struct mce *)data;
436 int node, ecc; 533 int node, ecc;
437 534
535 if (amd_filter_mce(m))
536 return NOTIFY_STOP;
537
438 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); 538 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
439 539
440 pr_cont("%sorrected error, other errors lost: %s, " 540 pr_cont("%sorrected error, other errors lost: %s, "
@@ -509,16 +609,20 @@ static int __init mce_amd_init(void)
509 case 0xf: 609 case 0xf:
510 fam_ops->dc_mce = k8_dc_mce; 610 fam_ops->dc_mce = k8_dc_mce;
511 fam_ops->ic_mce = k8_ic_mce; 611 fam_ops->ic_mce = k8_ic_mce;
612 fam_ops->nb_mce = k8_nb_mce;
512 break; 613 break;
513 614
514 case 0x10: 615 case 0x10:
515 fam_ops->dc_mce = f10h_dc_mce; 616 fam_ops->dc_mce = f10h_dc_mce;
516 fam_ops->ic_mce = k8_ic_mce; 617 fam_ops->ic_mce = k8_ic_mce;
618 fam_ops->nb_mce = f10h_nb_mce;
517 break; 619 break;
518 620
519 case 0x14: 621 case 0x14:
622 nb_err_cpumask = 0x3;
520 fam_ops->dc_mce = f14h_dc_mce; 623 fam_ops->dc_mce = f14h_dc_mce;
521 fam_ops->ic_mce = f14h_ic_mce; 624 fam_ops->ic_mce = f14h_ic_mce;
625 fam_ops->nb_mce = f14h_nb_mce;
522 break; 626 break;
523 627
524 default: 628 default:
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h
index dc81dba9364b..0d0637debbad 100644
--- a/drivers/edac/mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -7,7 +7,6 @@
7 7
8#define ERROR_CODE(x) ((x) & 0xffff) 8#define ERROR_CODE(x) ((x) & 0xffff)
9#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) 9#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
10#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
11 10
12#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) 11#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
13#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) 12#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
@@ -83,7 +82,6 @@ extern const char *rrrr_msgs[];
83extern const char *pp_msgs[]; 82extern const char *pp_msgs[];
84extern const char *to_msgs[]; 83extern const char *to_msgs[];
85extern const char *ii_msgs[]; 84extern const char *ii_msgs[];
86extern const char *ext_msgs[];
87 85
88/* 86/*
89 * relevant NB regs 87 * relevant NB regs
@@ -102,6 +100,7 @@ struct err_regs {
102struct amd_decoder_ops { 100struct amd_decoder_ops {
103 bool (*dc_mce)(u16); 101 bool (*dc_mce)(u16);
104 bool (*ic_mce)(u16); 102 bool (*ic_mce)(u16);
103 bool (*nb_mce)(u16, u8);
105}; 104};
106 105
107void amd_report_gart_errors(bool); 106void amd_report_gart_errors(bool);