diff options
author | Borislav Petkov <borislav.petkov@amd.com> | 2010-08-31 12:28:08 -0400 |
---|---|---|
committer | Borislav Petkov <bp@amd64.org> | 2010-10-21 08:48:02 -0400 |
commit | 5ce88f6ea6bef929f59f9468413f922c9a486fa4 (patch) | |
tree | e4a3b7fa9f3e782424453da68bb3aeff78647796 | |
parent | ded506232865e8e932bc21c87f48170d50db4d97 (diff) |
EDAC, MCE: Complete NB MCE decoders
Add support for decoding F14h NB MCEs and improve decoding of the
remaining families.
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
-rw-r--r-- | drivers/edac/amd64_edac.h | 1 | ||||
-rw-r--r-- | drivers/edac/mce_amd.c | 210 | ||||
-rw-r--r-- | drivers/edac/mce_amd.h | 3 |
3 files changed, 158 insertions, 56 deletions
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 13e1d6f25bd1..044aee4f944d 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h | |||
@@ -482,7 +482,6 @@ extern const char *rrrr_msgs[16]; | |||
482 | extern const char *to_msgs[2]; | 482 | extern const char *to_msgs[2]; |
483 | extern const char *pp_msgs[4]; | 483 | extern const char *pp_msgs[4]; |
484 | extern const char *ii_msgs[4]; | 484 | extern const char *ii_msgs[4]; |
485 | extern const char *ext_msgs[32]; | ||
486 | extern const char *htlink_msgs[8]; | 485 | extern const char *htlink_msgs[8]; |
487 | 486 | ||
488 | #ifdef CONFIG_EDAC_DEBUG | 487 | #ifdef CONFIG_EDAC_DEBUG |
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 3c161672a84b..d8d1c9de1ed6 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c | |||
@@ -5,6 +5,8 @@ | |||
5 | 5 | ||
6 | static struct amd_decoder_ops *fam_ops; | 6 | static struct amd_decoder_ops *fam_ops; |
7 | 7 | ||
8 | static u8 nb_err_cpumask = 0xf; | ||
9 | |||
8 | static bool report_gart_errors; | 10 | static bool report_gart_errors; |
9 | static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); | 11 | static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); |
10 | 12 | ||
@@ -61,45 +63,16 @@ EXPORT_SYMBOL_GPL(to_msgs); | |||
61 | const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; | 63 | const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; |
62 | EXPORT_SYMBOL_GPL(ii_msgs); | 64 | EXPORT_SYMBOL_GPL(ii_msgs); |
63 | 65 | ||
64 | /* | 66 | static const char *f10h_nb_mce_desc[] = { |
65 | * Map the 4 or 5 (family-specific) bits of Extended Error code to the | 67 | "HT link data error", |
66 | * string table. | 68 | "Protocol error (link, L3, probe filter, etc.)", |
67 | */ | 69 | "Parity error in NB-internal arrays", |
68 | const char *ext_msgs[] = { | 70 | "Link Retry due to IO link transmission error", |
69 | "K8 ECC error", /* 0_0000b */ | 71 | "L3 ECC data cache error", |
70 | "CRC error on link", /* 0_0001b */ | 72 | "ECC error in L3 cache tag", |
71 | "Sync error packets on link", /* 0_0010b */ | 73 | "L3 LRU parity bits error", |
72 | "Master Abort during link operation", /* 0_0011b */ | 74 | "ECC Error in the Probe Filter directory" |
73 | "Target Abort during link operation", /* 0_0100b */ | ||
74 | "Invalid GART PTE entry during table walk", /* 0_0101b */ | ||
75 | "Unsupported atomic RMW command received", /* 0_0110b */ | ||
76 | "WDT error: NB transaction timeout", /* 0_0111b */ | ||
77 | "ECC/ChipKill ECC error", /* 0_1000b */ | ||
78 | "SVM DEV Error", /* 0_1001b */ | ||
79 | "Link Data error", /* 0_1010b */ | ||
80 | "Link/L3/Probe Filter Protocol error", /* 0_1011b */ | ||
81 | "NB Internal Arrays Parity error", /* 0_1100b */ | ||
82 | "DRAM Address/Control Parity error", /* 0_1101b */ | ||
83 | "Link Transmission error", /* 0_1110b */ | ||
84 | "GART/DEV Table Walk Data error" /* 0_1111b */ | ||
85 | "Res 0x100 error", /* 1_0000b */ | ||
86 | "Res 0x101 error", /* 1_0001b */ | ||
87 | "Res 0x102 error", /* 1_0010b */ | ||
88 | "Res 0x103 error", /* 1_0011b */ | ||
89 | "Res 0x104 error", /* 1_0100b */ | ||
90 | "Res 0x105 error", /* 1_0101b */ | ||
91 | "Res 0x106 error", /* 1_0110b */ | ||
92 | "Res 0x107 error", /* 1_0111b */ | ||
93 | "Res 0x108 error", /* 1_1000b */ | ||
94 | "Res 0x109 error", /* 1_1001b */ | ||
95 | "Res 0x10A error", /* 1_1010b */ | ||
96 | "Res 0x10B error", /* 1_1011b */ | ||
97 | "ECC error in L3 Cache Data", /* 1_1100b */ | ||
98 | "L3 Cache Tag error", /* 1_1101b */ | ||
99 | "L3 Cache LRU Parity error", /* 1_1110b */ | ||
100 | "Probe Filter error" /* 1_1111b */ | ||
101 | }; | 75 | }; |
102 | EXPORT_SYMBOL_GPL(ext_msgs); | ||
103 | 76 | ||
104 | static bool f10h_dc_mce(u16 ec) | 77 | static bool f10h_dc_mce(u16 ec) |
105 | { | 78 | { |
@@ -366,19 +339,97 @@ wrong_ls_mce: | |||
366 | pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); | 339 | pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); |
367 | } | 340 | } |
368 | 341 | ||
342 | static bool k8_nb_mce(u16 ec, u8 xec) | ||
343 | { | ||
344 | bool ret = true; | ||
345 | |||
346 | switch (xec) { | ||
347 | case 0x1: | ||
348 | pr_cont("CRC error detected on HT link.\n"); | ||
349 | break; | ||
350 | |||
351 | case 0x5: | ||
352 | pr_cont("Invalid GART PTE entry during GART table walk.\n"); | ||
353 | break; | ||
354 | |||
355 | case 0x6: | ||
356 | pr_cont("Unsupported atomic RMW received from an IO link.\n"); | ||
357 | break; | ||
358 | |||
359 | case 0x0: | ||
360 | case 0x8: | ||
361 | pr_cont("DRAM ECC error detected on the NB.\n"); | ||
362 | break; | ||
363 | |||
364 | case 0xd: | ||
365 | pr_cont("Parity error on the DRAM addr/ctl signals.\n"); | ||
366 | break; | ||
367 | |||
368 | default: | ||
369 | ret = false; | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | return ret; | ||
374 | } | ||
375 | |||
376 | static bool f10h_nb_mce(u16 ec, u8 xec) | ||
377 | { | ||
378 | bool ret = true; | ||
379 | u8 offset = 0; | ||
380 | |||
381 | if (k8_nb_mce(ec, xec)) | ||
382 | return true; | ||
383 | |||
384 | switch(xec) { | ||
385 | case 0xa ... 0xc: | ||
386 | offset = 10; | ||
387 | break; | ||
388 | |||
389 | case 0xe: | ||
390 | offset = 11; | ||
391 | break; | ||
392 | |||
393 | case 0xf: | ||
394 | if (TLB_ERROR(ec)) | ||
395 | pr_cont("GART Table Walk data error.\n"); | ||
396 | else if (BUS_ERROR(ec)) | ||
397 | pr_cont("DMA Exclusion Vector Table Walk error.\n"); | ||
398 | else | ||
399 | ret = false; | ||
400 | |||
401 | goto out; | ||
402 | break; | ||
403 | |||
404 | case 0x1c ... 0x1f: | ||
405 | offset = 24; | ||
406 | break; | ||
407 | |||
408 | default: | ||
409 | ret = false; | ||
410 | |||
411 | goto out; | ||
412 | break; | ||
413 | } | ||
414 | |||
415 | pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); | ||
416 | |||
417 | out: | ||
418 | return ret; | ||
419 | } | ||
420 | |||
421 | static bool f14h_nb_mce(u16 ec, u8 xec) | ||
422 | { | ||
423 | return false; | ||
424 | } | ||
425 | |||
369 | void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) | 426 | void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) |
370 | { | 427 | { |
371 | u32 ec = m->status & 0xffff; | 428 | u8 xec = (m->status >> 16) & 0x1f; |
429 | u16 ec = m->status & 0xffff; | ||
372 | u32 nbsh = (u32)(m->status >> 32); | 430 | u32 nbsh = (u32)(m->status >> 32); |
373 | u32 nbsl = (u32)m->status; | ||
374 | |||
375 | /* | ||
376 | * GART TLB error reporting is disabled by default. Bail out early. | ||
377 | */ | ||
378 | if (TLB_ERROR(ec) && !report_gart_errors) | ||
379 | return; | ||
380 | 431 | ||
381 | pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); | 432 | pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); |
382 | 433 | ||
383 | /* | 434 | /* |
384 | * F10h, revD can disable ErrCpu[3:0] so check that first and also the | 435 | * F10h, revD can disable ErrCpu[3:0] so check that first and also the |
@@ -387,20 +438,50 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) | |||
387 | if ((boot_cpu_data.x86 == 0x10) && | 438 | if ((boot_cpu_data.x86 == 0x10) && |
388 | (boot_cpu_data.x86_model > 7)) { | 439 | (boot_cpu_data.x86_model > 7)) { |
389 | if (nbsh & K8_NBSH_ERR_CPU_VAL) | 440 | if (nbsh & K8_NBSH_ERR_CPU_VAL) |
390 | pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); | 441 | pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); |
391 | } else { | 442 | } else { |
392 | u8 assoc_cpus = nbsh & 0xf; | 443 | u8 assoc_cpus = nbsh & nb_err_cpumask; |
393 | 444 | ||
394 | if (assoc_cpus > 0) | 445 | if (assoc_cpus > 0) |
395 | pr_cont(", core: %d", fls(assoc_cpus) - 1); | 446 | pr_cont(", core: %d", fls(assoc_cpus) - 1); |
447 | } | ||
396 | 448 | ||
397 | pr_cont("\n"); | 449 | switch (xec) { |
450 | case 0x2: | ||
451 | pr_cont("Sync error (sync packets on HT link detected).\n"); | ||
452 | return; | ||
453 | |||
454 | case 0x3: | ||
455 | pr_cont("HT Master abort.\n"); | ||
456 | return; | ||
457 | |||
458 | case 0x4: | ||
459 | pr_cont("HT Target abort.\n"); | ||
460 | return; | ||
461 | |||
462 | case 0x7: | ||
463 | pr_cont("NB Watchdog timeout.\n"); | ||
464 | return; | ||
465 | |||
466 | case 0x9: | ||
467 | pr_cont("SVM DMA Exclusion Vector error.\n"); | ||
468 | return; | ||
469 | |||
470 | default: | ||
471 | break; | ||
398 | } | 472 | } |
399 | 473 | ||
400 | pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); | 474 | if (!fam_ops->nb_mce(ec, xec)) |
475 | goto wrong_nb_mce; | ||
476 | |||
477 | if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) | ||
478 | if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) | ||
479 | nb_bus_decoder(node_id, m, nbcfg); | ||
401 | 480 | ||
402 | if (BUS_ERROR(ec) && nb_bus_decoder) | 481 | return; |
403 | nb_bus_decoder(node_id, m, nbcfg); | 482 | |
483 | wrong_nb_mce: | ||
484 | pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); | ||
404 | } | 485 | } |
405 | EXPORT_SYMBOL_GPL(amd_decode_nb_mce); | 486 | EXPORT_SYMBOL_GPL(amd_decode_nb_mce); |
406 | 487 | ||
@@ -430,11 +511,30 @@ static inline void amd_decode_err_code(u16 ec) | |||
430 | pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); | 511 | pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); |
431 | } | 512 | } |
432 | 513 | ||
514 | /* | ||
515 | * Filter out unwanted MCE signatures here. | ||
516 | */ | ||
517 | static bool amd_filter_mce(struct mce *m) | ||
518 | { | ||
519 | u8 xec = (m->status >> 16) & 0x1f; | ||
520 | |||
521 | /* | ||
522 | * NB GART TLB error reporting is disabled by default. | ||
523 | */ | ||
524 | if (m->bank == 4 && xec == 0x5 && !report_gart_errors) | ||
525 | return true; | ||
526 | |||
527 | return false; | ||
528 | } | ||
529 | |||
433 | int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) | 530 | int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) |
434 | { | 531 | { |
435 | struct mce *m = (struct mce *)data; | 532 | struct mce *m = (struct mce *)data; |
436 | int node, ecc; | 533 | int node, ecc; |
437 | 534 | ||
535 | if (amd_filter_mce(m)) | ||
536 | return NOTIFY_STOP; | ||
537 | |||
438 | pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); | 538 | pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); |
439 | 539 | ||
440 | pr_cont("%sorrected error, other errors lost: %s, " | 540 | pr_cont("%sorrected error, other errors lost: %s, " |
@@ -509,16 +609,20 @@ static int __init mce_amd_init(void) | |||
509 | case 0xf: | 609 | case 0xf: |
510 | fam_ops->dc_mce = k8_dc_mce; | 610 | fam_ops->dc_mce = k8_dc_mce; |
511 | fam_ops->ic_mce = k8_ic_mce; | 611 | fam_ops->ic_mce = k8_ic_mce; |
612 | fam_ops->nb_mce = k8_nb_mce; | ||
512 | break; | 613 | break; |
513 | 614 | ||
514 | case 0x10: | 615 | case 0x10: |
515 | fam_ops->dc_mce = f10h_dc_mce; | 616 | fam_ops->dc_mce = f10h_dc_mce; |
516 | fam_ops->ic_mce = k8_ic_mce; | 617 | fam_ops->ic_mce = k8_ic_mce; |
618 | fam_ops->nb_mce = f10h_nb_mce; | ||
517 | break; | 619 | break; |
518 | 620 | ||
519 | case 0x14: | 621 | case 0x14: |
622 | nb_err_cpumask = 0x3; | ||
520 | fam_ops->dc_mce = f14h_dc_mce; | 623 | fam_ops->dc_mce = f14h_dc_mce; |
521 | fam_ops->ic_mce = f14h_ic_mce; | 624 | fam_ops->ic_mce = f14h_ic_mce; |
625 | fam_ops->nb_mce = f14h_nb_mce; | ||
522 | break; | 626 | break; |
523 | 627 | ||
524 | default: | 628 | default: |
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index dc81dba9364b..0d0637debbad 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h | |||
@@ -7,7 +7,6 @@ | |||
7 | 7 | ||
8 | #define ERROR_CODE(x) ((x) & 0xffff) | 8 | #define ERROR_CODE(x) ((x) & 0xffff) |
9 | #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) | 9 | #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) |
10 | #define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] | ||
11 | 10 | ||
12 | #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) | 11 | #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) |
13 | #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) | 12 | #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) |
@@ -83,7 +82,6 @@ extern const char *rrrr_msgs[]; | |||
83 | extern const char *pp_msgs[]; | 82 | extern const char *pp_msgs[]; |
84 | extern const char *to_msgs[]; | 83 | extern const char *to_msgs[]; |
85 | extern const char *ii_msgs[]; | 84 | extern const char *ii_msgs[]; |
86 | extern const char *ext_msgs[]; | ||
87 | 85 | ||
88 | /* | 86 | /* |
89 | * relevant NB regs | 87 | * relevant NB regs |
@@ -102,6 +100,7 @@ struct err_regs { | |||
102 | struct amd_decoder_ops { | 100 | struct amd_decoder_ops { |
103 | bool (*dc_mce)(u16); | 101 | bool (*dc_mce)(u16); |
104 | bool (*ic_mce)(u16); | 102 | bool (*ic_mce)(u16); |
103 | bool (*nb_mce)(u16, u8); | ||
105 | }; | 104 | }; |
106 | 105 | ||
107 | void amd_report_gart_errors(bool); | 106 | void amd_report_gart_errors(bool); |