diff options
| author | Borislav Petkov <borislav.petkov@amd.com> | 2010-08-31 12:28:08 -0400 |
|---|---|---|
| committer | Borislav Petkov <bp@amd64.org> | 2010-10-21 08:48:02 -0400 |
| commit | 5ce88f6ea6bef929f59f9468413f922c9a486fa4 (patch) | |
| tree | e4a3b7fa9f3e782424453da68bb3aeff78647796 | |
| parent | ded506232865e8e932bc21c87f48170d50db4d97 (diff) | |
EDAC, MCE: Complete NB MCE decoders
Add support for decoding F14h NB MCEs and improve decoding of the
remaining families.
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
| -rw-r--r-- | drivers/edac/amd64_edac.h | 1 | ||||
| -rw-r--r-- | drivers/edac/mce_amd.c | 210 | ||||
| -rw-r--r-- | drivers/edac/mce_amd.h | 3 |
3 files changed, 158 insertions, 56 deletions
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 13e1d6f25bd1..044aee4f944d 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h | |||
| @@ -482,7 +482,6 @@ extern const char *rrrr_msgs[16]; | |||
| 482 | extern const char *to_msgs[2]; | 482 | extern const char *to_msgs[2]; |
| 483 | extern const char *pp_msgs[4]; | 483 | extern const char *pp_msgs[4]; |
| 484 | extern const char *ii_msgs[4]; | 484 | extern const char *ii_msgs[4]; |
| 485 | extern const char *ext_msgs[32]; | ||
| 486 | extern const char *htlink_msgs[8]; | 485 | extern const char *htlink_msgs[8]; |
| 487 | 486 | ||
| 488 | #ifdef CONFIG_EDAC_DEBUG | 487 | #ifdef CONFIG_EDAC_DEBUG |
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 3c161672a84b..d8d1c9de1ed6 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c | |||
| @@ -5,6 +5,8 @@ | |||
| 5 | 5 | ||
| 6 | static struct amd_decoder_ops *fam_ops; | 6 | static struct amd_decoder_ops *fam_ops; |
| 7 | 7 | ||
| 8 | static u8 nb_err_cpumask = 0xf; | ||
| 9 | |||
| 8 | static bool report_gart_errors; | 10 | static bool report_gart_errors; |
| 9 | static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); | 11 | static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); |
| 10 | 12 | ||
| @@ -61,45 +63,16 @@ EXPORT_SYMBOL_GPL(to_msgs); | |||
| 61 | const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; | 63 | const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; |
| 62 | EXPORT_SYMBOL_GPL(ii_msgs); | 64 | EXPORT_SYMBOL_GPL(ii_msgs); |
| 63 | 65 | ||
| 64 | /* | 66 | static const char *f10h_nb_mce_desc[] = { |
| 65 | * Map the 4 or 5 (family-specific) bits of Extended Error code to the | 67 | "HT link data error", |
| 66 | * string table. | 68 | "Protocol error (link, L3, probe filter, etc.)", |
| 67 | */ | 69 | "Parity error in NB-internal arrays", |
| 68 | const char *ext_msgs[] = { | 70 | "Link Retry due to IO link transmission error", |
| 69 | "K8 ECC error", /* 0_0000b */ | 71 | "L3 ECC data cache error", |
| 70 | "CRC error on link", /* 0_0001b */ | 72 | "ECC error in L3 cache tag", |
| 71 | "Sync error packets on link", /* 0_0010b */ | 73 | "L3 LRU parity bits error", |
| 72 | "Master Abort during link operation", /* 0_0011b */ | 74 | "ECC Error in the Probe Filter directory" |
| 73 | "Target Abort during link operation", /* 0_0100b */ | ||
| 74 | "Invalid GART PTE entry during table walk", /* 0_0101b */ | ||
| 75 | "Unsupported atomic RMW command received", /* 0_0110b */ | ||
| 76 | "WDT error: NB transaction timeout", /* 0_0111b */ | ||
| 77 | "ECC/ChipKill ECC error", /* 0_1000b */ | ||
| 78 | "SVM DEV Error", /* 0_1001b */ | ||
| 79 | "Link Data error", /* 0_1010b */ | ||
| 80 | "Link/L3/Probe Filter Protocol error", /* 0_1011b */ | ||
| 81 | "NB Internal Arrays Parity error", /* 0_1100b */ | ||
| 82 | "DRAM Address/Control Parity error", /* 0_1101b */ | ||
| 83 | "Link Transmission error", /* 0_1110b */ | ||
| 84 | "GART/DEV Table Walk Data error" /* 0_1111b */ | ||
| 85 | "Res 0x100 error", /* 1_0000b */ | ||
| 86 | "Res 0x101 error", /* 1_0001b */ | ||
| 87 | "Res 0x102 error", /* 1_0010b */ | ||
| 88 | "Res 0x103 error", /* 1_0011b */ | ||
| 89 | "Res 0x104 error", /* 1_0100b */ | ||
| 90 | "Res 0x105 error", /* 1_0101b */ | ||
| 91 | "Res 0x106 error", /* 1_0110b */ | ||
| 92 | "Res 0x107 error", /* 1_0111b */ | ||
| 93 | "Res 0x108 error", /* 1_1000b */ | ||
| 94 | "Res 0x109 error", /* 1_1001b */ | ||
| 95 | "Res 0x10A error", /* 1_1010b */ | ||
| 96 | "Res 0x10B error", /* 1_1011b */ | ||
| 97 | "ECC error in L3 Cache Data", /* 1_1100b */ | ||
| 98 | "L3 Cache Tag error", /* 1_1101b */ | ||
| 99 | "L3 Cache LRU Parity error", /* 1_1110b */ | ||
| 100 | "Probe Filter error" /* 1_1111b */ | ||
| 101 | }; | 75 | }; |
| 102 | EXPORT_SYMBOL_GPL(ext_msgs); | ||
| 103 | 76 | ||
| 104 | static bool f10h_dc_mce(u16 ec) | 77 | static bool f10h_dc_mce(u16 ec) |
| 105 | { | 78 | { |
| @@ -366,19 +339,97 @@ wrong_ls_mce: | |||
| 366 | pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); | 339 | pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); |
| 367 | } | 340 | } |
| 368 | 341 | ||
| 342 | static bool k8_nb_mce(u16 ec, u8 xec) | ||
| 343 | { | ||
| 344 | bool ret = true; | ||
| 345 | |||
| 346 | switch (xec) { | ||
| 347 | case 0x1: | ||
| 348 | pr_cont("CRC error detected on HT link.\n"); | ||
| 349 | break; | ||
| 350 | |||
| 351 | case 0x5: | ||
| 352 | pr_cont("Invalid GART PTE entry during GART table walk.\n"); | ||
| 353 | break; | ||
| 354 | |||
| 355 | case 0x6: | ||
| 356 | pr_cont("Unsupported atomic RMW received from an IO link.\n"); | ||
| 357 | break; | ||
| 358 | |||
| 359 | case 0x0: | ||
| 360 | case 0x8: | ||
| 361 | pr_cont("DRAM ECC error detected on the NB.\n"); | ||
| 362 | break; | ||
| 363 | |||
| 364 | case 0xd: | ||
| 365 | pr_cont("Parity error on the DRAM addr/ctl signals.\n"); | ||
| 366 | break; | ||
| 367 | |||
| 368 | default: | ||
| 369 | ret = false; | ||
| 370 | break; | ||
| 371 | } | ||
| 372 | |||
| 373 | return ret; | ||
| 374 | } | ||
| 375 | |||
| 376 | static bool f10h_nb_mce(u16 ec, u8 xec) | ||
| 377 | { | ||
| 378 | bool ret = true; | ||
| 379 | u8 offset = 0; | ||
| 380 | |||
| 381 | if (k8_nb_mce(ec, xec)) | ||
| 382 | return true; | ||
| 383 | |||
| 384 | switch(xec) { | ||
| 385 | case 0xa ... 0xc: | ||
| 386 | offset = 10; | ||
| 387 | break; | ||
| 388 | |||
| 389 | case 0xe: | ||
| 390 | offset = 11; | ||
| 391 | break; | ||
| 392 | |||
| 393 | case 0xf: | ||
| 394 | if (TLB_ERROR(ec)) | ||
| 395 | pr_cont("GART Table Walk data error.\n"); | ||
| 396 | else if (BUS_ERROR(ec)) | ||
| 397 | pr_cont("DMA Exclusion Vector Table Walk error.\n"); | ||
| 398 | else | ||
| 399 | ret = false; | ||
| 400 | |||
| 401 | goto out; | ||
| 402 | break; | ||
| 403 | |||
| 404 | case 0x1c ... 0x1f: | ||
| 405 | offset = 24; | ||
| 406 | break; | ||
| 407 | |||
| 408 | default: | ||
| 409 | ret = false; | ||
| 410 | |||
| 411 | goto out; | ||
| 412 | break; | ||
| 413 | } | ||
| 414 | |||
| 415 | pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); | ||
| 416 | |||
| 417 | out: | ||
| 418 | return ret; | ||
| 419 | } | ||
| 420 | |||
| 421 | static bool f14h_nb_mce(u16 ec, u8 xec) | ||
| 422 | { | ||
| 423 | return false; | ||
| 424 | } | ||
| 425 | |||
| 369 | void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) | 426 | void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) |
| 370 | { | 427 | { |
| 371 | u32 ec = m->status & 0xffff; | 428 | u8 xec = (m->status >> 16) & 0x1f; |
| 429 | u16 ec = m->status & 0xffff; | ||
| 372 | u32 nbsh = (u32)(m->status >> 32); | 430 | u32 nbsh = (u32)(m->status >> 32); |
| 373 | u32 nbsl = (u32)m->status; | ||
| 374 | |||
| 375 | /* | ||
| 376 | * GART TLB error reporting is disabled by default. Bail out early. | ||
| 377 | */ | ||
| 378 | if (TLB_ERROR(ec) && !report_gart_errors) | ||
| 379 | return; | ||
| 380 | 431 | ||
| 381 | pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); | 432 | pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); |
| 382 | 433 | ||
| 383 | /* | 434 | /* |
| 384 | * F10h, revD can disable ErrCpu[3:0] so check that first and also the | 435 | * F10h, revD can disable ErrCpu[3:0] so check that first and also the |
| @@ -387,20 +438,50 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) | |||
| 387 | if ((boot_cpu_data.x86 == 0x10) && | 438 | if ((boot_cpu_data.x86 == 0x10) && |
| 388 | (boot_cpu_data.x86_model > 7)) { | 439 | (boot_cpu_data.x86_model > 7)) { |
| 389 | if (nbsh & K8_NBSH_ERR_CPU_VAL) | 440 | if (nbsh & K8_NBSH_ERR_CPU_VAL) |
| 390 | pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); | 441 | pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); |
| 391 | } else { | 442 | } else { |
| 392 | u8 assoc_cpus = nbsh & 0xf; | 443 | u8 assoc_cpus = nbsh & nb_err_cpumask; |
| 393 | 444 | ||
| 394 | if (assoc_cpus > 0) | 445 | if (assoc_cpus > 0) |
| 395 | pr_cont(", core: %d", fls(assoc_cpus) - 1); | 446 | pr_cont(", core: %d", fls(assoc_cpus) - 1); |
| 447 | } | ||
| 396 | 448 | ||
| 397 | pr_cont("\n"); | 449 | switch (xec) { |
| 450 | case 0x2: | ||
| 451 | pr_cont("Sync error (sync packets on HT link detected).\n"); | ||
| 452 | return; | ||
| 453 | |||
| 454 | case 0x3: | ||
| 455 | pr_cont("HT Master abort.\n"); | ||
| 456 | return; | ||
| 457 | |||
| 458 | case 0x4: | ||
| 459 | pr_cont("HT Target abort.\n"); | ||
| 460 | return; | ||
| 461 | |||
| 462 | case 0x7: | ||
| 463 | pr_cont("NB Watchdog timeout.\n"); | ||
| 464 | return; | ||
| 465 | |||
| 466 | case 0x9: | ||
| 467 | pr_cont("SVM DMA Exclusion Vector error.\n"); | ||
| 468 | return; | ||
| 469 | |||
| 470 | default: | ||
| 471 | break; | ||
| 398 | } | 472 | } |
| 399 | 473 | ||
| 400 | pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); | 474 | if (!fam_ops->nb_mce(ec, xec)) |
| 475 | goto wrong_nb_mce; | ||
| 476 | |||
| 477 | if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) | ||
| 478 | if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) | ||
| 479 | nb_bus_decoder(node_id, m, nbcfg); | ||
| 401 | 480 | ||
| 402 | if (BUS_ERROR(ec) && nb_bus_decoder) | 481 | return; |
| 403 | nb_bus_decoder(node_id, m, nbcfg); | 482 | |
| 483 | wrong_nb_mce: | ||
| 484 | pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); | ||
| 404 | } | 485 | } |
| 405 | EXPORT_SYMBOL_GPL(amd_decode_nb_mce); | 486 | EXPORT_SYMBOL_GPL(amd_decode_nb_mce); |
| 406 | 487 | ||
| @@ -430,11 +511,30 @@ static inline void amd_decode_err_code(u16 ec) | |||
| 430 | pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); | 511 | pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); |
| 431 | } | 512 | } |
| 432 | 513 | ||
| 514 | /* | ||
| 515 | * Filter out unwanted MCE signatures here. | ||
| 516 | */ | ||
| 517 | static bool amd_filter_mce(struct mce *m) | ||
| 518 | { | ||
| 519 | u8 xec = (m->status >> 16) & 0x1f; | ||
| 520 | |||
| 521 | /* | ||
| 522 | * NB GART TLB error reporting is disabled by default. | ||
| 523 | */ | ||
| 524 | if (m->bank == 4 && xec == 0x5 && !report_gart_errors) | ||
| 525 | return true; | ||
| 526 | |||
| 527 | return false; | ||
| 528 | } | ||
| 529 | |||
| 433 | int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) | 530 | int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) |
| 434 | { | 531 | { |
| 435 | struct mce *m = (struct mce *)data; | 532 | struct mce *m = (struct mce *)data; |
| 436 | int node, ecc; | 533 | int node, ecc; |
| 437 | 534 | ||
| 535 | if (amd_filter_mce(m)) | ||
| 536 | return NOTIFY_STOP; | ||
| 537 | |||
| 438 | pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); | 538 | pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); |
| 439 | 539 | ||
| 440 | pr_cont("%sorrected error, other errors lost: %s, " | 540 | pr_cont("%sorrected error, other errors lost: %s, " |
| @@ -509,16 +609,20 @@ static int __init mce_amd_init(void) | |||
| 509 | case 0xf: | 609 | case 0xf: |
| 510 | fam_ops->dc_mce = k8_dc_mce; | 610 | fam_ops->dc_mce = k8_dc_mce; |
| 511 | fam_ops->ic_mce = k8_ic_mce; | 611 | fam_ops->ic_mce = k8_ic_mce; |
| 612 | fam_ops->nb_mce = k8_nb_mce; | ||
| 512 | break; | 613 | break; |
| 513 | 614 | ||
| 514 | case 0x10: | 615 | case 0x10: |
| 515 | fam_ops->dc_mce = f10h_dc_mce; | 616 | fam_ops->dc_mce = f10h_dc_mce; |
| 516 | fam_ops->ic_mce = k8_ic_mce; | 617 | fam_ops->ic_mce = k8_ic_mce; |
| 618 | fam_ops->nb_mce = f10h_nb_mce; | ||
| 517 | break; | 619 | break; |
| 518 | 620 | ||
| 519 | case 0x14: | 621 | case 0x14: |
| 622 | nb_err_cpumask = 0x3; | ||
| 520 | fam_ops->dc_mce = f14h_dc_mce; | 623 | fam_ops->dc_mce = f14h_dc_mce; |
| 521 | fam_ops->ic_mce = f14h_ic_mce; | 624 | fam_ops->ic_mce = f14h_ic_mce; |
| 625 | fam_ops->nb_mce = f14h_nb_mce; | ||
| 522 | break; | 626 | break; |
| 523 | 627 | ||
| 524 | default: | 628 | default: |
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index dc81dba9364b..0d0637debbad 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h | |||
| @@ -7,7 +7,6 @@ | |||
| 7 | 7 | ||
| 8 | #define ERROR_CODE(x) ((x) & 0xffff) | 8 | #define ERROR_CODE(x) ((x) & 0xffff) |
| 9 | #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) | 9 | #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) |
| 10 | #define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] | ||
| 11 | 10 | ||
| 12 | #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) | 11 | #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) |
| 13 | #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) | 12 | #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) |
| @@ -83,7 +82,6 @@ extern const char *rrrr_msgs[]; | |||
| 83 | extern const char *pp_msgs[]; | 82 | extern const char *pp_msgs[]; |
| 84 | extern const char *to_msgs[]; | 83 | extern const char *to_msgs[]; |
| 85 | extern const char *ii_msgs[]; | 84 | extern const char *ii_msgs[]; |
| 86 | extern const char *ext_msgs[]; | ||
| 87 | 85 | ||
| 88 | /* | 86 | /* |
| 89 | * relevant NB regs | 87 | * relevant NB regs |
| @@ -102,6 +100,7 @@ struct err_regs { | |||
| 102 | struct amd_decoder_ops { | 100 | struct amd_decoder_ops { |
| 103 | bool (*dc_mce)(u16); | 101 | bool (*dc_mce)(u16); |
| 104 | bool (*ic_mce)(u16); | 102 | bool (*ic_mce)(u16); |
| 103 | bool (*nb_mce)(u16, u8); | ||
| 105 | }; | 104 | }; |
| 106 | 105 | ||
| 107 | void amd_report_gart_errors(bool); | 106 | void amd_report_gart_errors(bool); |
