From 5ce88f6ea6bef929f59f9468413f922c9a486fa4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 Aug 2010 18:28:08 +0200 Subject: EDAC, MCE: Complete NB MCE decoders Add support for decoding F14h BU MCEs and improve decoding of the remaining families. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 210 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 157 insertions(+), 53 deletions(-) (limited to 'drivers/edac/mce_amd.c') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 3c161672a84b..d8d1c9de1ed6 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -5,6 +5,8 @@ static struct amd_decoder_ops *fam_ops; +static u8 nb_err_cpumask = 0xf; + static bool report_gart_errors; static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); @@ -61,45 +63,16 @@ EXPORT_SYMBOL_GPL(to_msgs); const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); -/* - * Map the 4 or 5 (family-specific) bits of Extended Error code to the - * string table. - */ -const char *ext_msgs[] = { - "K8 ECC error", /* 0_0000b */ - "CRC error on link", /* 0_0001b */ - "Sync error packets on link", /* 0_0010b */ - "Master Abort during link operation", /* 0_0011b */ - "Target Abort during link operation", /* 0_0100b */ - "Invalid GART PTE entry during table walk", /* 0_0101b */ - "Unsupported atomic RMW command received", /* 0_0110b */ - "WDT error: NB transaction timeout", /* 0_0111b */ - "ECC/ChipKill ECC error", /* 0_1000b */ - "SVM DEV Error", /* 0_1001b */ - "Link Data error", /* 0_1010b */ - "Link/L3/Probe Filter Protocol error", /* 0_1011b */ - "NB Internal Arrays Parity error", /* 0_1100b */ - "DRAM Address/Control Parity error", /* 0_1101b */ - "Link Transmission error", /* 0_1110b */ - "GART/DEV Table Walk Data error" /* 0_1111b */ - "Res 0x100 error", /* 1_0000b */ - "Res 0x101 error", /* 1_0001b */ - "Res 0x102 error", /* 1_0010b */ - "Res 0x103 error", /* 1_0011b */ - "Res 0x104 error", /* 1_0100b */ - "Res 0x105 error", /* 1_0101b */ - "Res 0x106 error", /* 1_0110b */ - "Res 0x107 error", /* 1_0111b */ - "Res 0x108 error", /* 1_1000b */ - "Res 0x109 error", /* 1_1001b */ - "Res 0x10A error", /* 1_1010b */ - "Res 0x10B error", /* 1_1011b */ - "ECC error in L3 Cache Data", /* 1_1100b */ - "L3 Cache Tag error", /* 1_1101b */ - "L3 Cache LRU Parity error", /* 1_1110b */ - "Probe Filter error" /* 1_1111b */ +static const char *f10h_nb_mce_desc[] = { + "HT link data error", + "Protocol error (link, L3, probe filter, etc.)", + "Parity error in NB-internal arrays", + "Link Retry due to IO link transmission error", + "L3 ECC data cache error", + "ECC error in L3 cache tag", + "L3 LRU parity bits error", + "ECC Error in the Probe Filter directory" }; -EXPORT_SYMBOL_GPL(ext_msgs); static bool f10h_dc_mce(u16 ec) { @@ -366,19 +339,97 @@ wrong_ls_mce: pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); } +static bool k8_nb_mce(u16 ec, u8 xec) +{ + bool ret = true; + + switch (xec) { + case 0x1: + pr_cont("CRC error detected on HT link.\n"); + break; + + case 0x5: + pr_cont("Invalid GART PTE entry during GART table walk.\n"); + break; + + case 0x6: + pr_cont("Unsupported atomic RMW received from an IO link.\n"); + break; + + case 0x0: + case 0x8: + pr_cont("DRAM ECC error detected on the NB.\n"); + break; + + case 0xd: + pr_cont("Parity error on the DRAM addr/ctl signals.\n"); + break; + + default: + ret = false; + break; + } + + return ret; +} + +static bool f10h_nb_mce(u16 ec, u8 xec) +{ + bool ret = true; + u8 offset = 0; + + if (k8_nb_mce(ec, xec)) + return true; + + switch(xec) { + case 0xa ... 0xc: + offset = 10; + break; + + case 0xe: + offset = 11; + break; + + case 0xf: + if (TLB_ERROR(ec)) + pr_cont("GART Table Walk data error.\n"); + else if (BUS_ERROR(ec)) + pr_cont("DMA Exclusion Vector Table Walk error.\n"); + else + ret = false; + + goto out; + break; + + case 0x1c ... 0x1f: + offset = 24; + break; + + default: + ret = false; + + goto out; + break; + } + + pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); + +out: + return ret; +} + +static bool f14h_nb_mce(u16 ec, u8 xec) +{ + return false; +} + void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) { - u32 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0x1f; + u16 ec = m->status & 0xffff; u32 nbsh = (u32)(m->status >> 32); - u32 nbsl = (u32)m->status; - - /* - * GART TLB error reporting is disabled by default. Bail out early. - */ - if (TLB_ERROR(ec) && !report_gart_errors) - return; - pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); + pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); /* * F10h, revD can disable ErrCpu[3:0] so check that first and also the @@ -387,20 +438,50 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model > 7)) { if (nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); + pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); } else { - u8 assoc_cpus = nbsh & 0xf; + u8 assoc_cpus = nbsh & nb_err_cpumask; if (assoc_cpus > 0) pr_cont(", core: %d", fls(assoc_cpus) - 1); + } - pr_cont("\n"); + switch (xec) { + case 0x2: + pr_cont("Sync error (sync packets on HT link detected).\n"); + return; + + case 0x3: + pr_cont("HT Master abort.\n"); + return; + + case 0x4: + pr_cont("HT Target abort.\n"); + return; + + case 0x7: + pr_cont("NB Watchdog timeout.\n"); + return; + + case 0x9: + pr_cont("SVM DMA Exclusion Vector error.\n"); + return; + + default: + break; } - pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); + if (!fam_ops->nb_mce(ec, xec)) + goto wrong_nb_mce; + + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) + nb_bus_decoder(node_id, m, nbcfg); - if (BUS_ERROR(ec) && nb_bus_decoder) - nb_bus_decoder(node_id, m, nbcfg); + return; + +wrong_nb_mce: + pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); } EXPORT_SYMBOL_GPL(amd_decode_nb_mce); @@ -430,11 +511,30 @@ static inline void amd_decode_err_code(u16 ec) pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); } +/* + * Filter out unwanted MCE signatures here. + */ +static bool amd_filter_mce(struct mce *m) +{ + u8 xec = (m->status >> 16) & 0x1f; + + /* + * NB GART TLB error reporting is disabled by default. + */ + if (m->bank == 4 && xec == 0x5 && !report_gart_errors) + return true; + + return false; +} + int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; int node, ecc; + if (amd_filter_mce(m)) + return NOTIFY_STOP; + pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); pr_cont("%sorrected error, other errors lost: %s, " @@ -509,16 +609,20 @@ static int __init mce_amd_init(void) case 0xf: fam_ops->dc_mce = k8_dc_mce; fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = k8_nb_mce; break; case 0x10: fam_ops->dc_mce = f10h_dc_mce; fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = f10h_nb_mce; break; case 0x14: + nb_err_cpumask = 0x3; fam_ops->dc_mce = f14h_dc_mce; fam_ops->ic_mce = f14h_ic_mce; + fam_ops->nb_mce = f14h_nb_mce; break; default: -- cgit v1.2.2