-rw-r--r--   drivers/edac/amd64_edac.c   | 118
-rw-r--r--   drivers/edac/edac_mce_amd.c |  16
2 files changed, 8 insertions, 126 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e8d84f89dbcf..a44e90abb755 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1979,107 +1979,6 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
 }
 
 /*
- * Check for valid error in the NB Status High register. If so, proceed to read
- * NB Status Low, NB Address Low and NB Address High registers and store data
- * into error structure.
- *
- * Returns:
- *	- 1: if hardware regs contains valid error info
- *	- 0: if no valid error is indicated
- */
-static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
-				     struct err_regs *regs)
-{
-	struct amd64_pvt *pvt;
-	struct pci_dev *misc_f3_ctl;
-
-	pvt = mci->pvt_info;
-	misc_f3_ctl = pvt->misc_f3_ctl;
-
-	if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSH, &regs->nbsh))
-		return 0;
-
-	if (!(regs->nbsh & K8_NBSH_VALID_BIT))
-		return 0;
-
-	/* valid error, read remaining error information registers */
-	if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSL, &regs->nbsl) ||
-	    amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAL, &regs->nbeal) ||
-	    amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAH, &regs->nbeah) ||
-	    amd64_read_pci_cfg(misc_f3_ctl, K8_NBCFG, &regs->nbcfg))
-		return 0;
-
-	return 1;
-}
-
-/*
- * This function is called to retrieve the error data from hardware and store it
- * in the info structure.
- *
- * Returns:
- *	- 1: if a valid error is found
- *	- 0: if no error is found
- */
-static int amd64_get_error_info(struct mem_ctl_info *mci,
-				struct err_regs *info)
-{
-	struct amd64_pvt *pvt;
-	struct err_regs regs;
-
-	pvt = mci->pvt_info;
-
-	if (!amd64_get_error_info_regs(mci, info))
-		return 0;
-
-	/*
-	 * Here's the problem with the K8's EDAC reporting: There are four
-	 * registers which report pieces of error information. They are shared
-	 * between CEs and UEs. Furthermore, contrary to what is stated in the
-	 * BKDG, the overflow bit is never used! Every error always updates the
-	 * reporting registers.
-	 *
-	 * Can you see the race condition? All four error reporting registers
-	 * must be read before a new error updates them! There is no way to read
-	 * all four registers atomically. The best that can be done is to detect
-	 * that a race has occurred and then report the error without any kind of
-	 * precision.
-	 *
-	 * What is still positive is that errors are still reported and thus
-	 * problems can still be detected - just not localized because the
-	 * syndrome and address are spread out across registers.
-	 *
-	 * Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
-	 * UEs and CEs should have separate register sets with proper overflow
-	 * bits that are used! At the very least the problem can be fixed by
-	 * honoring the ErrValid bit in 'nbsh' and not updating registers - just
-	 * set the overflow bit - unless the current error is CE and the new
-	 * error is UE which would be the only situation for overwriting the
-	 * current values.
-	 */
-
-	regs = *info;
-
-	/* Use info from the second read - most current */
-	if (unlikely(!amd64_get_error_info_regs(mci, info)))
-		return 0;
-
-	/* clear the error bits in hardware */
-	pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
-
-	/* Check for the possible race condition */
-	if ((regs.nbsh != info->nbsh) ||
-	    (regs.nbsl != info->nbsl) ||
-	    (regs.nbeah != info->nbeah) ||
-	    (regs.nbeal != info->nbeal)) {
-		amd64_mc_printk(mci, KERN_WARNING,
-				"hardware STATUS read access race condition "
-				"detected!\n");
-		return 0;
-	}
-	return 1;
-}
-
-/*
  * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
  * ADDRESS and process.
  */
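The long comment removed in the hunk above describes why the old polling path read the NB error registers twice: the four registers cannot be sampled atomically, so a second read that disagrees with the first is treated as a race and the error is dropped rather than misreported. As a standalone illustration of that double-read pattern (an editor's sketch, not part of this patch; struct nb_regs and read_hw_regs() are placeholder names, and the actual PCI config accesses and warning printout are omitted):

#include <stdbool.h>
#include <string.h>

struct nb_regs { unsigned int nbsh, nbsl, nbeah, nbeal; };

/* Placeholder for the four amd64_read_pci_cfg() reads of NBSH, NBSL,
 * NBEAH and NBEAL; returns false when no valid error is latched. */
static bool read_hw_regs(struct nb_regs *r)
{
        memset(r, 0, sizeof(*r));
        return false;
}

/* Succeeds only if two consecutive snapshots agree, i.e. no new error
 * overwrote the shared registers between the two passes. */
static bool get_error_info(struct nb_regs *out)
{
        struct nb_regs first, second;

        if (!read_hw_regs(&first))
                return false;
        if (!read_hw_regs(&second))     /* keep the most recent values */
                return false;

        if (memcmp(&first, &second, sizeof(first)) != 0)
                return false;           /* race detected: report nothing */

        *out = second;
        return true;
}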
@@ -2203,20 +2102,6 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
 }
 
 /*
- * The main polling 'check' function, called FROM the edac core to perform the
- * error checking and if an error is encountered, error processing.
- */
-static void amd64_check(struct mem_ctl_info *mci)
-{
-	struct err_regs regs;
-
-	if (amd64_get_error_info(mci, &regs)) {
-		struct amd64_pvt *pvt = mci->pvt_info;
-		amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
-	}
-}
-
-/*
  * Input:
  * 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
  * 2) AMD Family index value
@@ -2756,9 +2641,6 @@ static void amd64_setup_mci_misc_attributes(struct mem_ctl_info *mci)
 	mci->dev_name = pci_name(pvt->dram_f2_ctl);
 	mci->ctl_page_to_phys = NULL;
 
-	/* IMPORTANT: Set the polling 'check' function in this module */
-	mci->edac_check = amd64_check;
-
 	/* memory scrubber interface */
 	mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
 	mci->get_sdram_scrub_rate = amd64_get_scrub_rate;
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 97e64bcdbc06..bae9351e9473 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -133,7 +133,7 @@ static void amd_decode_dc_mce(u64 mc0_status)
 	u32 ec = mc0_status & 0xffff;
 	u32 xec = (mc0_status >> 16) & 0xf;
 
-	pr_emerg(" Data Cache Error");
+	pr_emerg("Data Cache Error");
 
 	if (xec == 1 && TLB_ERROR(ec))
 		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -176,7 +176,7 @@ static void amd_decode_ic_mce(u64 mc1_status)
 	u32 ec = mc1_status & 0xffff;
 	u32 xec = (mc1_status >> 16) & 0xf;
 
-	pr_emerg(" Instruction Cache Error");
+	pr_emerg("Instruction Cache Error");
 
 	if (xec == 1 && TLB_ERROR(ec))
 		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -233,7 +233,7 @@ static void amd_decode_bu_mce(u64 mc2_status)
 	u32 ec = mc2_status & 0xffff;
 	u32 xec = (mc2_status >> 16) & 0xf;
 
-	pr_emerg(" Bus Unit Error");
+	pr_emerg("Bus Unit Error");
 
 	if (xec == 0x1)
 		pr_cont(" in the write data buffers.\n");
@@ -275,7 +275,7 @@ static void amd_decode_ls_mce(u64 mc3_status)
 	u32 ec = mc3_status & 0xffff;
 	u32 xec = (mc3_status >> 16) & 0xf;
 
-	pr_emerg(" Load Store Error");
+	pr_emerg("Load Store Error");
 
 	if (xec == 0x0) {
 		u8 rrrr = (ec >> 4) & 0xf;
@@ -304,7 +304,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
 	if (TLB_ERROR(ec) && !report_gart_errors)
 		return;
 
-	pr_emerg(" Northbridge Error, node %d", node_id);
+	pr_emerg("Northbridge Error, node %d", node_id);
 
 	/*
 	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
@@ -342,13 +342,13 @@ static void amd_decode_fr_mce(u64 mc5_status)
 static inline void amd_decode_err_code(unsigned int ec)
 {
 	if (TLB_ERROR(ec)) {
-		pr_emerg(" Transaction: %s, Cache Level %s\n",
+		pr_emerg("Transaction: %s, Cache Level %s\n",
 			 TT_MSG(ec), LL_MSG(ec));
 	} else if (MEM_ERROR(ec)) {
-		pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
+		pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
 			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
 	} else if (BUS_ERROR(ec)) {
-		pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
+		pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
 			"Participating Processor: %s\n",
 			RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
 			PP_MSG(ec));
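A side note on the string changes above: each decoder opens its report with pr_emerg() and then appends fragments with pr_cont(), so the whole message ends up on a single KERN_EMERG console line; the patch only drops the leading space from the opening fragment. A minimal sketch of that pattern (editor's illustration, not part of the patch; example_decode() and its arguments are made-up names):

#include <linux/printk.h>
#include <linux/types.h>

/* Hypothetical decoder showing the pr_emerg()/pr_cont() split used above:
 * pr_emerg() starts a new KERN_EMERG record and pr_cont() continues it,
 * so all fragments land on one line. */
static void example_decode(int node_id, bool is_tlb)
{
        pr_emerg("Northbridge Error, node %d", node_id);

        if (is_tlb)
                pr_cont(": TLB error.\n");
        else
                pr_cont(".\n");
}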
