aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/edac/amd64_edac.c118
-rw-r--r--drivers/edac/edac_mce_amd.c16
2 files changed, 8 insertions, 126 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e8d84f89dbcf..a44e90abb755 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1979,107 +1979,6 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
1979} 1979}
1980 1980
1981/* 1981/*
1982 * Check for valid error in the NB Status High register. If so, proceed to read
1983 * NB Status Low, NB Address Low and NB Address High registers and store data
1984 * into error structure.
1985 *
1986 * Returns:
1987 * - 1: if hardware regs contain valid error info
1988 * - 0: if no valid error is indicated
1989 */
1990static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
1991 struct err_regs *regs)
1992{
1993 struct amd64_pvt *pvt;
1994 struct pci_dev *misc_f3_ctl;
1995
1996 pvt = mci->pvt_info;
1997 misc_f3_ctl = pvt->misc_f3_ctl;
1998
1999 if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSH, &regs->nbsh)) /* nonzero result from the NBSH read: report no valid error */
2000 return 0;
2001
2002 if (!(regs->nbsh & K8_NBSH_VALID_BIT)) /* valid bit clear in NBSH: nothing to report */
2003 return 0;
2004
2005 /* valid bit set in NBSH: read the remaining error information registers (NBSL, NBEAL, NBEAH, NBCFG) into *regs; a nonzero result from any of these reads reports no valid error */
2006 if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSL, &regs->nbsl) ||
2007 amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAL, &regs->nbeal) ||
2008 amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAH, &regs->nbeah) ||
2009 amd64_read_pci_cfg(misc_f3_ctl, K8_NBCFG, &regs->nbcfg))
2010 return 0;
2011
2012 return 1;
2013}
2014
2015/*
2016 * This function is called to retrieve the error data from hardware and store it
2017 * in the info structure.
2018 *
2019 * Returns:
2020 * - 1: if a valid error is found
2021 * - 0: if no error is found
2022 */
2023static int amd64_get_error_info(struct mem_ctl_info *mci,
2024 struct err_regs *info)
2025{
2026 struct amd64_pvt *pvt;
2027 struct err_regs regs;
2028
2029 pvt = mci->pvt_info;
2030
2031 if (!amd64_get_error_info_regs(mci, info)) /* first read of the error registers into *info */
2032 return 0;
2033
2034 /*
2035 * Here's the problem with the K8's EDAC reporting: There are four
2036 * registers which report pieces of error information. They are shared
2037 * between CEs and UEs. Furthermore, contrary to what is stated in the
2038 * BKDG, the overflow bit is never used! Every error always updates the
2039 * reporting registers.
2040 *
2041 * Can you see the race condition? All four error reporting registers
2042 * must be read before a new error updates them! There is no way to read
2043 * all four registers atomically. The best that can be done is to detect
2044 * that a race has occurred and then report the error without any kind of
2045 * precision.
2046 *
2047 * What is still positive is that errors are still reported and thus
2048 * problems can still be detected - just not localized because the
2049 * syndrome and address are spread out across registers.
2050 *
2051 * Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
2052 * UEs and CEs should have separate register sets with proper overflow
2053 * bits that are used! At very least the problem can be fixed by
2054 * honoring the ErrValid bit in 'nbsh' and not updating registers - just
2055 * set the overflow bit - unless the current error is CE and the new
2056 * error is UE which would be the only situation for overwriting the
2057 * current values.
2058 */
2059
2060 regs = *info; /* keep a copy of the first read for the comparison below */
2061
2062 /* Use info from the second read - most current */
2063 if (unlikely(!amd64_get_error_info_regs(mci, info)))
2064 return 0;
2065
2066 /* clear the error bits in hardware */
2067 pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
2068
2069 /*
2069 * Check for the possible race condition: both reads must agree on
2069 * nbsh, nbsl, nbeah and nbeal (nbcfg is not compared).
2069 *
2069 * NOTE(review): on a mismatch this returns 0 after only logging the
2069 * warning, so the caller sees "no error" - the comment above speaks
2069 * of reporting the error without precision; confirm which is intended.
2069 */
2070 if ((regs.nbsh != info->nbsh) ||
2071 (regs.nbsl != info->nbsl) ||
2072 (regs.nbeah != info->nbeah) ||
2073 (regs.nbeal != info->nbeal)) {
2074 amd64_mc_printk(mci, KERN_WARNING,
2075 "hardware STATUS read access race condition "
2076 "detected!\n");
2077 return 0;
2078 }
2079 return 1;
2080}
2081
2082/*
2083 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR 1982 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
2084 * ADDRESS and process. 1983 * ADDRESS and process.
2085 */ 1984 */
@@ -2203,20 +2102,6 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
2203} 2102}
2204 2103
2205/* 2104/*
2206 * The main polling 'check' function, called FROM the edac core to perform the
2207 * error checking and if an error is encountered, error processing.
2208 */
2209static void amd64_check(struct mem_ctl_info *mci)
2210{
2211 struct err_regs regs;
2212
2213 if (amd64_get_error_info(mci, &regs)) { /* nonzero only when a valid error was read and no read race was detected */
2214 struct amd64_pvt *pvt = mci->pvt_info;
2215 amd_decode_nb_mce(pvt->mc_node_id, &regs, 1); /* third argument is handle_errors */
2216 }
2217}
2218
2219/*
2220 * Input: 2105 * Input:
2221 * 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer 2106 * 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
2222 * 2) AMD Family index value 2107 * 2) AMD Family index value
@@ -2756,9 +2641,6 @@ static void amd64_setup_mci_misc_attributes(struct mem_ctl_info *mci)
2756 mci->dev_name = pci_name(pvt->dram_f2_ctl); 2641 mci->dev_name = pci_name(pvt->dram_f2_ctl);
2757 mci->ctl_page_to_phys = NULL; 2642 mci->ctl_page_to_phys = NULL;
2758 2643
2759 /* IMPORTANT: Set the polling 'check' function in this module */
2760 mci->edac_check = amd64_check;
2761
2762 /* memory scrubber interface */ 2644 /* memory scrubber interface */
2763 mci->set_sdram_scrub_rate = amd64_set_scrub_rate; 2645 mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
2764 mci->get_sdram_scrub_rate = amd64_get_scrub_rate; 2646 mci->get_sdram_scrub_rate = amd64_get_scrub_rate;
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 97e64bcdbc06..bae9351e9473 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -133,7 +133,7 @@ static void amd_decode_dc_mce(u64 mc0_status)
133 u32 ec = mc0_status & 0xffff; 133 u32 ec = mc0_status & 0xffff;
134 u32 xec = (mc0_status >> 16) & 0xf; 134 u32 xec = (mc0_status >> 16) & 0xf;
135 135
136 pr_emerg(" Data Cache Error"); 136 pr_emerg("Data Cache Error");
137 137
138 if (xec == 1 && TLB_ERROR(ec)) 138 if (xec == 1 && TLB_ERROR(ec))
139 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); 139 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -176,7 +176,7 @@ static void amd_decode_ic_mce(u64 mc1_status)
176 u32 ec = mc1_status & 0xffff; 176 u32 ec = mc1_status & 0xffff;
177 u32 xec = (mc1_status >> 16) & 0xf; 177 u32 xec = (mc1_status >> 16) & 0xf;
178 178
179 pr_emerg(" Instruction Cache Error"); 179 pr_emerg("Instruction Cache Error");
180 180
181 if (xec == 1 && TLB_ERROR(ec)) 181 if (xec == 1 && TLB_ERROR(ec))
182 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); 182 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -233,7 +233,7 @@ static void amd_decode_bu_mce(u64 mc2_status)
233 u32 ec = mc2_status & 0xffff; 233 u32 ec = mc2_status & 0xffff;
234 u32 xec = (mc2_status >> 16) & 0xf; 234 u32 xec = (mc2_status >> 16) & 0xf;
235 235
236 pr_emerg(" Bus Unit Error"); 236 pr_emerg("Bus Unit Error");
237 237
238 if (xec == 0x1) 238 if (xec == 0x1)
239 pr_cont(" in the write data buffers.\n"); 239 pr_cont(" in the write data buffers.\n");
@@ -275,7 +275,7 @@ static void amd_decode_ls_mce(u64 mc3_status)
275 u32 ec = mc3_status & 0xffff; 275 u32 ec = mc3_status & 0xffff;
276 u32 xec = (mc3_status >> 16) & 0xf; 276 u32 xec = (mc3_status >> 16) & 0xf;
277 277
278 pr_emerg(" Load Store Error"); 278 pr_emerg("Load Store Error");
279 279
280 if (xec == 0x0) { 280 if (xec == 0x0) {
281 u8 rrrr = (ec >> 4) & 0xf; 281 u8 rrrr = (ec >> 4) & 0xf;
@@ -304,7 +304,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
304 if (TLB_ERROR(ec) && !report_gart_errors) 304 if (TLB_ERROR(ec) && !report_gart_errors)
305 return; 305 return;
306 306
307 pr_emerg(" Northbridge Error, node %d", node_id); 307 pr_emerg("Northbridge Error, node %d", node_id);
308 308
309 /* 309 /*
310 * F10h, revD can disable ErrCpu[3:0] so check that first and also the 310 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
@@ -342,13 +342,13 @@ static void amd_decode_fr_mce(u64 mc5_status)
342static inline void amd_decode_err_code(unsigned int ec) 342static inline void amd_decode_err_code(unsigned int ec)
343{ 343{
344 if (TLB_ERROR(ec)) { 344 if (TLB_ERROR(ec)) {
345 pr_emerg(" Transaction: %s, Cache Level %s\n", 345 pr_emerg("Transaction: %s, Cache Level %s\n",
346 TT_MSG(ec), LL_MSG(ec)); 346 TT_MSG(ec), LL_MSG(ec));
347 } else if (MEM_ERROR(ec)) { 347 } else if (MEM_ERROR(ec)) {
348 pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s", 348 pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
349 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); 349 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
350 } else if (BUS_ERROR(ec)) { 350 } else if (BUS_ERROR(ec)) {
351 pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, " 351 pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
352 "Participating Processor: %s\n", 352 "Participating Processor: %s\n",
353 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), 353 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
354 PP_MSG(ec)); 354 PP_MSG(ec));