amd64_edac: Remove polling mechanism

Switch to reusing the mcheck core's machine check polling mechanism
instead of duplicating functionality by using the EDAC polling routine.
Correct formatting while at it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Doug Thompson <dougthompson@xmission.com>
author: Borislav Petkov <borislav.petkov@amd.com> 2010-05-15 07:51:57 -0400
committer: Borislav Petkov <borislav.petkov@amd.com> 2010-08-03 10:14:03 -0400
commit: f4347553b30ec66530bfe63c84530afea3803396 (patch)
tree: 420649ea83f870ba097d8066ef18fd0259e79e33 /drivers/edac
parent: 98a5ae2d99b78d29d2d31283cd8b481a44f41fd3 (diff)
2 files changed, 8 insertions, 126 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e8d84f89dbcf..a44e90abb755 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1979,107 +1979,6 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
 }
 /*
- * Check for valid error in the NB Status High register. If so, proceed to read
- * NB Status Low, NB Address Low and NB Address High registers and store data
- * into error structure.
- *
- * Returns:
- *      - 1: if hardware regs contains valid error info
- *      - 0: if no valid error is indicated
- */
-static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
-                                     struct err_regs *regs)
-{
-        struct amd64_pvt *pvt;
-        struct pci_dev *misc_f3_ctl;
-        pvt = mci->pvt_info;
-        misc_f3_ctl = pvt->misc_f3_ctl;
-        if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSH, &regs->nbsh))
-                return 0;
-        if (!(regs->nbsh & K8_NBSH_VALID_BIT))
-                return 0;
-        /* valid error, read remaining error information registers */
-        if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSL, &regs->nbsl) ||
-            amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAL, &regs->nbeal) ||
-            amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAH, &regs->nbeah) ||
-            amd64_read_pci_cfg(misc_f3_ctl, K8_NBCFG, &regs->nbcfg))
-                return 0;
-        return 1;
-}
-/*
- * This function is called to retrieve the error data from hardware and store it
- * in the info structure.
- *
- * Returns:
- *      - 1: if a valid error is found
- *      - 0: if no error is found
- */
-static int amd64_get_error_info(struct mem_ctl_info *mci,
-                                struct err_regs *info)
-{
-        struct amd64_pvt *pvt;
-        struct err_regs regs;
-        pvt = mci->pvt_info;
-        if (!amd64_get_error_info_regs(mci, info))
-                return 0;
-        /*
-         * Here's the problem with the K8's EDAC reporting: There are four
-         * registers which report pieces of error information. They are shared
-         * between CEs and UEs. Furthermore, contrary to what is stated in the
-         * BKDG, the overflow bit is never used! Every error always updates the
-         * reporting registers.
-         *
-         * Can you see the race condition? All four error reporting registers
-         * must be read before a new error updates them! There is no way to read
-         * all four registers atomically. The best than can be done is to detect
-         * that a race has occured and then report the error without any kind of
-         * precision.
-         *
-         * What is still positive is that errors are still reported and thus
-         * problems can still be detected - just not localized because the
-         * syndrome and address are spread out across registers.
-         *
-         * Grrrrr!!!!!  Here's hoping that AMD fixes this in some future K8 rev.
-         * UEs and CEs should have separate register sets with proper overflow
-         * bits that are used! At very least the problem can be fixed by
-         * honoring the ErrValid bit in 'nbsh' and not updating registers - just
-         * set the overflow bit - unless the current error is CE and the new
-         * error is UE which would be the only situation for overwriting the
-         * current values.
-         */
-        regs = *info;
-        /* Use info from the second read - most current */
-        if (unlikely(!amd64_get_error_info_regs(mci, info)))
-                return 0;
-        /* clear the error bits in hardware */
-        pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
-        /* Check for the possible race condition */
-        if ((regs.nbsh != info->nbsh) ||
-             (regs.nbsl != info->nbsl) ||
-             (regs.nbeah != info->nbeah) ||
-             (regs.nbeal != info->nbeal)) {
-                amd64_mc_printk(mci, KERN_WARNING,
-                                "hardware STATUS read access race condition "
-                                "detected!\n");
-                return 0;
-        }
-        return 1;
-}
-/*
 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
 * ADDRESS and process.
 */
@@ -2203,20 +2102,6 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
 }
 /*
- * The main polling 'check' function, called FROM the edac core to perform the
- * error checking and if an error is encountered, error processing.
- */
-static void amd64_check(struct mem_ctl_info *mci)
-{
-        struct err_regs regs;
-        if (amd64_get_error_info(mci, &regs)) {
-                struct amd64_pvt *pvt = mci->pvt_info;
-                amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
-        }
-}
-/*
 * Input:
 *      1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
 *      2) AMD Family index value
@@ -2756,9 +2641,6 @@ static void amd64_setup_mci_misc_attributes(struct mem_ctl_info *mci)
        mci->dev_name           = pci_name(pvt->dram_f2_ctl);
        mci->ctl_page_to_phys   = NULL;
-        /* IMPORTANT: Set the polling 'check' function in this module */
-        mci->edac_check         = amd64_check;
        /* memory scrubber interface */
        mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
        mci->get_sdram_scrub_rate = amd64_get_scrub_rate;
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 97e64bcdbc06..bae9351e9473 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -133,7 +133,7 @@ static void amd_decode_dc_mce(u64 mc0_status)
        u32 ec  = mc0_status & 0xffff;
        u32 xec = (mc0_status >> 16) & 0xf;
-        pr_emerg(" Data Cache Error");
+        pr_emerg("Data Cache Error");
        if (xec == 1 && TLB_ERROR(ec))
                pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -176,7 +176,7 @@ static void amd_decode_ic_mce(u64 mc1_status)
        u32 ec  = mc1_status & 0xffff;
        u32 xec = (mc1_status >> 16) & 0xf;
-        pr_emerg(" Instruction Cache Error");
+        pr_emerg("Instruction Cache Error");
        if (xec == 1 && TLB_ERROR(ec))
                pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -233,7 +233,7 @@ static void amd_decode_bu_mce(u64 mc2_status)
        u32 ec = mc2_status & 0xffff;
        u32 xec = (mc2_status >> 16) & 0xf;
-        pr_emerg(" Bus Unit Error");
+        pr_emerg("Bus Unit Error");
        if (xec == 0x1)
                pr_cont(" in the write data buffers.\n");
@@ -275,7 +275,7 @@ static void amd_decode_ls_mce(u64 mc3_status)
        u32 ec  = mc3_status & 0xffff;
        u32 xec = (mc3_status >> 16) & 0xf;
-        pr_emerg(" Load Store Error");
+        pr_emerg("Load Store Error");
        if (xec == 0x0) {
                u8 rrrr = (ec >> 4) & 0xf;
@@ -304,7 +304,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
        if (TLB_ERROR(ec) && !report_gart_errors)
                return;
-        pr_emerg(" Northbridge Error, node %d", node_id);
+        pr_emerg("Northbridge Error, node %d", node_id);
        /*
         * F10h, revD can disable ErrCpu[3:0] so check that first and also the
@@ -342,13 +342,13 @@ static void amd_decode_fr_mce(u64 mc5_status)
 static inline void amd_decode_err_code(unsigned int ec)
 {
        if (TLB_ERROR(ec)) {
-                pr_emerg(" Transaction: %s, Cache Level %s\n",
+                pr_emerg("Transaction: %s, Cache Level %s\n",
                         TT_MSG(ec), LL_MSG(ec));
        } else if (MEM_ERROR(ec)) {
-                pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
+                pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
                         RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
        } else if (BUS_ERROR(ec)) {
-                pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
+                pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
                         "Participating Processor: %s\n",
                          RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
                          PP_MSG(ec));
author	Borislav Petkov <borislav.petkov@amd.com>	2010-05-15 07:51:57 -0400
committer	Borislav Petkov <borislav.petkov@amd.com>	2010-08-03 10:14:03 -0400
commit	f4347553b30ec66530bfe63c84530afea3803396 (patch)
tree	420649ea83f870ba097d8066ef18fd0259e79e33 /drivers/edac
parent	98a5ae2d99b78d29d2d31283cd8b481a44f41fd3 (diff)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e8d84f89dbcf..a44e90abb755 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1979,107 +1979,6 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
1979	}	1979	}
1980		1980
1981	/*	1981	/*
1982	* Check for valid error in the NB Status High register. If so, proceed to read
1983	* NB Status Low, NB Address Low and NB Address High registers and store data
1984	* into error structure.
1985	*
1986	* Returns:
1987	* - 1: if hardware regs contains valid error info
1988	* - 0: if no valid error is indicated
1989	*/
1990	static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
1991	struct err_regs *regs)
1992	{
1993	struct amd64_pvt *pvt;
1994	struct pci_dev *misc_f3_ctl;
1995
1996	pvt = mci->pvt_info;
1997	misc_f3_ctl = pvt->misc_f3_ctl;
1998
1999	if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSH, &regs->nbsh))
2000	return 0;
2001
2002	if (!(regs->nbsh & K8_NBSH_VALID_BIT))
2003	return 0;
2004
2005	/* valid error, read remaining error information registers */
2006	if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSL, &regs->nbsl) ||
2007	amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAL, &regs->nbeal) ||
2008	amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAH, &regs->nbeah) ||
2009	amd64_read_pci_cfg(misc_f3_ctl, K8_NBCFG, &regs->nbcfg))
2010	return 0;
2011
2012	return 1;
2013	}
2014
2015	/*
2016	* This function is called to retrieve the error data from hardware and store it
2017	* in the info structure.
2018	*
2019	* Returns:
2020	* - 1: if a valid error is found
2021	* - 0: if no error is found
2022	*/
2023	static int amd64_get_error_info(struct mem_ctl_info *mci,
2024	struct err_regs *info)
2025	{
2026	struct amd64_pvt *pvt;
2027	struct err_regs regs;
2028
2029	pvt = mci->pvt_info;
2030
2031	if (!amd64_get_error_info_regs(mci, info))
2032	return 0;
2033
2034	/*
2035	* Here's the problem with the K8's EDAC reporting: There are four
2036	* registers which report pieces of error information. They are shared
2037	* between CEs and UEs. Furthermore, contrary to what is stated in the
2038	* BKDG, the overflow bit is never used! Every error always updates the
2039	* reporting registers.
2040	*
2041	* Can you see the race condition? All four error reporting registers
2042	* must be read before a new error updates them! There is no way to read
2043	* all four registers atomically. The best than can be done is to detect
2044	* that a race has occured and then report the error without any kind of
2045	* precision.
2046	*
2047	* What is still positive is that errors are still reported and thus
2048	* problems can still be detected - just not localized because the
2049	* syndrome and address are spread out across registers.
2050	*
2051	* Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
2052	* UEs and CEs should have separate register sets with proper overflow
2053	* bits that are used! At very least the problem can be fixed by
2054	* honoring the ErrValid bit in 'nbsh' and not updating registers - just
2055	* set the overflow bit - unless the current error is CE and the new
2056	* error is UE which would be the only situation for overwriting the
2057	* current values.
2058	*/
2059
2060	regs = *info;
2061
2062	/* Use info from the second read - most current */
2063	if (unlikely(!amd64_get_error_info_regs(mci, info)))
2064	return 0;
2065
2066	/* clear the error bits in hardware */
2067	pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
2068
2069	/* Check for the possible race condition */
2070	if ((regs.nbsh != info->nbsh) ||
2071	(regs.nbsl != info->nbsl) ||
2072	(regs.nbeah != info->nbeah) ||
2073	(regs.nbeal != info->nbeal)) {
2074	amd64_mc_printk(mci, KERN_WARNING,
2075	"hardware STATUS read access race condition "
2076	"detected!\n");
2077	return 0;
2078	}
2079	return 1;
2080	}
2081
2082	/*
2083	* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR	1982	* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
2084	* ADDRESS and process.	1983	* ADDRESS and process.
2085	*/	1984	*/
@@ -2203,20 +2102,6 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
2203	}	2102	}
2204		2103
2205	/*	2104	/*
2206	* The main polling 'check' function, called FROM the edac core to perform the
2207	* error checking and if an error is encountered, error processing.
2208	*/
2209	static void amd64_check(struct mem_ctl_info *mci)
2210	{
2211	struct err_regs regs;
2212
2213	if (amd64_get_error_info(mci, &regs)) {
2214	struct amd64_pvt *pvt = mci->pvt_info;
2215	amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
2216	}
2217	}
2218
2219	/*
2220	* Input:	2105	* Input:
2221	* 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer	2106	* 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
2222	* 2) AMD Family index value	2107	* 2) AMD Family index value
@@ -2756,9 +2641,6 @@ static void amd64_setup_mci_misc_attributes(struct mem_ctl_info *mci)
2756	mci->dev_name = pci_name(pvt->dram_f2_ctl);	2641	mci->dev_name = pci_name(pvt->dram_f2_ctl);
2757	mci->ctl_page_to_phys = NULL;	2642	mci->ctl_page_to_phys = NULL;
2758		2643
2759	/* IMPORTANT: Set the polling 'check' function in this module */
2760	mci->edac_check = amd64_check;
2761
2762	/* memory scrubber interface */	2644	/* memory scrubber interface */
2763	mci->set_sdram_scrub_rate = amd64_set_scrub_rate;	2645	mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
2764	mci->get_sdram_scrub_rate = amd64_get_scrub_rate;	2646	mci->get_sdram_scrub_rate = amd64_get_scrub_rate;


diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 97e64bcdbc06..bae9351e9473 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -133,7 +133,7 @@ static void amd_decode_dc_mce(u64 mc0_status)
133	u32 ec = mc0_status & 0xffff;	133	u32 ec = mc0_status & 0xffff;
134	u32 xec = (mc0_status >> 16) & 0xf;	134	u32 xec = (mc0_status >> 16) & 0xf;
135		135
136	pr_emerg(" Data Cache Error");	136	pr_emerg("Data Cache Error");
137		137
138	if (xec == 1 && TLB_ERROR(ec))	138	if (xec == 1 && TLB_ERROR(ec))
139	pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));	139	pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -176,7 +176,7 @@ static void amd_decode_ic_mce(u64 mc1_status)
176	u32 ec = mc1_status & 0xffff;	176	u32 ec = mc1_status & 0xffff;
177	u32 xec = (mc1_status >> 16) & 0xf;	177	u32 xec = (mc1_status >> 16) & 0xf;
178		178
179	pr_emerg(" Instruction Cache Error");	179	pr_emerg("Instruction Cache Error");
180		180
181	if (xec == 1 && TLB_ERROR(ec))	181	if (xec == 1 && TLB_ERROR(ec))
182	pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));	182	pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -233,7 +233,7 @@ static void amd_decode_bu_mce(u64 mc2_status)
233	u32 ec = mc2_status & 0xffff;	233	u32 ec = mc2_status & 0xffff;
234	u32 xec = (mc2_status >> 16) & 0xf;	234	u32 xec = (mc2_status >> 16) & 0xf;
235		235
236	pr_emerg(" Bus Unit Error");	236	pr_emerg("Bus Unit Error");
237		237
238	if (xec == 0x1)	238	if (xec == 0x1)
239	pr_cont(" in the write data buffers.\n");	239	pr_cont(" in the write data buffers.\n");
@@ -275,7 +275,7 @@ static void amd_decode_ls_mce(u64 mc3_status)
275	u32 ec = mc3_status & 0xffff;	275	u32 ec = mc3_status & 0xffff;
276	u32 xec = (mc3_status >> 16) & 0xf;	276	u32 xec = (mc3_status >> 16) & 0xf;
277		277
278	pr_emerg(" Load Store Error");	278	pr_emerg("Load Store Error");
279		279
280	if (xec == 0x0) {	280	if (xec == 0x0) {
281	u8 rrrr = (ec >> 4) & 0xf;	281	u8 rrrr = (ec >> 4) & 0xf;
@@ -304,7 +304,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
304	if (TLB_ERROR(ec) && !report_gart_errors)	304	if (TLB_ERROR(ec) && !report_gart_errors)
305	return;	305	return;
306		306
307	pr_emerg(" Northbridge Error, node %d", node_id);	307	pr_emerg("Northbridge Error, node %d", node_id);
308		308
309	/*	309	/*
310	* F10h, revD can disable ErrCpu[3:0] so check that first and also the	310	* F10h, revD can disable ErrCpu[3:0] so check that first and also the
@@ -342,13 +342,13 @@ static void amd_decode_fr_mce(u64 mc5_status)
342	static inline void amd_decode_err_code(unsigned int ec)	342	static inline void amd_decode_err_code(unsigned int ec)
343	{	343	{
344	if (TLB_ERROR(ec)) {	344	if (TLB_ERROR(ec)) {
345	pr_emerg(" Transaction: %s, Cache Level %s\n",	345	pr_emerg("Transaction: %s, Cache Level %s\n",
346	TT_MSG(ec), LL_MSG(ec));	346	TT_MSG(ec), LL_MSG(ec));
347	} else if (MEM_ERROR(ec)) {	347	} else if (MEM_ERROR(ec)) {
348	pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",	348	pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
349	RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));	349	RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
350	} else if (BUS_ERROR(ec)) {	350	} else if (BUS_ERROR(ec)) {
351	pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "	351	pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
352	"Participating Processor: %s\n",	352	"Participating Processor: %s\n",
353	RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),	353	RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
354	PP_MSG(ec));	354	PP_MSG(ec));