amd64_edac: add error decoding logic

Borislav: - fold amd64_error_info_valid() into its only user - fix/cleanup comments - fix function return value patterns - cleanup debug calls Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com> Signed-off-by: Doug Thompson <dougthompson@xmission.com> Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
author: Doug Thompson <dougthompson@xmission.com> 2009-05-06 11:55:27 -0400
committer: Borislav Petkov <borislav.petkov@amd.com> 2009-06-10 06:18:59 -0400
commit: d27bf6fa369ca0272df10558d2f290d6fc72e675 (patch)
tree: b43a34237e44dd567a34b3a3d2fd233905baf566 /drivers/edac/amd64_edac.c
parent: b1289d6f9d23abab396077abb65d5a23a775cdb0 (diff)
1 files changed, 425 insertions, 0 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index feb4986ea76d..09991c8a6ee3 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2031,3 +2031,428 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
        debugf0("syndrome(%x) not found\n", syndrome);
        return -1;
 }
+/*
+ * Read the northbridge error-reporting registers from the misc (F3) control
+ * PCI device into @regs.
+ *
+ * NB Status High is read first. Only when its valid bit is set are NB Status
+ * Low, NB Address Low, NB Address High and NB Config read as well, in that
+ * order. If any config-space read fails, a debug message is emitted and the
+ * call is treated as "no error"; @regs may then be only partially filled in.
+ *
+ * Returns:
+ *      - 1: if hardware regs contains valid error info
+ *      - 0: if no valid error is indicated (or a register read failed)
+ */
+static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
+                                     struct amd64_error_info_regs *regs)
+{
+        struct amd64_pvt *pvt = mci->pvt_info;
+        struct pci_dev *f3 = pvt->misc_f3_ctl;
+
+        if (pci_read_config_dword(f3, K8_NBSH, &regs->nbsh))
+                goto read_failed;
+
+        /* Nothing latched: skip the remaining registers entirely */
+        if (!(regs->nbsh & K8_NBSH_VALID_BIT))
+                return 0;
+
+        /*
+         * Valid error: fetch the rest. The short-circuit keeps the original
+         * read order and stops at the first failing read.
+         */
+        if (pci_read_config_dword(f3, K8_NBSL, &regs->nbsl) ||
+            pci_read_config_dword(f3, K8_NBEAL, &regs->nbeal) ||
+            pci_read_config_dword(f3, K8_NBEAH, &regs->nbeah) ||
+            pci_read_config_dword(f3, K8_NBCFG, &regs->nbcfg))
+                goto read_failed;
+
+        return 1;
+
+read_failed:
+        debugf0("Reading error info register failed\n");
+        return 0;
+}
+/*
+ * Retrieve the error data from hardware, store it in @info and clear the
+ * valid bit in NB Status High.
+ *
+ * The registers are read twice; @info always ends up holding the second
+ * (most recent) read. If the two reads disagree, a race with a newer error
+ * is assumed: a warning is logged and 0 is returned.
+ *
+ * Returns:
+ *      - 1: if a valid error is found
+ *      - 0: if no error is found (or a read race was detected)
+ */
+static int amd64_get_error_info(struct mem_ctl_info *mci,
+                                struct amd64_error_info_regs *info)
+{
+        struct amd64_pvt *pvt = mci->pvt_info;
+        struct amd64_error_info_regs snapshot;
+        int stable;
+
+        if (!amd64_get_error_info_regs(mci, info))
+                return 0;
+
+        /*
+         * Here's the problem with the K8's EDAC reporting: There are four
+         * registers which report pieces of error information. They are shared
+         * between CEs and UEs. Furthermore, contrary to what is stated in the
+         * BKDG, the overflow bit is never used! Every error always updates the
+         * reporting registers.
+         *
+         * Can you see the race condition? All four error reporting registers
+         * must be read before a new error updates them! There is no way to read
+         * all four registers atomically. The best that can be done is to detect
+         * that a race has occurred; as written below, such an event is only
+         * logged and the error itself is not decoded any further.
+         *
+         * What is still positive is that errors are still reported and thus
+         * problems can still be detected - just not localized because the
+         * syndrome and address are spread out across registers.
+         *
+         * Here's hoping that AMD fixes this in some future K8 rev. UEs and CEs
+         * should have separate register sets with proper overflow bits that
+         * are used! At very least the problem can be fixed by honoring the
+         * ErrValid bit in 'nbsh' and not updating registers - just set the
+         * overflow bit - unless the current error is CE and the new error is
+         * UE which would be the only situation for overwriting the current
+         * values.
+         */
+        snapshot = *info;
+
+        /* Use info from the second read - most current */
+        if (unlikely(!amd64_get_error_info_regs(mci, info)))
+                return 0;
+
+        /* clear the error bits in hardware */
+        pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
+
+        /* Both reads must agree on status and address, else we raced */
+        stable = snapshot.nbsh == info->nbsh &&
+                 snapshot.nbsl == info->nbsl &&
+                 snapshot.nbeah == info->nbeah &&
+                 snapshot.nbeal == info->nbeal;
+        if (stable)
+                return 1;
+
+        amd64_mc_printk(mci, KERN_WARNING,
+                        "hardware STATUS read access race condition "
+                        "detected!\n");
+        return 0;
+}
+/*
+ * Log a GART TLB event: extract the transaction-type and cache-level fields
+ * from the error code held in NB Status Low and print their text forms.
+ */
+static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
+                                         struct amd64_error_info_regs *info)
+{
+        u32 code = EXTRACT_ERROR_CODE(info->nbsl);
+        u32 level = EXTRACT_LL_CODE(code);      /* cache level (2b) */
+        u32 trans = EXTRACT_TT_CODE(code);      /* transaction type (2b) */
+
+        amd64_mc_printk(mci, KERN_ERR,
+                     "GART TLB event: transaction type(%s), "
+                     "cache level(%s)\n", tt_msgs[trans], ll_msgs[level]);
+}
+/*
+ * Log a cache hierarchy error: extract the memory-transaction, transaction-type
+ * and cache-level fields from the error code held in NB Status Low and print
+ * their text forms.
+ */
+static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
+                                      struct amd64_error_info_regs *info)
+{
+        u32 code = EXTRACT_ERROR_CODE(info->nbsl);
+        u32 level = EXTRACT_LL_CODE(code);      /* cache level (2b) */
+        u32 trans = EXTRACT_TT_CODE(code);      /* transaction type (2b) */
+        u32 mem_op = EXTRACT_RRRR_CODE(code);   /* memory transaction (4b) */
+
+        amd64_mc_printk(mci, KERN_ERR,
+                     "cache hierarchy error: memory transaction type(%s), "
+                     "transaction type(%s), cache level(%s)\n",
+                     rrrr_msgs[mem_op], tt_msgs[trans], ll_msgs[level]);
+}
+/*
+ * Handle a Correctable Error (CE). When NB Status High flags the error
+ * address as valid, the address is extracted, logged and handed to the
+ * family-specific map_sysaddr_to_csrow() op; otherwise the CE is reported
+ * to the EDAC core without any location information.
+ */
+static void amd64_handle_ce(struct mem_ctl_info *mci,
+                            struct amd64_error_info_regs *info)
+{
+        struct amd64_pvt *pvt = mci->pvt_info;
+        u64 sys_addr;
+
+        if (info->nbsh & K8_NBSH_VALID_ERROR_ADDR) {
+                sys_addr = extract_error_address(mci, info);
+
+                amd64_mc_printk(mci, KERN_ERR,
+                        "CE ERROR_ADDRESS= 0x%llx\n", sys_addr);
+
+                pvt->ops->map_sysaddr_to_csrow(mci, info, sys_addr);
+                return;
+        }
+
+        /* The hardware did not latch a usable error address */
+        amd64_mc_printk(mci, KERN_ERR,
+                "HW has no ERROR_ADDRESS available\n");
+        edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+}
+/*
+ * Handle an Un-correctable Error (UE).
+ *
+ * Without a valid error address the UE is reported against @mci with no
+ * location info. Otherwise the address is resolved to the memory controller
+ * that owns it (which may differ from the node that detected the error) and
+ * then to a csrow on that controller. Each stage that fails falls back to a
+ * no-info UE report; on success the page/offset/csrow are reported.
+ *
+ * The 64-bit error address is printed with %llx so it is never truncated on
+ * builds where 'unsigned long' is narrower than u64; this also matches the
+ * CE path.
+ */
+static void amd64_handle_ue(struct mem_ctl_info *mci,
+                            struct amd64_error_info_regs *info)
+{
+        struct mem_ctl_info *src_mci;
+        u64 sys_addr;
+        u32 page, offset;
+        int csrow;
+
+        if (!(info->nbsh & K8_NBSH_VALID_ERROR_ADDR)) {
+                amd64_mc_printk(mci, KERN_CRIT,
+                        "HW has no ERROR_ADDRESS available\n");
+                edac_mc_handle_ue_no_info(mci, EDAC_MOD_STR);
+                return;
+        }
+
+        sys_addr = extract_error_address(mci, info);
+
+        /*
+         * Find out which node the error address belongs to. This may be
+         * different from the node that detected the error.
+         */
+        src_mci = find_mc_by_sys_addr(mci, sys_addr);
+        if (!src_mci) {
+                amd64_mc_printk(mci, KERN_CRIT,
+                        "ERROR ADDRESS (0x%llx) value NOT mapped to a MC\n",
+                        (unsigned long long)sys_addr);
+                /* No owning controller found: account it to the detector */
+                edac_mc_handle_ue_no_info(mci, EDAC_MOD_STR);
+                return;
+        }
+
+        csrow = sys_addr_to_csrow(src_mci, sys_addr);
+        if (csrow < 0) {
+                /* Message is logged via the detecting node, as before */
+                amd64_mc_printk(mci, KERN_CRIT,
+                        "ERROR_ADDRESS (0x%llx) value NOT mapped to 'csrow'\n",
+                        (unsigned long long)sys_addr);
+                edac_mc_handle_ue_no_info(src_mci, EDAC_MOD_STR);
+                return;
+        }
+
+        error_address_to_page_and_offset(sys_addr, &page, &offset);
+        edac_mc_handle_ue(src_mci, page, offset, csrow, EDAC_MOD_STR);
+}
+/*
+ * Decode and log a bus error, then dispatch ECC events to the CE/UE handlers.
+ *
+ * The error code fields from NB Status Low are always logged. Further
+ * processing stops early when the error was merely 'observed' by this node
+ * or when the extended error code is not one of the ECC codes. Otherwise the
+ * CECC/UECC bits in NB Status High select the CE or UE handler, and a set
+ * overflow bit is additionally reported as a CE without location info.
+ */
+static void amd64_decode_bus_error(struct mem_ctl_info *mci,
+                                   struct amd64_error_info_regs *info)
+{
+        u32 err_code, ext_ec;
+        u32 ec_pp;              /* error code participating processor (2b) */
+        u32 ec_to;              /* error code timed out (1b) */
+        u32 ec_rrrr;            /* error code memory transaction (4b) */
+        u32 ec_ii;              /* error code memory or I/O (2b) */
+        u32 ec_ll;              /* error code cache level (2b) */
+
+        ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
+        err_code = EXTRACT_ERROR_CODE(info->nbsl);
+
+        ec_ll = EXTRACT_LL_CODE(err_code);
+        ec_ii = EXTRACT_II_CODE(err_code);
+        ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+        ec_to = EXTRACT_TO_CODE(err_code);
+        ec_pp = EXTRACT_PP_CODE(err_code);
+
+        amd64_mc_printk(mci, KERN_ERR,
+                "BUS ERROR:\n"
+                "  time-out(%s) mem or i/o(%s)\n"
+                "  participating processor(%s)\n"
+                "  memory transaction type(%s)\n"
+                "  cache level(%s) Error Found by: %s\n",
+                to_msgs[ec_to],
+                ii_msgs[ec_ii],
+                pp_msgs[ec_pp],
+                rrrr_msgs[ec_rrrr],
+                ll_msgs[ec_ll],
+                (info->nbsh & K8_NBSH_ERR_SCRUBER) ?
+                        "Scrubber" : "Normal Operation");
+
+        /* If this was an 'observed' error, early out */
+        if (ec_pp == K8_NBSL_PP_OBS)
+                return;         /* We aren't the node involved */
+
+        /* Parse out the extended error code for ECC events */
+        switch (ext_ec) {
+        /* F10 changed to one Extended ECC error code */
+        case F10_NBSL_EXT_ERR_RES:              /* Reserved field */
+        case F10_NBSL_EXT_ERR_ECC:              /* F10 ECC ext err code */
+                break;
+
+        default:
+                amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
+                                               "handling for this error\n");
+                return;
+        }
+
+        if (info->nbsh & K8_NBSH_CECC)
+                amd64_handle_ce(mci, info);
+        else if (info->nbsh & K8_NBSH_UECC)
+                amd64_handle_ue(mci, info);
+
+        /*
+         * If main error is CE then overflow must be CE.  If main error is UE
+         * then overflow is unknown.  We'll call the overflow a CE - if
+         * panic_on_ue is set then we're already panic'ed and won't arrive
+         * here. Else, then apparently someone doesn't think that UE's are
+         * catastrophic.
+         *
+         * EDAC_MOD_STR is joined to the text by string-literal concatenation,
+         * so the separating space has to be part of the second literal
+         * (assumes the macro itself carries no trailing separator - it is
+         * defined outside this hunk).
+         */
+        if (info->nbsh & K8_NBSH_OVERFLOW)
+                edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
+                                          " Error Overflow set");
+}
+/* Text form of a masked status-bit test, as used in the debug dump below. */
+static inline const char *amd64_nb_flag_str(u32 masked_bits)
+{
+        return masked_bits ? "True" : "False";
+}
+
+/*
+ * Decode and report the northbridge error described by @info.
+ *
+ * When @handle_errors is zero nothing is done. Otherwise the raw status is
+ * dumped at debug level, the error code is classified (GART TLB, memory/cache,
+ * bus, or unknown) and passed to the matching decoder, the extended error
+ * code and any HyperTransport link information are logged, and finally the
+ * uncorrected-error and PCC status bits are reported.
+ *
+ * GART TLB errors are dropped silently unless report_gart_errors is set.
+ *
+ * Returns: 1 on every path.
+ */
+int amd64_process_error_info(struct mem_ctl_info *mci,
+                             struct amd64_error_info_regs *info,
+                             int handle_errors)
+{
+        struct amd64_pvt *pvt = mci->pvt_info;
+        u32 err_code, ext_ec;
+        int is_gart_tlb = 0;
+        int ht_ext_err;
+
+        /* If caller doesn't want us to process the error, return */
+        if (!handle_errors)
+                return 1;
+
+        debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
+        debugf1("  MC node(%d) Error-Address(0x%.8x-%.8x)\n",
+                pvt->mc_node_id, info->nbeah, info->nbeal);
+        debugf1("  nbsh(0x%.8x) nbsl(0x%.8x)\n",
+                info->nbsh, info->nbsl);
+        debugf1("  Valid Error=%s Overflow=%s\n",
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_VALID_BIT),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_OVERFLOW));
+        debugf1("  Err Uncorrected=%s MCA Error Reporting=%s\n",
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_UNCORRECTED_ERR),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_ERR_ENABLE));
+        debugf1("  MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_MISC_ERR_VALID),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_VALID_ERROR_ADDR),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_PCC));
+        debugf1("  CECC=%s UECC=%s Found by Scruber=%s\n",
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_CECC),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_UECC),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_ERR_SCRUBER));
+        debugf1("  CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_CORE0),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_CORE1),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_CORE2),
+                amd64_nb_flag_str(info->nbsh & K8_NBSH_CORE3));
+
+        err_code = EXTRACT_ERROR_CODE(info->nbsl);
+
+        /*
+         * Determine which error type:
+         *      1) GART errors - non-fatal, developmental events
+         *      2) MEMORY errors
+         *      3) BUS errors
+         *      4) Unknown error
+         */
+        if (TEST_TLB_ERROR(err_code)) {
+                /*
+                 * GART errors are intended to help graphics driver developers
+                 * to detect bad GART PTEs. It is recommended by AMD to disable
+                 * GART table walk error reporting by default[1] (currently
+                 * being disabled in mce_cpu_quirks()) and according to the
+                 * comment in mce_cpu_quirks(), such GART errors can be
+                 * incorrectly triggered. We may see these errors anyway and
+                 * unless requested by the user, they won't be reported.
+                 *
+                 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
+                 *     AMD NPT family 0Fh processors
+                 */
+                if (!report_gart_errors)
+                        return 1;
+
+                /*
+                 * Only if GART error reporting is requested should we generate
+                 * any logs.
+                 */
+                is_gart_tlb = 1;
+
+                debugf1("GART TLB error\n");
+                amd64_decode_gart_tlb_error(mci, info);
+        } else if (TEST_MEM_ERROR(err_code)) {
+                debugf1("Memory/Cache error\n");
+                amd64_decode_mem_cache_error(mci, info);
+        } else if (TEST_BUS_ERROR(err_code)) {
+                debugf1("Bus (Link/DRAM) error\n");
+                amd64_decode_bus_error(mci, info);
+        } else {
+                /* shouldn't reach here! */
+                amd64_mc_printk(mci, KERN_WARNING,
+                             "%s(): unknown MCE error 0x%x\n", __func__,
+                             err_code);
+        }
+
+        ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
+        amd64_mc_printk(mci, KERN_ERR,
+                "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
+
+        /* Extended codes in the CRC..TGT range, or RMW, carry link info */
+        ht_ext_err = (ext_ec >= F10_NBSL_EXT_ERR_CRC &&
+                      ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
+                     ext_ec == F10_NBSL_EXT_ERR_RMW;
+
+        if (ht_ext_err && EXTRACT_LDT_LINK(info->nbsh)) {
+                amd64_mc_printk(mci, KERN_ERR,
+                        "Error on hypertransport link: %s\n",
+                        htlink_msgs[
+                        EXTRACT_LDT_LINK(info->nbsh)]);
+        }
+
+        /*
+         * Check the UE bit of the NB status high register, if set generate some
+         * logs. If NOT a GART error, then process the event as a NO-INFO event.
+         * If it was a GART error, skip that process.
+         */
+        if (info->nbsh & K8_NBSH_UNCORRECTED_ERR) {
+                amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
+                if (!is_gart_tlb)
+                        edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
+        }
+
+        if (info->nbsh & K8_NBSH_PCC)
+                amd64_mc_printk(mci, KERN_CRIT,
+                        "PCC (processor context corrupt) set\n");
+
+        return 1;
+}
+EXPORT_SYMBOL_GPL(amd64_process_error_info);
author	Doug Thompson <dougthompson@xmission.com>	2009-05-06 11:55:27 -0400
committer	Borislav Petkov <borislav.petkov@amd.com>	2009-06-10 06:18:59 -0400
commit	d27bf6fa369ca0272df10558d2f290d6fc72e675 (patch)
tree	b43a34237e44dd567a34b3a3d2fd233905baf566 /drivers/edac/amd64_edac.c
parent	b1289d6f9d23abab396077abb65d5a23a775cdb0 (diff)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index feb4986ea76d..09991c8a6ee3 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c
@@ -2031,3 +2031,428 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
2031	debugf0("syndrome(%x) not found\n", syndrome);	2031	debugf0("syndrome(%x) not found\n", syndrome);
2032	return -1;	2032	return -1;
2033	}	2033	}
		2034
		2035	/*
		2036	* Check for valid error in the NB Status High register. If so, proceed to read
		2037	* NB Status Low, NB Address Low and NB Address High registers and store data
		2038	* into error structure.
		2039	*
		2040	* Returns:
		2041	* - 1: if hardware regs contains valid error info
		2042	* - 0: if no valid error is indicated
		2043	*/
		2044	static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
		2045	struct amd64_error_info_regs *regs)
		2046	{
		2047	struct amd64_pvt *pvt;
		2048	struct pci_dev *misc_f3_ctl;
		2049	int err = 0;
		2050
		2051	pvt = mci->pvt_info;
		2052	misc_f3_ctl = pvt->misc_f3_ctl;
		2053
		2054	err = pci_read_config_dword(misc_f3_ctl, K8_NBSH, &regs->nbsh);
		2055	if (err)
		2056	goto err_reg;
		2057
		2058	if (!(regs->nbsh & K8_NBSH_VALID_BIT))
		2059	return 0;
		2060
		2061	/* valid error, read remaining error information registers */
		2062	err = pci_read_config_dword(misc_f3_ctl, K8_NBSL, &regs->nbsl);
		2063	if (err)
		2064	goto err_reg;
		2065
		2066	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAL, &regs->nbeal);
		2067	if (err)
		2068	goto err_reg;
		2069
		2070	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAH, &regs->nbeah);
		2071	if (err)
		2072	goto err_reg;
		2073
		2074	err = pci_read_config_dword(misc_f3_ctl, K8_NBCFG, &regs->nbcfg);
		2075	if (err)
		2076	goto err_reg;
		2077
		2078	return 1;
		2079
		2080	err_reg:
		2081	debugf0("Reading error info register failed\n");
		2082	return 0;
		2083	}
		2084
		2085	/*
		2086	* This function is called to retrieve the error data from hardware and store it
		2087	* in the info structure.
		2088	*
		2089	* Returns:
		2090	* - 1: if a valid error is found
		2091	* - 0: if no error is found
		2092	*/
		2093	static int amd64_get_error_info(struct mem_ctl_info *mci,
		2094	struct amd64_error_info_regs *info)
		2095	{
		2096	struct amd64_pvt *pvt;
		2097	struct amd64_error_info_regs regs;
		2098
		2099	pvt = mci->pvt_info;
		2100
		2101	if (!amd64_get_error_info_regs(mci, info))
		2102	return 0;
		2103
		2104	/*
		2105	* Here's the problem with the K8's EDAC reporting: There are four
		2106	* registers which report pieces of error information. They are shared
		2107	* between CEs and UEs. Furthermore, contrary to what is stated in the
		2108	* BKDG, the overflow bit is never used! Every error always updates the
		2109	* reporting registers.
		2110	*
		2111	* Can you see the race condition? All four error reporting registers
		2112	* must be read before a new error updates them! There is no way to read
		2113	* all four registers atomically. The best that can be done is to detect
		2114	* that a race has occurred and then report the error without any kind of
		2115	* precision.
		2116	*
		2117	* What is still positive is that errors are still reported and thus
		2118	* problems can still be detected - just not localized because the
		2119	* syndrome and address are spread out across registers.
		2120	*
		2121	* Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
		2122	* UEs and CEs should have separate register sets with proper overflow
		2123	* bits that are used! At very least the problem can be fixed by
		2124	* honoring the ErrValid bit in 'nbsh' and not updating registers - just
		2125	* set the overflow bit - unless the current error is CE and the new
		2126	* error is UE which would be the only situation for overwriting the
		2127	* current values.
		2128	*/
		2129
		2130	regs = *info;
		2131
		2132	/* Use info from the second read - most current */
		2133	if (unlikely(!amd64_get_error_info_regs(mci, info)))
		2134	return 0;
		2135
		2136	/* clear the error bits in hardware */
		2137	pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
		2138
		2139	/* Check for the possible race condition */
		2140	if ((regs.nbsh != info->nbsh) ||
		2141	(regs.nbsl != info->nbsl) ||
		2142	(regs.nbeah != info->nbeah) ||
		2143	(regs.nbeal != info->nbeal)) {
		2144	amd64_mc_printk(mci, KERN_WARNING,
		2145	"hardware STATUS read access race condition "
		2146	"detected!\n");
		2147	return 0;
		2148	}
		2149	return 1;
		2150	}
		2151
		2152	static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
		2153	struct amd64_error_info_regs *info)
		2154	{
		2155	u32 err_code;
		2156	u32 ec_tt; /* error code transaction type (2b) */
		2157	u32 ec_ll; /* error code cache level (2b) */
		2158
		2159	err_code = EXTRACT_ERROR_CODE(info->nbsl);
		2160	ec_ll = EXTRACT_LL_CODE(err_code);
		2161	ec_tt = EXTRACT_TT_CODE(err_code);
		2162
		2163	amd64_mc_printk(mci, KERN_ERR,
		2164	"GART TLB event: transaction type(%s), "
		2165	"cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
		2166	}
		2167
		2168	static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
		2169	struct amd64_error_info_regs *info)
		2170	{
		2171	u32 err_code;
		2172	u32 ec_rrrr; /* error code memory transaction (4b) */
		2173	u32 ec_tt; /* error code transaction type (2b) */
		2174	u32 ec_ll; /* error code cache level (2b) */
		2175
		2176	err_code = EXTRACT_ERROR_CODE(info->nbsl);
		2177	ec_ll = EXTRACT_LL_CODE(err_code);
		2178	ec_tt = EXTRACT_TT_CODE(err_code);
		2179	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
		2180
		2181	amd64_mc_printk(mci, KERN_ERR,
		2182	"cache hierarchy error: memory transaction type(%s), "
		2183	"transaction type(%s), cache level(%s)\n",
		2184	rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
		2185	}
		2186
		2187
		2188	/*
		2189	* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
		2190	* ADDRESS and process.
		2191	*/
		2192	static void amd64_handle_ce(struct mem_ctl_info *mci,
		2193	struct amd64_error_info_regs *info)
		2194	{
		2195	struct amd64_pvt *pvt = mci->pvt_info;
		2196	u64 SystemAddress;
		2197
		2198	/* Ensure that the Error Address is VALID */
		2199	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
		2200	amd64_mc_printk(mci, KERN_ERR,
		2201	"HW has no ERROR_ADDRESS available\n");
		2202	edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
		2203	return;
		2204	}
		2205
		2206	SystemAddress = extract_error_address(mci, info);
		2207
		2208	amd64_mc_printk(mci, KERN_ERR,
		2209	"CE ERROR_ADDRESS= 0x%llx\n", SystemAddress);
		2210
		2211	pvt->ops->map_sysaddr_to_csrow(mci, info, SystemAddress);
		2212	}
		2213
		2214	/* Handle any Un-correctable Errors (UEs) */
		2215	static void amd64_handle_ue(struct mem_ctl_info *mci,
		2216	struct amd64_error_info_regs *info)
		2217	{
		2218	int csrow;
		2219	u64 SystemAddress;
		2220	u32 page, offset;
		2221	struct mem_ctl_info *log_mci, *src_mci = NULL;
		2222
		2223	log_mci = mci;
		2224
		2225	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
		2226	amd64_mc_printk(mci, KERN_CRIT,
		2227	"HW has no ERROR_ADDRESS available\n");
		2228	edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
		2229	return;
		2230	}
		2231
		2232	SystemAddress = extract_error_address(mci, info);
		2233
		2234	/*
		2235	* Find out which node the error address belongs to. This may be
		2236	* different from the node that detected the error.
		2237	*/
		2238	src_mci = find_mc_by_sys_addr(mci, SystemAddress);
		2239	if (!src_mci) {
		2240	amd64_mc_printk(mci, KERN_CRIT,
		2241	"ERROR ADDRESS (0x%lx) value NOT mapped to a MC\n",
		2242	(unsigned long)SystemAddress);
		2243	edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
		2244	return;
		2245	}
		2246
		2247	log_mci = src_mci;
		2248
		2249	csrow = sys_addr_to_csrow(log_mci, SystemAddress);
		2250	if (csrow < 0) {
		2251	amd64_mc_printk(mci, KERN_CRIT,
		2252	"ERROR_ADDRESS (0x%lx) value NOT mapped to 'csrow'\n",
		2253	(unsigned long)SystemAddress);
		2254	edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
		2255	} else {
		2256	error_address_to_page_and_offset(SystemAddress, &page, &offset);
		2257	edac_mc_handle_ue(log_mci, page, offset, csrow, EDAC_MOD_STR);
		2258	}
		2259	}
		2260
		2261	static void amd64_decode_bus_error(struct mem_ctl_info *mci,
		2262	struct amd64_error_info_regs *info)
		2263	{
		2264	u32 err_code, ext_ec;
		2265	u32 ec_pp; /* error code participating processor (2p) */
		2266	u32 ec_to; /* error code timed out (1b) */
		2267	u32 ec_rrrr; /* error code memory transaction (4b) */
		2268	u32 ec_ii; /* error code memory or I/O (2b) */
		2269	u32 ec_ll; /* error code cache level (2b) */
		2270
		2271	ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
		2272	err_code = EXTRACT_ERROR_CODE(info->nbsl);
		2273
		2274	ec_ll = EXTRACT_LL_CODE(err_code);
		2275	ec_ii = EXTRACT_II_CODE(err_code);
		2276	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
		2277	ec_to = EXTRACT_TO_CODE(err_code);
		2278	ec_pp = EXTRACT_PP_CODE(err_code);
		2279
		2280	amd64_mc_printk(mci, KERN_ERR,
		2281	"BUS ERROR:\n"
		2282	" time-out(%s) mem or i/o(%s)\n"
		2283	" participating processor(%s)\n"
		2284	" memory transaction type(%s)\n"
		2285	" cache level(%s) Error Found by: %s\n",
		2286	to_msgs[ec_to],
		2287	ii_msgs[ec_ii],
		2288	pp_msgs[ec_pp],
		2289	rrrr_msgs[ec_rrrr],
		2290	ll_msgs[ec_ll],
		2291	(info->nbsh & K8_NBSH_ERR_SCRUBER) ?
		2292	"Scrubber" : "Normal Operation");
		2293
		2294	/* If this was an 'observed' error, early out */
		2295	if (ec_pp == K8_NBSL_PP_OBS)
		2296	return; /* We aren't the node involved */
		2297
		2298	/* Parse out the extended error code for ECC events */
		2299	switch (ext_ec) {
		2300	/* F10 changed to one Extended ECC error code */
		2301	case F10_NBSL_EXT_ERR_RES: /* Reserved field */
		2302	case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */
		2303	break;
		2304
		2305	default:
		2306	amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
		2307	"handling for this error\n");
		2308	return;
		2309	}
		2310
		2311	if (info->nbsh & K8_NBSH_CECC)
		2312	amd64_handle_ce(mci, info);
		2313	else if (info->nbsh & K8_NBSH_UECC)
		2314	amd64_handle_ue(mci, info);
		2315
		2316	/*
		2317	* If main error is CE then overflow must be CE. If main error is UE
		2318	* then overflow is unknown. We'll call the overflow a CE - if
		2319	* panic_on_ue is set then we're already panic'ed and won't arrive
		2320	* here. Else, then apparently someone doesn't think that UE's are
		2321	* catastrophic.
		2322	*/
		2323	if (info->nbsh & K8_NBSH_OVERFLOW)
		2324	edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
		2325	"Error Overflow set");
		2326	}
		2327
		2328	int amd64_process_error_info(struct mem_ctl_info *mci,
		2329	struct amd64_error_info_regs *info,
		2330	int handle_errors)
		2331	{
		2332	struct amd64_pvt *pvt;
		2333	struct amd64_error_info_regs *regs;
		2334	u32 err_code, ext_ec;
		2335	int gart_tlb_error = 0;
		2336
		2337	pvt = mci->pvt_info;
		2338
		2339	/* If caller doesn't want us to process the error, return */
		2340	if (!handle_errors)
		2341	return 1;
		2342
		2343	regs = info;
		2344
		2345	debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
		2346	debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
		2347	pvt->mc_node_id, regs->nbeah, regs->nbeal);
		2348	debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n",
		2349	regs->nbsh, regs->nbsl);
		2350	debugf1(" Valid Error=%s Overflow=%s\n",
		2351	(regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
		2352	(regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
		2353	debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n",
		2354	(regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
		2355	"True" : "False",
		2356	(regs->nbsh & K8_NBSH_ERR_ENABLE) ?
		2357	"True" : "False");
		2358	debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
		2359	(regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
		2360	"True" : "False",
		2361	(regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
		2362	"True" : "False",
		2363	(regs->nbsh & K8_NBSH_PCC) ?
		2364	"True" : "False");
		2365	debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n",
		2366	(regs->nbsh & K8_NBSH_CECC) ?
		2367	"True" : "False",
		2368	(regs->nbsh & K8_NBSH_UECC) ?
		2369	"True" : "False",
		2370	(regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
		2371	"True" : "False");
		2372	debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
		2373	(regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
		2374	(regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
		2375	(regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
		2376	(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
		2377
		2378
		2379	err_code = EXTRACT_ERROR_CODE(regs->nbsl);
		2380
		2381	/* Determine which error type:
		2382	* 1) GART errors - non-fatal, developmental events
		2383	* 2) MEMORY errors
		2384	* 3) BUS errors
		2385	* 4) Unknown error
		2386	*/
		2387	if (TEST_TLB_ERROR(err_code)) {
		2388	/*
		2389	* GART errors are intended to help graphics driver developers
		2390	* to detect bad GART PTEs. It is recommended by AMD to disable
		2391	* GART table walk error reporting by default[1] (currently
		2392	* being disabled in mce_cpu_quirks()) and according to the
		2393	* comment in mce_cpu_quirks(), such GART errors can be
		2394	* incorrectly triggered. We may see these errors anyway and
		2395	* unless requested by the user, they won't be reported.
		2396	*
		2397	* [1] section 13.10.1 on BIOS and Kernel Developers Guide for
		2398	* AMD NPT family 0Fh processors
		2399	*/
		2400	if (report_gart_errors == 0)
		2401	return 1;
		2402
		2403	/*
		2404	* Only if GART error reporting is requested should we generate
		2405	* any logs.
		2406	*/
		2407	gart_tlb_error = 1;
		2408
		2409	debugf1("GART TLB error\n");
		2410	amd64_decode_gart_tlb_error(mci, info);
		2411	} else if (TEST_MEM_ERROR(err_code)) {
		2412	debugf1("Memory/Cache error\n");
		2413	amd64_decode_mem_cache_error(mci, info);
		2414	} else if (TEST_BUS_ERROR(err_code)) {
		2415	debugf1("Bus (Link/DRAM) error\n");
		2416	amd64_decode_bus_error(mci, info);
		2417	} else {
		2418	/* shouldn't reach here! */
		2419	amd64_mc_printk(mci, KERN_WARNING,
		2420	"%s(): unknown MCE error 0x%x\n", __func__,
		2421	err_code);
		2422	}
		2423
		2424	ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
		2425	amd64_mc_printk(mci, KERN_ERR,
		2426	"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
		2427
		2428	if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
		2429	ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
		2430	(ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
		2431	EXTRACT_LDT_LINK(info->nbsh)) {
		2432
		2433	amd64_mc_printk(mci, KERN_ERR,
		2434	"Error on hypertransport link: %s\n",
		2435	htlink_msgs[
		2436	EXTRACT_LDT_LINK(info->nbsh)]);
		2437	}
		2438
		2439	/*
		2440	* Check the UE bit of the NB status high register, if set generate some
		2441	* logs. If NOT a GART error, then process the event as a NO-INFO event.
		2442	* If it was a GART error, skip that process.
		2443	*/
		2444	if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
		2445	amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
		2446	if (!gart_tlb_error)
		2447	edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
		2448	}
		2449
		2450	if (regs->nbsh & K8_NBSH_PCC)
		2451	amd64_mc_printk(mci, KERN_CRIT,
		2452	"PCC (processor context corrupt) set\n");
		2453
		2454	return 1;
		2455	}
		2456	EXPORT_SYMBOL_GPL(amd64_process_error_info);
		2457
		2458