about summary refs log tree commit diff stats
path: root/drivers/edac/amd64_edac.c
diff options
context:
space:
mode:
author Borislav Petkov <borislav.petkov@amd.com> 2009-07-24 07:51:42 -0400
committer Borislav Petkov <borislav.petkov@amd.com> 2009-09-14 12:59:17 -0400
commit 549d042df240dfb4203bab40ad44f9336751b7d6 (patch)
tree af357ed8eaf06c26f19d458686b6c7ea4e425a05 /drivers/edac/amd64_edac.c
parent ecaf5606de65cdd04de5f526185fe28fb0df654e (diff)
x86, mce: pass mce info to EDAC for decoding
Move NB decoder along with required defines to EDAC MCE core. Add registration routines for further decoding of the MCE info in the AMD64 EDAC module.

CC: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'drivers/edac/amd64_edac.c')
-rw-r--r-- drivers/edac/amd64_edac.c 98
1 files changed, 24 insertions, 74 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 82f48ee90f11..2080b1e2e8a2 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2282,8 +2282,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
2282 } 2282 }
2283} 2283}
2284 2284
2285static void amd64_decode_bus_error(struct mem_ctl_info *mci, 2285static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
2286 struct err_regs *info, int ecc_type) 2286 struct err_regs *info, int ecc_type)
2287{ 2287{
2288 u32 ec = ERROR_CODE(info->nbsl); 2288 u32 ec = ERROR_CODE(info->nbsl);
2289 u32 xec = EXT_ERROR_CODE(info->nbsl); 2289 u32 xec = EXT_ERROR_CODE(info->nbsl);
@@ -2316,86 +2316,23 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
2316 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); 2316 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
2317} 2317}
2318 2318
2319void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, 2319void amd64_decode_bus_error(int node_id, struct err_regs *regs,
2320 int handle_errors) 2320 int ecc_type)
2321{ 2321{
2322 struct amd64_pvt *pvt = mci->pvt_info; 2322 struct mem_ctl_info *mci = mci_lookup[node_id];
2323 int ecc;
2324 u32 ec = ERROR_CODE(regs->nbsl);
2325 u32 xec = EXT_ERROR_CODE(regs->nbsl);
2326
2327 if (!handle_errors)
2328 return;
2329
2330 pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
2331
2332 /*
2333 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
2334 * value encoding has changed so interpret those differently
2335 */
2336 if ((boot_cpu_data.x86 == 0x10) &&
2337 (boot_cpu_data.x86_model > 8)) {
2338 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
2339 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
2340 } else {
2341 pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
2342 }
2343
2344 pr_emerg(" Error: %sorrected",
2345 ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
2346 pr_cont(", Report Error: %s",
2347 ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
2348 pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
2349 ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
2350 ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
2351
2352 /* do the two bits[14:13] together */
2353 ecc = regs->nbsh & (0x3 << 13);
2354 if (ecc)
2355 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
2356
2357 pr_cont("\n");
2358
2359 if (TLB_ERROR(ec)) {
2360 /*
2361 * GART errors are intended to help graphics driver developers
2362 * to detect bad GART PTEs. It is recommended by AMD to disable
2363 * GART table walk error reporting by default[1] (currently
2364 * being disabled in mce_cpu_quirks()) and according to the
2365 * comment in mce_cpu_quirks(), such GART errors can be
2366 * incorrectly triggered. We may see these errors anyway and
2367 * unless requested by the user, they won't be reported.
2368 *
2369 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
2370 * AMD NPT family 0Fh processors
2371 */
2372 if (!report_gart_errors)
2373 return;
2374
2375 pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
2376 TT_MSG(ec), LL_MSG(ec));
2377 } else if (MEM_ERROR(ec)) {
2378 pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
2379 " Cache Level: %s",
2380 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
2381 } else if (BUS_ERROR(ec)) {
2382 pr_emerg(" Bus (Link/DRAM) error\n");
2383 amd64_decode_bus_error(mci, regs, ecc);
2384 } else {
2385 /* shouldn't reach here! */
2386 amd64_mc_printk(mci, KERN_WARNING,
2387 "%s(): unknown MCE error 0x%x\n", __func__, ec);
2388 }
2389 2323
2390 pr_emerg("%s.\n", EXT_ERR_MSG(xec)); 2324 __amd64_decode_bus_error(mci, regs, ecc_type);
2391 2325
2392 /* 2326 /*
2393 * Check the UE bit of the NB status high register, if set generate some 2327 * Check the UE bit of the NB status high register, if set generate some
2394 * logs. If NOT a GART error, then process the event as a NO-INFO event. 2328 * logs. If NOT a GART error, then process the event as a NO-INFO event.
2395 * If it was a GART error, skip that process. 2329 * If it was a GART error, skip that process.
2330 *
2331 * FIXME: this should go somewhere else, if at all.
2396 */ 2332 */
2397 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) 2333 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2398 edac_mc_handle_ue_no_info(mci, "UE bit is set"); 2334 edac_mc_handle_ue_no_info(mci, "UE bit is set");
2335
2399} 2336}
2400 2337
2401/* 2338/*
@@ -2406,8 +2343,10 @@ static void amd64_check(struct mem_ctl_info *mci)
2406{ 2343{
2407 struct err_regs regs; 2344 struct err_regs regs;
2408 2345
2409 if (amd64_get_error_info(mci, &regs)) 2346 if (amd64_get_error_info(mci, &regs)) {
2410 amd64_decode_nb_mce(mci, &regs, 1); 2347 struct amd64_pvt *pvt = mci->pvt_info;
2348 amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
2349 }
2411} 2350}
2412 2351
2413/* 2352/*
@@ -3103,6 +3042,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
3103 3042
3104 mci_lookup[node_id] = mci; 3043 mci_lookup[node_id] = mci;
3105 pvt_lookup[node_id] = NULL; 3044 pvt_lookup[node_id] = NULL;
3045
3046 /* register stuff with EDAC MCE */
3047 if (report_gart_errors)
3048 amd_report_gart_errors(true);
3049
3050 amd_register_ecc_decoder(amd64_decode_bus_error);
3051
3106 return 0; 3052 return 0;
3107 3053
3108err_add_mc: 3054err_add_mc:
@@ -3169,6 +3115,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
3169 3115
3170 mci_lookup[pvt->mc_node_id] = NULL; 3116 mci_lookup[pvt->mc_node_id] = NULL;
3171 3117
3118 /* unregister from EDAC MCE */
3119 amd_report_gart_errors(false);
3120 amd_unregister_ecc_decoder(amd64_decode_bus_error);
3121
3172 /* Free the EDAC CORE resources */ 3122 /* Free the EDAC CORE resources */
3173 edac_mc_free(mci); 3123 edac_mc_free(mci);
3174} 3124}