about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/edac
diff options
context:
space:
mode:
author: Borislav Petkov <borislav.petkov@amd.com> 2009-06-25 13:51:04 -0400
committer: Borislav Petkov <borislav.petkov@amd.com> 2009-09-14 12:58:25 -0400
commit: 5110dbdeab546268dda2e4c6a83448639b2fc5ae (patch)
tree: d504eb80a6b77a7511cd8259943d0d746758834d /drivers/edac
parent: ef44cc4c2245d3c43f3c11c7bff6239852eef498 (diff)
amd64_edac: cleanup/complete NB MCE decoding
* don't dump info which mcheck already does
* update to newest BKDG
* mv amd64_process_error_info -> amd64_decode_nb_mce
* shorten error struct names
* remove redundant info ptr in amd64_process_error_info
* remove unused ErrorCodeExt[19:16] (MCx_STATUS) defines

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r-- drivers/edac/amd64_edac.c 125
-rw-r--r-- drivers/edac/amd64_edac.h 26
-rw-r--r-- drivers/edac/amd64_edac_dbg.c 2
-rw-r--r-- drivers/edac/edac_mce_amd.h 2
4 files changed, 56 insertions, 99 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index c9b88d82970..5af87d44c80 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2355,62 +2355,47 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
2355 "Error Overflow set"); 2355 "Error Overflow set");
2356} 2356}
2357 2357
2358int amd64_process_error_info(struct mem_ctl_info *mci, 2358void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
2359 struct err_regs *regs, 2359 int handle_errors)
2360 int handle_errors)
2361{ 2360{
2362 struct amd64_pvt *pvt; 2361 struct amd64_pvt *pvt = mci->pvt_info;
2363 u32 err_code, ext_ec; 2362 int ecc;
2364 int gart_tlb_error = 0; 2363 u32 ec = ERROR_CODE(regs->nbsl);
2365 2364 u32 xec = EXT_ERROR_CODE(regs->nbsl);
2366 pvt = mci->pvt_info;
2367 2365
2368 if (!handle_errors) 2366 if (!handle_errors)
2369 return 1; 2367 return;
2370 2368
2371 debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); 2369 pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
2372 debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
2373 pvt->mc_node_id, regs->nbeah, regs->nbeal);
2374 debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n",
2375 regs->nbsh, regs->nbsl);
2376 debugf1(" Valid Error=%s Overflow=%s\n",
2377 (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
2378 (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
2379 debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n",
2380 (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
2381 "True" : "False",
2382 (regs->nbsh & K8_NBSH_ERR_ENABLE) ?
2383 "True" : "False");
2384 debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
2385 (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
2386 "True" : "False",
2387 (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
2388 "True" : "False",
2389 (regs->nbsh & K8_NBSH_PCC) ?
2390 "True" : "False");
2391 debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n",
2392 (regs->nbsh & K8_NBSH_CECC) ?
2393 "True" : "False",
2394 (regs->nbsh & K8_NBSH_UECC) ?
2395 "True" : "False",
2396 (regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
2397 "True" : "False");
2398 debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
2399 (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
2400 (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
2401 (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
2402 (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
2403 2370
2371 /*
2372 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
2373 * value encoding has changed so interpret those differently
2374 */
2375 if ((boot_cpu_data.x86 == 0x10) &&
2376 (boot_cpu_data.x86_model > 8)) {
2377 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
2378 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
2379 } else {
2380 pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
2381 }
2404 2382
2405 err_code = ERROR_CODE(regs->nbsl); 2383 pr_emerg(" Error: %sorrected",
2384 ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
2385 pr_cont(", Report Error: %s",
2386 ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
2387 pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
2388 ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
2389 ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
2406 2390
2407 /* Determine which error type: 2391 /* do the two bits[14:13] together */
2408 * 1) GART errors - non-fatal, developmental events 2392 ecc = regs->nbsh & (0x3 << 13);
2409 * 2) MEMORY errors 2393 if (ecc)
2410 * 3) BUS errors 2394 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
2411 * 4) Unknown error 2395
2412 */ 2396 pr_cont("\n");
2413 if (TLB_ERROR(err_code)) { 2397
2398 if (TLB_ERROR(ec)) {
2414 /* 2399 /*
2415 * GART errors are intended to help graphics driver developers 2400 * GART errors are intended to help graphics driver developers
2416 * to detect bad GART PTEs. It is recommended by AMD to disable 2401 * to detect bad GART PTEs. It is recommended by AMD to disable
@@ -2423,52 +2408,34 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
2423 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for 2408 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
2424 * AMD NPT family 0Fh processors 2409 * AMD NPT family 0Fh processors
2425 */ 2410 */
2426 if (report_gart_errors == 0) 2411 if (!report_gart_errors)
2427 return 1; 2412 return;
2428
2429 /*
2430 * Only if GART error reporting is requested should we generate
2431 * any logs.
2432 */
2433 gart_tlb_error = 1;
2434 2413
2435 debugf1("GART TLB error\n"); 2414 pr_emerg("GART TLB error\n");
2436 amd64_decode_gart_tlb_error(mci, regs); 2415 amd64_decode_gart_tlb_error(mci, regs);
2437 } else if (MEM_ERROR(err_code)) { 2416 } else if (MEM_ERROR(ec)) {
2438 debugf1("Memory/Cache error\n"); 2417 pr_emerg("Memory/Cache error\n");
2439 amd64_decode_mem_cache_error(mci, regs); 2418 amd64_decode_mem_cache_error(mci, regs);
2440 } else if (BUS_ERROR(err_code)) { 2419 } else if (BUS_ERROR(ec)) {
2441 debugf1("Bus (Link/DRAM) error\n"); 2420 pr_emerg("Bus (Link/DRAM) error\n");
2442 amd64_decode_bus_error(mci, regs); 2421 amd64_decode_bus_error(mci, regs);
2443 } else { 2422 } else {
2444 /* shouldn't reach here! */ 2423 /* shouldn't reach here! */
2445 amd64_mc_printk(mci, KERN_WARNING, 2424 amd64_mc_printk(mci, KERN_WARNING,
2446 "%s(): unknown MCE error 0x%x\n", __func__, 2425 "%s(): unknown MCE error 0x%x\n", __func__,
2447 err_code); 2426 ec);
2448 } 2427 }
2449 2428
2450 ext_ec = EXT_ERROR_CODE(regs->nbsl); 2429 pr_emerg("%s.\n", EXT_ERR_MSG(xec));
2451 amd64_mc_printk(mci, KERN_ERR,
2452 "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
2453 2430
2454 /* 2431 /*
2455 * Check the UE bit of the NB status high register, if set generate some 2432 * Check the UE bit of the NB status high register, if set generate some
2456 * logs. If NOT a GART error, then process the event as a NO-INFO event. 2433 * logs. If NOT a GART error, then process the event as a NO-INFO event.
2457 * If it was a GART error, skip that process. 2434 * If it was a GART error, skip that process.
2458 */ 2435 */
2459 if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) { 2436 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2460 amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n"); 2437 edac_mc_handle_ue_no_info(mci, "UE bit is set");
2461 if (!gart_tlb_error)
2462 edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
2463 }
2464
2465 if (regs->nbsh & K8_NBSH_PCC)
2466 amd64_mc_printk(mci, KERN_CRIT,
2467 "PCC (processor context corrupt) set\n");
2468
2469 return 1;
2470} 2438}
2471EXPORT_SYMBOL_GPL(amd64_process_error_info);
2472 2439
2473/* 2440/*
2474 * The main polling 'check' function, called FROM the edac core to perform the 2441 * The main polling 'check' function, called FROM the edac core to perform the
@@ -2479,7 +2446,7 @@ static void amd64_check(struct mem_ctl_info *mci)
2479 struct err_regs regs; 2446 struct err_regs regs;
2480 2447
2481 if (amd64_get_error_info(mci, &regs)) 2448 if (amd64_get_error_info(mci, &regs))
2482 amd64_process_error_info(mci, &regs, 1); 2449 amd64_decode_nb_mce(mci, &regs, 1);
2483} 2450}
2484 2451
2485/* 2452/*
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index bde8f78551f..ecab0c9fd14 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -306,16 +306,7 @@ enum {
306 306
307/* Family F10h: Normalized Extended Error Codes */ 307/* Family F10h: Normalized Extended Error Codes */
308#define F10_NBSL_EXT_ERR_RES 0x0 308#define F10_NBSL_EXT_ERR_RES 0x0
309#define F10_NBSL_EXT_ERR_CRC 0x1
310#define F10_NBSL_EXT_ERR_SYNC 0x2
311#define F10_NBSL_EXT_ERR_MST 0x3
312#define F10_NBSL_EXT_ERR_TGT 0x4
313#define F10_NBSL_EXT_ERR_GART 0x5
314#define F10_NBSL_EXT_ERR_RMW 0x6
315#define F10_NBSL_EXT_ERR_WDT 0x7
316#define F10_NBSL_EXT_ERR_ECC 0x8 309#define F10_NBSL_EXT_ERR_ECC 0x8
317#define F10_NBSL_EXT_ERR_DEV 0x9
318#define F10_NBSL_EXT_ERR_LINK_DATA 0xA
319 310
320/* Next two are overloaded values */ 311/* Next two are overloaded values */
321#define F10_NBSL_EXT_ERR_LINK_PROTO 0xB 312#define F10_NBSL_EXT_ERR_LINK_PROTO 0xB
@@ -360,18 +351,15 @@ enum {
360 351
361#define K8_NBSH_VALID_BIT BIT(31) 352#define K8_NBSH_VALID_BIT BIT(31)
362#define K8_NBSH_OVERFLOW BIT(30) 353#define K8_NBSH_OVERFLOW BIT(30)
363#define K8_NBSH_UNCORRECTED_ERR BIT(29) 354#define K8_NBSH_UC_ERR BIT(29)
364#define K8_NBSH_ERR_ENABLE BIT(28) 355#define K8_NBSH_ERR_EN BIT(28)
365#define K8_NBSH_MISC_ERR_VALID BIT(27) 356#define K8_NBSH_MISCV BIT(27)
366#define K8_NBSH_VALID_ERROR_ADDR BIT(26) 357#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
367#define K8_NBSH_PCC BIT(25) 358#define K8_NBSH_PCC BIT(25)
359#define K8_NBSH_ERR_CPU_VAL BIT(24)
368#define K8_NBSH_CECC BIT(14) 360#define K8_NBSH_CECC BIT(14)
369#define K8_NBSH_UECC BIT(13) 361#define K8_NBSH_UECC BIT(13)
370#define K8_NBSH_ERR_SCRUBER BIT(8) 362#define K8_NBSH_ERR_SCRUBER BIT(8)
371#define K8_NBSH_CORE3 BIT(3)
372#define K8_NBSH_CORE2 BIT(2)
373#define K8_NBSH_CORE1 BIT(1)
374#define K8_NBSH_CORE0 BIT(0)
375 363
376#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) 364#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
377 365
@@ -622,8 +610,8 @@ static inline struct low_ops *family_ops(int index)
622#define F10_MIN_SCRUB_RATE_BITS 0x5 610#define F10_MIN_SCRUB_RATE_BITS 0x5
623#define F11_MIN_SCRUB_RATE_BITS 0x6 611#define F11_MIN_SCRUB_RATE_BITS 0x6
624 612
625int amd64_process_error_info(struct mem_ctl_info *mci, 613void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
626 struct err_regs *info, 614 int handle_errors);
627 int handle_errors); 615
628int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, 616int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
629 u64 *hole_offset, u64 *hole_size); 617 u64 *hole_offset, u64 *hole_size);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 0a41b248a4a..bcb4e2eba3d 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
24 24
25 /* Process the Mapping request */ 25 /* Process the Mapping request */
26 /* TODO: Add race prevention */ 26 /* TODO: Add race prevention */
27 amd64_process_error_info(mci, &pvt->ctl_error_info, 1); 27 amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1);
28 28
29 return count; 29 return count;
30 } 30 }
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
index 81f9dcf9990..39971cdabb5 100644
--- a/drivers/edac/edac_mce_amd.h
+++ b/drivers/edac/edac_mce_amd.h
@@ -1,5 +1,7 @@
1#define ERROR_CODE(x) ((x) & 0xffff) 1#define ERROR_CODE(x) ((x) & 0xffff)
2#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) 2#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
3#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
4
3#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) 5#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
4#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) 6#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
5 7