aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac/i7core_edac.c
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@redhat.com>2009-07-15 18:01:08 -0400
committerMauro Carvalho Chehab <mchehab@redhat.com>2010-05-10 10:44:52 -0400
commit8a2f118e3a023a4e8cbe56a6e51f7b78fa8c76a0 (patch)
tree6efbaf685f741221cfff4f22a2fca70badc2bac1 /drivers/edac/i7core_edac.c
parentba6c5c62eeb877da638e43f1282f778432142eec (diff)
i7core_edac: decode mcelog error and send it via edac interface
Enriches mcelog error by using the encoded information at MCE status and misc registers (IA32_MCx_STATUS, IA32_MCx_MISC). Some fixes are still needed here, in order to properly fill the EDAC fields. Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Diffstat (limited to 'drivers/edac/i7core_edac.c')
-rw-r--r--drivers/edac/i7core_edac.c92
1 files changed, 70 insertions, 22 deletions
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index a93ebdf9c121..4397a3171c62 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -1319,33 +1319,75 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
1319 pvt->last_ce_count[socket][0] = new0; 1319 pvt->last_ce_count[socket][0] = new0;
1320} 1320}
1321 1321
1322/*
1323 * According with tables E-11 and E-12 of chapter E.3.3 of Intel 64 and IA-32
1324 * Architectures Software Developer’s Manual Volume 3B.
1325 * The MCA registers are the following ones:
1326 * struct mce field MCA Register
1327 * m->status MSR_IA32_MC0_STATUS
1328 * m->addr MSR_IA32_MC0_ADDR
1329 * m->misc MSR_IA32_MC0_MISC
1330 * m->mcgstatus MSR_IA32_MCG_STATUS
1331 * In the case of Nehalem, the error information is masked at .status and .misc
1332 * fields
1333 */
1322static void i7core_mce_output_error(struct mem_ctl_info *mci, 1334static void i7core_mce_output_error(struct mem_ctl_info *mci,
1323 struct mce *m) 1335 struct mce *m)
1324{ 1336{
1325 debugf0("CPU %d: Machine Check Exception: %16Lx" 1337 char *type="NON-FATAL";
1326 "Bank %d: %016Lx\n", 1338 char *err, *msg;
1327 m->cpu, m->mcgstatus, m->bank, m->status); 1339 unsigned long error = m->status & 0x1ff0000l;
1328 if (m->ip) { 1340 u32 core_err_cnt = (m->status >> 38) && 0x7fff;
1329 debugf0("RIP%s %02x:<%016Lx>\n", 1341 u32 dimm = (m->misc >> 16) & 0x3;
1330 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 1342 u32 channel = (m->misc >> 18) & 0x3;
1331 m->cs, m->ip); 1343 u32 syndrome = m->misc >> 32;
1344 u32 errnum = find_first_bit(&error, 32);
1345
1346 switch (errnum) {
1347 case 16:
1348 err = "read ECC error";
1349 break;
1350 case 17:
1351 err = "RAS ECC error";
1352 break;
1353 case 18:
1354 err = "write parity error";
1355 break;
1356 case 19:
1357 err = "redundacy loss";
1358 break;
1359 case 20:
1360 err = "reserved";
1361 break;
1362 case 21:
1363 err = "memory range error";
1364 break;
1365 case 22:
1366 err = "RTID out of range";
1367 break;
1368 case 23:
1369 err = "address parity error";
1370 break;
1371 case 24:
1372 err = "byte enable parity error";
1373 break;
1374 default:
1375 err = "unknown";
1332 } 1376 }
1333 printk(KERN_EMERG "TSC %llx ", m->tsc);
1334 if (m->addr)
1335 printk("ADDR %llx ", m->addr);
1336 if (m->misc)
1337 printk("MISC %llx ", m->misc);
1338 1377
1339#if 0 1378 msg = kasprintf(GFP_ATOMIC,
1340 snprintf(msg, sizeof(msg), 1379 "%s (addr = 0x%08llx Bank=0x%08x, Dimm=%d, Channel=%d, "
1341 "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s " 1380 "syndrome=0x%08x total error count=%d Err=%d (%s))\n",
1342 "RAS=%d CAS=%d %s Err=0x%lx (%s))", 1381 type, (long long) m->addr, m->bank, dimm, channel,
1343 type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas, 1382 syndrome, core_err_cnt,errnum, err);
1344 type, allErrors, error_name[errnum]); 1383
1384 debugf0("%s", msg);
1345 1385
1346 /* Call the helper to output message */ 1386 /* Call the helper to output message */
1347 edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg); 1387 edac_mc_handle_fbd_ue(mci, 0 /* FIXME: should be rank here */,
1348#endif 1388 0, 0 /* FIXME: should be channel here */, msg);
1389
1390 kfree(msg);
1349} 1391}
1350 1392
1351/* 1393/*
@@ -1398,6 +1440,13 @@ static int i7core_mce_check_error(void *priv, struct mce *mce)
1398 1440
1399 debugf0(__FILE__ ": %s()\n", __func__); 1441 debugf0(__FILE__ ": %s()\n", __func__);
1400 1442
1443 /*
1444 * Just let mcelog handle it if the error is
1445 * outside the memory controller
1446 */
1447 if (((mce->status & 0xffff) >> 7) != 1)
1448 return 0;
1449
1401 spin_lock_irqsave(&pvt->mce_lock, flags); 1450 spin_lock_irqsave(&pvt->mce_lock, flags);
1402 if (pvt->mce_count < MCE_LOG_LEN) { 1451 if (pvt->mce_count < MCE_LOG_LEN) {
1403 memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce)); 1452 memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce));
@@ -1406,8 +1455,7 @@ static int i7core_mce_check_error(void *priv, struct mce *mce)
1406 spin_unlock_irqrestore(&pvt->mce_lock, flags); 1455 spin_unlock_irqrestore(&pvt->mce_lock, flags);
1407 1456
1408 /* Advice mcelog that the error were handled */ 1457 /* Advice mcelog that the error were handled */
1409// return 1; 1458 return 1;
1410 return 0; // Let's duplicate the log
1411} 1459}
1412 1460
1413/* 1461/*