diff options
-rw-r--r-- | drivers/edac/i7core_edac.c | 92 |
1 files changed, 70 insertions, 22 deletions
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index a93ebdf9c121..4397a3171c62 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c | |||
@@ -1319,33 +1319,75 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket) | |||
1319 | pvt->last_ce_count[socket][0] = new0; | 1319 | pvt->last_ce_count[socket][0] = new0; |
1320 | } | 1320 | } |
1321 | 1321 | ||
1322 | /* | ||
1323 | * According with tables E-11 and E-12 of chapter E.3.3 of Intel 64 and IA-32 | ||
1324 | * Architectures Software Developer’s Manual Volume 3B. | ||
1325 | * The MCA registers are the following ones: | ||
1326 | * struct mce field MCA Register | ||
1327 | * m->status MSR_IA32_MC0_STATUS | ||
1328 | * m->addr MSR_IA32_MC0_ADDR | ||
1329 | * m->misc MSR_IA32_MC0_MISC | ||
1330 | * m->mcgstatus MSR_IA32_MCG_STATUS | ||
1331 | * In the case of Nehalem, the error information is masked at .status and .misc | ||
1332 | * fields | ||
1333 | */ | ||
1322 | static void i7core_mce_output_error(struct mem_ctl_info *mci, | 1334 | static void i7core_mce_output_error(struct mem_ctl_info *mci, |
1323 | struct mce *m) | 1335 | struct mce *m) |
1324 | { | 1336 | { |
1325 | debugf0("CPU %d: Machine Check Exception: %16Lx" | 1337 | char *type="NON-FATAL"; |
1326 | "Bank %d: %016Lx\n", | 1338 | char *err, *msg; |
1327 | m->cpu, m->mcgstatus, m->bank, m->status); | 1339 | unsigned long error = m->status & 0x1ff0000l; |
1328 | if (m->ip) { | 1340 | u32 core_err_cnt = (m->status >> 38) && 0x7fff; |
1329 | debugf0("RIP%s %02x:<%016Lx>\n", | 1341 | u32 dimm = (m->misc >> 16) & 0x3; |
1330 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | 1342 | u32 channel = (m->misc >> 18) & 0x3; |
1331 | m->cs, m->ip); | 1343 | u32 syndrome = m->misc >> 32; |
1344 | u32 errnum = find_first_bit(&error, 32); | ||
1345 | |||
1346 | switch (errnum) { | ||
1347 | case 16: | ||
1348 | err = "read ECC error"; | ||
1349 | break; | ||
1350 | case 17: | ||
1351 | err = "RAS ECC error"; | ||
1352 | break; | ||
1353 | case 18: | ||
1354 | err = "write parity error"; | ||
1355 | break; | ||
1356 | case 19: | ||
1357 | err = "redundacy loss"; | ||
1358 | break; | ||
1359 | case 20: | ||
1360 | err = "reserved"; | ||
1361 | break; | ||
1362 | case 21: | ||
1363 | err = "memory range error"; | ||
1364 | break; | ||
1365 | case 22: | ||
1366 | err = "RTID out of range"; | ||
1367 | break; | ||
1368 | case 23: | ||
1369 | err = "address parity error"; | ||
1370 | break; | ||
1371 | case 24: | ||
1372 | err = "byte enable parity error"; | ||
1373 | break; | ||
1374 | default: | ||
1375 | err = "unknown"; | ||
1332 | } | 1376 | } |
1333 | printk(KERN_EMERG "TSC %llx ", m->tsc); | ||
1334 | if (m->addr) | ||
1335 | printk("ADDR %llx ", m->addr); | ||
1336 | if (m->misc) | ||
1337 | printk("MISC %llx ", m->misc); | ||
1338 | 1377 | ||
1339 | #if 0 | 1378 | msg = kasprintf(GFP_ATOMIC, |
1340 | snprintf(msg, sizeof(msg), | 1379 | "%s (addr = 0x%08llx Bank=0x%08x, Dimm=%d, Channel=%d, " |
1341 | "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s " | 1380 | "syndrome=0x%08x total error count=%d Err=%d (%s))\n", |
1342 | "RAS=%d CAS=%d %s Err=0x%lx (%s))", | 1381 | type, (long long) m->addr, m->bank, dimm, channel, |
1343 | type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas, | 1382 | syndrome, core_err_cnt,errnum, err); |
1344 | type, allErrors, error_name[errnum]); | 1383 | |
1384 | debugf0("%s", msg); | ||
1345 | 1385 | ||
1346 | /* Call the helper to output message */ | 1386 | /* Call the helper to output message */ |
1347 | edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg); | 1387 | edac_mc_handle_fbd_ue(mci, 0 /* FIXME: should be rank here */, |
1348 | #endif | 1388 | 0, 0 /* FIXME: should be channel here */, msg); |
1389 | |||
1390 | kfree(msg); | ||
1349 | } | 1391 | } |
1350 | 1392 | ||
1351 | /* | 1393 | /* |
@@ -1398,6 +1440,13 @@ static int i7core_mce_check_error(void *priv, struct mce *mce) | |||
1398 | 1440 | ||
1399 | debugf0(__FILE__ ": %s()\n", __func__); | 1441 | debugf0(__FILE__ ": %s()\n", __func__); |
1400 | 1442 | ||
1443 | /* | ||
1444 | * Just let mcelog handle it if the error is | ||
1445 | * outside the memory controller | ||
1446 | */ | ||
1447 | if (((mce->status & 0xffff) >> 7) != 1) | ||
1448 | return 0; | ||
1449 | |||
1401 | spin_lock_irqsave(&pvt->mce_lock, flags); | 1450 | spin_lock_irqsave(&pvt->mce_lock, flags); |
1402 | if (pvt->mce_count < MCE_LOG_LEN) { | 1451 | if (pvt->mce_count < MCE_LOG_LEN) { |
1403 | memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce)); | 1452 | memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce)); |
@@ -1406,8 +1455,7 @@ static int i7core_mce_check_error(void *priv, struct mce *mce) | |||
1406 | spin_unlock_irqrestore(&pvt->mce_lock, flags); | 1455 | spin_unlock_irqrestore(&pvt->mce_lock, flags); |
1407 | 1456 | ||
1408 | /* Advise mcelog that the error was handled */ | 1457 | /* Advise mcelog that the error was handled */ |
1409 | // return 1; | 1458 | return 1; |
1410 | return 0; // Let's duplicate the log | ||
1411 | } | 1459 | } |
1412 | 1460 | ||
1413 | /* | 1461 | /* |