aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@redhat.com>2009-09-02 22:49:59 -0400
committerMauro Carvalho Chehab <mchehab@redhat.com>2010-05-10 10:44:56 -0400
commitb4e8f0b6eaa1e99f1a64e539466a8ee2fb521d62 (patch)
tree564a3a5e2acb692c697744658d01dc5cd293516d /drivers/edac
parent61053fdedb2080dadc18dc37abbba90d2e74bc03 (diff)
i7core_edac: Use Device 3 function 2 to report errors with RDIMM's
Nehalem and newer chipsets provide a special device that holds counters for corrected memory errors detected on registered DIMMs. This device is only seen if there are registered memories plugged. After this patch, on a machine fully equipped with RDIMMs, the driver will use Device 3 function 2 to count corrected errors instead of relying on mcelog. For unregistered DIMMs, it will keep the old behavior, counting errors via mcelog. This patch was developed together with Keith Mannthey <kmannth@us.ibm.com> Signed-off-by: Keith Mannthey <kmannth@us.ibm.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/i7core_edac.c208
1 files changed, 178 insertions, 30 deletions
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 87d5695f5fb0..4758c208f39a 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -73,6 +73,18 @@
73 #define DIMM1_COR_ERR(r) (((r) >> 16) & 0x7fff) 73 #define DIMM1_COR_ERR(r) (((r) >> 16) & 0x7fff)
74 #define DIMM0_COR_ERR(r) ((r) & 0x7fff) 74 #define DIMM0_COR_ERR(r) ((r) & 0x7fff)
75 75
76/* OFFSETS for Device 3 Function 2, as indicated on Xeon 5500 datasheet */
77#define MC_COR_ECC_CNT_0 0x80
78#define MC_COR_ECC_CNT_1 0x84
79#define MC_COR_ECC_CNT_2 0x88
80#define MC_COR_ECC_CNT_3 0x8c
81#define MC_COR_ECC_CNT_4 0x90
82#define MC_COR_ECC_CNT_5 0x94
83
84#define DIMM_TOP_COR_ERR(r) (((r) >> 16) & 0x7fff)
85#define DIMM_BOT_COR_ERR(r) ((r) & 0x7fff)
86
87
76 /* OFFSETS for Devices 4,5 and 6 Function 0 */ 88 /* OFFSETS for Devices 4,5 and 6 Function 0 */
77 89
78#define MC_CHANNEL_DIMM_INIT_PARAMS 0x58 90#define MC_CHANNEL_DIMM_INIT_PARAMS 0x58
@@ -194,13 +206,20 @@ struct i7core_pvt {
194 struct i7core_inject inject; 206 struct i7core_inject inject;
195 struct i7core_channel channel[NUM_SOCKETS][NUM_CHANS]; 207 struct i7core_channel channel[NUM_SOCKETS][NUM_CHANS];
196 208
209 unsigned int is_registered:1; /* true if all memories are RDIMMs */
210
197 int sockets; /* Number of sockets */ 211 int sockets; /* Number of sockets */
198 int channels; /* Number of active channels */ 212 int channels; /* Number of active channels */
199 213
200 int ce_count_available[NUM_SOCKETS]; 214 int ce_count_available[NUM_SOCKETS];
201 /* ECC corrected errors counts per dimm */ 215 int csrow_map[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
202 unsigned long ce_count[NUM_SOCKETS][MAX_DIMMS]; 216
203 int last_ce_count[NUM_SOCKETS][MAX_DIMMS]; 217 /* ECC corrected errors counts per udimm */
218 unsigned long udimm_ce_count[NUM_SOCKETS][MAX_DIMMS];
219 int udimm_last_ce_count[NUM_SOCKETS][MAX_DIMMS];
220 /* ECC corrected errors counts per rdimm */
221 unsigned long rdimm_ce_count[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
222 int rdimm_last_ce_count[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
204 223
205 /* mcelog glue */ 224 /* mcelog glue */
206 struct edac_mce edac_mce; 225 struct edac_mce edac_mce;
@@ -471,6 +490,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
471 numrow(pvt->info.max_dod >> 6), 490 numrow(pvt->info.max_dod >> 6),
472 numcol(pvt->info.max_dod >> 9)); 491 numcol(pvt->info.max_dod >> 9));
473 492
493 pvt->is_registered = 1;
494
474 for (i = 0; i < NUM_CHANS; i++) { 495 for (i = 0; i < NUM_CHANS; i++) {
475 u32 data, dimm_dod[3], value[8]; 496 u32 data, dimm_dod[3], value[8];
476 497
@@ -492,8 +513,14 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
492 513
493 if (data & REGISTERED_DIMM) 514 if (data & REGISTERED_DIMM)
494 mtype = MEM_RDDR3; 515 mtype = MEM_RDDR3;
495 else 516 else {
496 mtype = MEM_DDR3; 517 mtype = MEM_DDR3;
518 /*
519 * FIXME: Currently, the driver will use dev 3:2
520 * counter registers only if all memories are registered
521 */
522 pvt->is_registered = 0;
523 }
497#if 0 524#if 0
498 if (data & THREE_DIMMS_PRESENT) 525 if (data & THREE_DIMMS_PRESENT)
499 pvt->channel[i].dimms = 3; 526 pvt->channel[i].dimms = 3;
@@ -562,6 +589,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
562 csr->channels[0].chan_idx = i; 589 csr->channels[0].chan_idx = i;
563 csr->channels[0].ce_count = 0; 590 csr->channels[0].ce_count = 0;
564 591
592 pvt->csrow_map[socket][i][j] = *csrow;
593
565 switch (banks) { 594 switch (banks) {
566 case 4: 595 case 4:
567 csr->dtype = DEV_X4; 596 csr->dtype = DEV_X4;
@@ -1031,19 +1060,31 @@ static ssize_t i7core_inject_enable_show(struct mem_ctl_info *mci,
1031 1060
1032static ssize_t i7core_ce_regs_show(struct mem_ctl_info *mci, char *data) 1061static ssize_t i7core_ce_regs_show(struct mem_ctl_info *mci, char *data)
1033{ 1062{
1034 unsigned i, count, total = 0; 1063 unsigned i, j, count, total = 0;
1035 struct i7core_pvt *pvt = mci->pvt_info; 1064 struct i7core_pvt *pvt = mci->pvt_info;
1036 1065
1037 for (i = 0; i < pvt->sockets; i++) { 1066 for (i = 0; i < pvt->sockets; i++) {
1038 if (!pvt->ce_count_available[i]) 1067 if (!pvt->ce_count_available[i]) {
1039 count = sprintf(data, "socket 0 data unavailable\n"); 1068 count = sprintf(data, "socket 0 data unavailable\n");
1040 else 1069 continue;
1070 }
1071 if (!pvt->is_registered)
1041 count = sprintf(data, "socket %d, dimm0: %lu\n" 1072 count = sprintf(data, "socket %d, dimm0: %lu\n"
1042 "dimm1: %lu\ndimm2: %lu\n", 1073 "dimm1: %lu\ndimm2: %lu\n",
1043 i, 1074 i,
1044 pvt->ce_count[i][0], 1075 pvt->udimm_ce_count[i][0],
1045 pvt->ce_count[i][1], 1076 pvt->udimm_ce_count[i][1],
1046 pvt->ce_count[i][2]); 1077 pvt->udimm_ce_count[i][2]);
1078 else
1079 for (j = 0; j < NUM_CHANS; j++) {
1080 count = sprintf(data, "socket %d, channel %d"
1081 "dimm0: %lu\n"
1082 "dimm1: %lu\ndimm2: %lu\n",
1083 i, j,
1084 pvt->rdimm_ce_count[i][j][0],
1085 pvt->rdimm_ce_count[i][j][1],
1086 pvt->rdimm_ce_count[i][j][2]);
1087 }
1047 data += count; 1088 data += count;
1048 total += count; 1089 total += count;
1049 } 1090 }
@@ -1308,6 +1349,103 @@ error:
1308/**************************************************************************** 1349/****************************************************************************
1309 Error check routines 1350 Error check routines
1310 ****************************************************************************/ 1351 ****************************************************************************/
1352static void i7core_rdimm_update_csrow(struct mem_ctl_info *mci, int socket,
1353 int chan, int dimm, int add)
1354{
1355 char *msg;
1356 struct i7core_pvt *pvt = mci->pvt_info;
1357 int row = pvt->csrow_map[socket][chan][dimm], i;
1358
1359 for (i = 0; i < add; i++) {
1360 msg = kasprintf(GFP_KERNEL, "Corrected error "
1361 "(Socket=%d channel=%d dimm=%d",
1362 socket, chan, dimm);
1363
1364 edac_mc_handle_fbd_ce(mci, row, 0, msg);
1365 kfree (msg);
1366 }
1367}
1368
1369static void i7core_rdimm_update_ce_count(struct mem_ctl_info *mci,
1370 int socket, int chan, int new0, int new1, int new2)
1371{
1372 struct i7core_pvt *pvt = mci->pvt_info;
1373 int add0 = 0, add1 = 0, add2 = 0;
1374 /* Updates CE counters if it is not the first time here */
1375 if (pvt->ce_count_available[socket]) {
1376 /* Updates CE counters */
1377
1378 add2 = new2 - pvt->rdimm_last_ce_count[socket][chan][2];
1379 add1 = new1 - pvt->rdimm_last_ce_count[socket][chan][1];
1380 add0 = new0 - pvt->rdimm_last_ce_count[socket][chan][0];
1381
1382 if (add2 < 0)
1383 add2 += 0x7fff;
1384 pvt->rdimm_ce_count[socket][chan][2] += add2;
1385
1386 if (add1 < 0)
1387 add1 += 0x7fff;
1388 pvt->rdimm_ce_count[socket][chan][1] += add1;
1389
1390 if (add0 < 0)
1391 add0 += 0x7fff;
1392 pvt->rdimm_ce_count[socket][chan][0] += add0;
1393 } else
1394 pvt->ce_count_available[socket] = 1;
1395
1396 /* Store the new values */
1397 pvt->rdimm_last_ce_count[socket][chan][2] = new2;
1398 pvt->rdimm_last_ce_count[socket][chan][1] = new1;
1399 pvt->rdimm_last_ce_count[socket][chan][0] = new0;
1400
1401 /*updated the edac core */
1402 if (add0 != 0)
1403 i7core_rdimm_update_csrow(mci, socket, chan, 0, add0);
1404 if (add1 != 0)
1405 i7core_rdimm_update_csrow(mci, socket, chan, 1, add1);
1406 if (add2 != 0)
1407 i7core_rdimm_update_csrow(mci, socket, chan, 2, add2);
1408
1409}
1410
1411static void i7core_rdimm_check_mc_ecc_err(struct mem_ctl_info *mci, u8 socket)
1412{
1413 struct i7core_pvt *pvt = mci->pvt_info;
1414 u32 rcv[3][2];
1415 int i, new0, new1, new2;
1416
1417 /*Read DEV 3: FUN 2: MC_COR_ECC_CNT regs directly*/
1418 pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_0,
1419 &rcv[0][0]);
1420 pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_1,
1421 &rcv[0][1]);
1422 pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_2,
1423 &rcv[1][0]);
1424 pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_3,
1425 &rcv[1][1]);
1426 pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_4,
1427 &rcv[2][0]);
1428 pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_5,
1429 &rcv[2][1]);
1430 for (i = 0 ; i < 3; i++) {
1431 debugf3("MC_COR_ECC_CNT%d = 0x%x; MC_COR_ECC_CNT%d = 0x%x\n",
1432 (i * 2), rcv[i][0], (i * 2) + 1, rcv[i][1]);
1433 /*if the channel has 3 dimms*/
1434 if (pvt->channel[socket][i].dimms > 2) {
1435 new0 = DIMM_BOT_COR_ERR(rcv[i][0]);
1436 new1 = DIMM_TOP_COR_ERR(rcv[i][0]);
1437 new2 = DIMM_BOT_COR_ERR(rcv[i][1]);
1438 } else {
1439 new0 = DIMM_TOP_COR_ERR(rcv[i][0]) +
1440 DIMM_BOT_COR_ERR(rcv[i][0]);
1441 new1 = DIMM_TOP_COR_ERR(rcv[i][1]) +
1442 DIMM_BOT_COR_ERR(rcv[i][1]);
1443 new2 = 0;
1444 }
1445
1446 i7core_rdimm_update_ce_count(mci, socket, i, new0, new1, new2);
1447 }
1448}
1311 1449
1312/* This function is based on the device 3 function 4 registers as described on: 1450/* This function is based on the device 3 function 4 registers as described on:
1313 * Intel Xeon Processor 5500 Series Datasheet Volume 2 1451 * Intel Xeon Processor 5500 Series Datasheet Volume 2
@@ -1315,7 +1453,7 @@ error:
1315 * also available at: 1453 * also available at:
1316 * http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf 1454 * http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
1317 */ 1455 */
1318static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket) 1456static void i7core_udimm_check_mc_ecc_err(struct mem_ctl_info *mci, u8 socket)
1319{ 1457{
1320 struct i7core_pvt *pvt = mci->pvt_info; 1458 struct i7core_pvt *pvt = mci->pvt_info;
1321 u32 rcv1, rcv0; 1459 u32 rcv1, rcv0;
@@ -1326,7 +1464,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
1326 return; 1464 return;
1327 } 1465 }
1328 1466
1329 /* Corrected error reads */ 1467 /* Corrected test errors */
1330 pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV1, &rcv1); 1468 pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV1, &rcv1);
1331 pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV0, &rcv0); 1469 pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV0, &rcv0);
1332 1470
@@ -1335,39 +1473,38 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
1335 new1 = DIMM1_COR_ERR(rcv0); 1473 new1 = DIMM1_COR_ERR(rcv0);
1336 new0 = DIMM0_COR_ERR(rcv0); 1474 new0 = DIMM0_COR_ERR(rcv0);
1337 1475
1338#if 0
1339 debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
1340 (pvt->ce_count_available ? "UPDATE" : "READ"),
1341 rcv1, rcv0, new0, new1, new2);
1342#endif
1343
1344 /* Updates CE counters if it is not the first time here */ 1476 /* Updates CE counters if it is not the first time here */
1345 if (pvt->ce_count_available[socket]) { 1477 if (pvt->ce_count_available[socket]) {
1346 /* Updates CE counters */ 1478 /* Updates CE counters */
1347 int add0, add1, add2; 1479 int add0, add1, add2;
1348 1480
1349 add2 = new2 - pvt->last_ce_count[socket][2]; 1481 add2 = new2 - pvt->udimm_last_ce_count[socket][2];
1350 add1 = new1 - pvt->last_ce_count[socket][1]; 1482 add1 = new1 - pvt->udimm_last_ce_count[socket][1];
1351 add0 = new0 - pvt->last_ce_count[socket][0]; 1483 add0 = new0 - pvt->udimm_last_ce_count[socket][0];
1352 1484
1353 if (add2 < 0) 1485 if (add2 < 0)
1354 add2 += 0x7fff; 1486 add2 += 0x7fff;
1355 pvt->ce_count[socket][2] += add2; 1487 pvt->udimm_ce_count[socket][2] += add2;
1356 1488
1357 if (add1 < 0) 1489 if (add1 < 0)
1358 add1 += 0x7fff; 1490 add1 += 0x7fff;
1359 pvt->ce_count[socket][1] += add1; 1491 pvt->udimm_ce_count[socket][1] += add1;
1360 1492
1361 if (add0 < 0) 1493 if (add0 < 0)
1362 add0 += 0x7fff; 1494 add0 += 0x7fff;
1363 pvt->ce_count[socket][0] += add0; 1495 pvt->udimm_ce_count[socket][0] += add0;
1496
1497 if (add0 | add1 | add2)
1498 i7core_printk(KERN_ERR, "New Corrected error(s): "
1499 "dimm0: +%d, dimm1: +%d, dimm2 +%d\n",
1500 add0, add1, add2);
1364 } else 1501 } else
1365 pvt->ce_count_available[socket] = 1; 1502 pvt->ce_count_available[socket] = 1;
1366 1503
1367 /* Store the new values */ 1504 /* Store the new values */
1368 pvt->last_ce_count[socket][2] = new2; 1505 pvt->udimm_last_ce_count[socket][2] = new2;
1369 pvt->last_ce_count[socket][1] = new1; 1506 pvt->udimm_last_ce_count[socket][1] = new1;
1370 pvt->last_ce_count[socket][0] = new0; 1507 pvt->udimm_last_ce_count[socket][0] = new0;
1371} 1508}
1372 1509
1373/* 1510/*
@@ -1386,6 +1523,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
1386static void i7core_mce_output_error(struct mem_ctl_info *mci, 1523static void i7core_mce_output_error(struct mem_ctl_info *mci,
1387 struct mce *m) 1524 struct mce *m)
1388{ 1525{
1526 struct i7core_pvt *pvt = mci->pvt_info;
1389 char *type, *optype, *err, *msg; 1527 char *type, *optype, *err, *msg;
1390 unsigned long error = m->status & 0x1ff0000l; 1528 unsigned long error = m->status & 0x1ff0000l;
1391 u32 optypenum = (m->status >> 4) & 0x07; 1529 u32 optypenum = (m->status >> 4) & 0x07;
@@ -1394,6 +1532,7 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
1394 u32 channel = (m->misc >> 18) & 0x3; 1532 u32 channel = (m->misc >> 18) & 0x3;
1395 u32 syndrome = m->misc >> 32; 1533 u32 syndrome = m->misc >> 32;
1396 u32 errnum = find_first_bit(&error, 32); 1534 u32 errnum = find_first_bit(&error, 32);
1535 int csrow;
1397 1536
1398 if (m->mcgstatus & 1) 1537 if (m->mcgstatus & 1)
1399 type = "FATAL"; 1538 type = "FATAL";
@@ -1463,9 +1602,15 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
1463 1602
1464 debugf0("%s", msg); 1603 debugf0("%s", msg);
1465 1604
1605 csrow = pvt->csrow_map[m->cpu][channel][dimm];
1606
1466 /* Call the helper to output message */ 1607 /* Call the helper to output message */
1467 edac_mc_handle_fbd_ue(mci, 0 /* FIXME: should be rank here */, 1608 if (m->mcgstatus & 1)
1468 0, 0 /* FIXME: should be channel here */, msg); 1609 edac_mc_handle_fbd_ue(mci, csrow, 0,
1610 0 /* FIXME: should be channel here */, msg);
1611 else if (!pvt->is_registered)
1612 edac_mc_handle_fbd_ce(mci, csrow,
1613 0 /* FIXME: should be channel here */, msg);
1469 1614
1470 kfree(msg); 1615 kfree(msg);
1471} 1616}
@@ -1502,7 +1647,10 @@ static void i7core_check_error(struct mem_ctl_info *mci)
1502 1647
1503 /* check memory count errors */ 1648 /* check memory count errors */
1504 for (i = 0; i < pvt->sockets; i++) 1649 for (i = 0; i < pvt->sockets; i++)
1505 check_mc_test_err(mci, i); 1650 if (!pvt->is_registered)
1651 i7core_udimm_check_mc_ecc_err(mci, i);
1652 else
1653 i7core_rdimm_check_mc_ecc_err(mci, i);
1506} 1654}
1507 1655
1508/* 1656/*