about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/edac/sb_edac.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/edac/sb_edac.c')
-rw-r--r-- drivers/edac/sb_edac.c 212
1 file changed, 78 insertions, 134 deletions
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 123204f8e23..4adaf4b7da9 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -314,8 +314,6 @@ struct sbridge_pvt {
314 struct sbridge_info info; 314 struct sbridge_info info;
315 struct sbridge_channel channel[NUM_CHANNELS]; 315 struct sbridge_channel channel[NUM_CHANNELS];
316 316
317 int csrow_map[NUM_CHANNELS][MAX_DIMMS];
318
319 /* Memory type detection */ 317 /* Memory type detection */
320 bool is_mirrored, is_lockstep, is_close_pg; 318 bool is_mirrored, is_lockstep, is_close_pg;
321 319
@@ -487,29 +485,14 @@ static struct pci_dev *get_pdev_slot_func(u8 bus, unsigned slot,
487} 485}
488 486
489/** 487/**
490 * sbridge_get_active_channels() - gets the number of channels and csrows 488 * check_if_ecc_is_active() - Checks if ECC is active
491 * bus: Device bus 489 * bus: Device bus
492 * @channels: Number of channels that will be returned
493 * @csrows: Number of csrows found
494 *
495 * Since EDAC core needs to know in advance the number of available channels
496 * and csrows, in order to allocate memory for csrows/channels, it is needed
497 * to run two similar steps. At the first step, implemented on this function,
498 * it checks the number of csrows/channels present at one socket, identified
499 * by the associated PCI bus.
500 * this is used in order to properly allocate the size of mci components.
501 * Note: one csrow is one dimm.
502 */ 490 */
503static int sbridge_get_active_channels(const u8 bus, unsigned *channels, 491static int check_if_ecc_is_active(const u8 bus)
504 unsigned *csrows)
505{ 492{
506 struct pci_dev *pdev = NULL; 493 struct pci_dev *pdev = NULL;
507 int i, j;
508 u32 mcmtr; 494 u32 mcmtr;
509 495
510 *channels = 0;
511 *csrows = 0;
512
513 pdev = get_pdev_slot_func(bus, 15, 0); 496 pdev = get_pdev_slot_func(bus, 15, 0);
514 if (!pdev) { 497 if (!pdev) {
515 sbridge_printk(KERN_ERR, "Couldn't find PCI device " 498 sbridge_printk(KERN_ERR, "Couldn't find PCI device "
@@ -523,41 +506,14 @@ static int sbridge_get_active_channels(const u8 bus, unsigned *channels,
523 sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n"); 506 sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n");
524 return -ENODEV; 507 return -ENODEV;
525 } 508 }
526
527 for (i = 0; i < NUM_CHANNELS; i++) {
528 u32 mtr;
529
530 /* Device 15 functions 2 - 5 */
531 pdev = get_pdev_slot_func(bus, 15, 2 + i);
532 if (!pdev) {
533 sbridge_printk(KERN_ERR, "Couldn't find PCI device "
534 "%2x.%02d.%d!!!\n",
535 bus, 15, 2 + i);
536 return -ENODEV;
537 }
538 (*channels)++;
539
540 for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
541 pci_read_config_dword(pdev, mtr_regs[j], &mtr);
542 debugf1("Bus#%02x channel #%d MTR%d = %x\n", bus, i, j, mtr);
543 if (IS_DIMM_PRESENT(mtr))
544 (*csrows)++;
545 }
546 }
547
548 debugf0("Number of active channels: %d, number of active dimms: %d\n",
549 *channels, *csrows);
550
551 return 0; 509 return 0;
552} 510}
553 511
554static int get_dimm_config(const struct mem_ctl_info *mci) 512static int get_dimm_config(struct mem_ctl_info *mci)
555{ 513{
556 struct sbridge_pvt *pvt = mci->pvt_info; 514 struct sbridge_pvt *pvt = mci->pvt_info;
557 struct csrow_info *csr; 515 struct dimm_info *dimm;
558 int i, j, banks, ranks, rows, cols, size, npages; 516 int i, j, banks, ranks, rows, cols, size, npages;
559 int csrow = 0;
560 unsigned long last_page = 0;
561 u32 reg; 517 u32 reg;
562 enum edac_type mode; 518 enum edac_type mode;
563 enum mem_type mtype; 519 enum mem_type mtype;
@@ -616,6 +572,8 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
616 u32 mtr; 572 u32 mtr;
617 573
618 for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) { 574 for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
575 dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
576 i, j, 0);
619 pci_read_config_dword(pvt->pci_tad[i], 577 pci_read_config_dword(pvt->pci_tad[i],
620 mtr_regs[j], &mtr); 578 mtr_regs[j], &mtr);
621 debugf4("Channel #%d MTR%d = %x\n", i, j, mtr); 579 debugf4("Channel #%d MTR%d = %x\n", i, j, mtr);
@@ -634,29 +592,15 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
634 pvt->sbridge_dev->mc, i, j, 592 pvt->sbridge_dev->mc, i, j,
635 size, npages, 593 size, npages,
636 banks, ranks, rows, cols); 594 banks, ranks, rows, cols);
637 csr = &mci->csrows[csrow]; 595
638 596 dimm->nr_pages = npages;
639 csr->first_page = last_page; 597 dimm->grain = 32;
640 csr->last_page = last_page + npages - 1; 598 dimm->dtype = (banks == 8) ? DEV_X8 : DEV_X4;
641 csr->page_mask = 0UL; /* Unused */ 599 dimm->mtype = mtype;
642 csr->nr_pages = npages; 600 dimm->edac_mode = mode;
643 csr->grain = 32; 601 snprintf(dimm->label, sizeof(dimm->label),
644 csr->csrow_idx = csrow;
645 csr->dtype = (banks == 8) ? DEV_X8 : DEV_X4;
646 csr->ce_count = 0;
647 csr->ue_count = 0;
648 csr->mtype = mtype;
649 csr->edac_mode = mode;
650 csr->nr_channels = 1;
651 csr->channels[0].chan_idx = i;
652 csr->channels[0].ce_count = 0;
653 pvt->csrow_map[i][j] = csrow;
654 snprintf(csr->channels[0].label,
655 sizeof(csr->channels[0].label),
656 "CPU_SrcID#%u_Channel#%u_DIMM#%u", 602 "CPU_SrcID#%u_Channel#%u_DIMM#%u",
657 pvt->sbridge_dev->source_id, i, j); 603 pvt->sbridge_dev->source_id, i, j);
658 last_page += npages;
659 csrow++;
660 } 604 }
661 } 605 }
662 } 606 }
@@ -844,11 +788,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
844 u8 *socket, 788 u8 *socket,
845 long *channel_mask, 789 long *channel_mask,
846 u8 *rank, 790 u8 *rank,
847 char *area_type) 791 char **area_type, char *msg)
848{ 792{
849 struct mem_ctl_info *new_mci; 793 struct mem_ctl_info *new_mci;
850 struct sbridge_pvt *pvt = mci->pvt_info; 794 struct sbridge_pvt *pvt = mci->pvt_info;
851 char msg[256];
852 int n_rir, n_sads, n_tads, sad_way, sck_xch; 795 int n_rir, n_sads, n_tads, sad_way, sck_xch;
853 int sad_interl, idx, base_ch; 796 int sad_interl, idx, base_ch;
854 int interleave_mode; 797 int interleave_mode;
@@ -870,12 +813,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
870 */ 813 */
871 if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) { 814 if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) {
872 sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr); 815 sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr);
873 edac_mc_handle_ce_no_info(mci, msg);
874 return -EINVAL; 816 return -EINVAL;
875 } 817 }
876 if (addr >= (u64)pvt->tohm) { 818 if (addr >= (u64)pvt->tohm) {
877 sprintf(msg, "Error at MMIOH area, on addr 0x%016Lx", addr); 819 sprintf(msg, "Error at MMIOH area, on addr 0x%016Lx", addr);
878 edac_mc_handle_ce_no_info(mci, msg);
879 return -EINVAL; 820 return -EINVAL;
880 } 821 }
881 822
@@ -892,7 +833,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
892 limit = SAD_LIMIT(reg); 833 limit = SAD_LIMIT(reg);
893 if (limit <= prv) { 834 if (limit <= prv) {
894 sprintf(msg, "Can't discover the memory socket"); 835 sprintf(msg, "Can't discover the memory socket");
895 edac_mc_handle_ce_no_info(mci, msg);
896 return -EINVAL; 836 return -EINVAL;
897 } 837 }
898 if (addr <= limit) 838 if (addr <= limit)
@@ -901,10 +841,9 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
901 } 841 }
902 if (n_sads == MAX_SAD) { 842 if (n_sads == MAX_SAD) {
903 sprintf(msg, "Can't discover the memory socket"); 843 sprintf(msg, "Can't discover the memory socket");
904 edac_mc_handle_ce_no_info(mci, msg);
905 return -EINVAL; 844 return -EINVAL;
906 } 845 }
907 area_type = get_dram_attr(reg); 846 *area_type = get_dram_attr(reg);
908 interleave_mode = INTERLEAVE_MODE(reg); 847 interleave_mode = INTERLEAVE_MODE(reg);
909 848
910 pci_read_config_dword(pvt->pci_sad0, interleave_list[n_sads], 849 pci_read_config_dword(pvt->pci_sad0, interleave_list[n_sads],
@@ -942,7 +881,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
942 break; 881 break;
943 default: 882 default:
944 sprintf(msg, "Can't discover socket interleave"); 883 sprintf(msg, "Can't discover socket interleave");
945 edac_mc_handle_ce_no_info(mci, msg);
946 return -EINVAL; 884 return -EINVAL;
947 } 885 }
948 *socket = sad_interleave[idx]; 886 *socket = sad_interleave[idx];
@@ -957,7 +895,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
957 if (!new_mci) { 895 if (!new_mci) {
958 sprintf(msg, "Struct for socket #%u wasn't initialized", 896 sprintf(msg, "Struct for socket #%u wasn't initialized",
959 *socket); 897 *socket);
960 edac_mc_handle_ce_no_info(mci, msg);
961 return -EINVAL; 898 return -EINVAL;
962 } 899 }
963 mci = new_mci; 900 mci = new_mci;
@@ -973,7 +910,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
973 limit = TAD_LIMIT(reg); 910 limit = TAD_LIMIT(reg);
974 if (limit <= prv) { 911 if (limit <= prv) {
975 sprintf(msg, "Can't discover the memory channel"); 912 sprintf(msg, "Can't discover the memory channel");
976 edac_mc_handle_ce_no_info(mci, msg);
977 return -EINVAL; 913 return -EINVAL;
978 } 914 }
979 if (addr <= limit) 915 if (addr <= limit)
@@ -1013,7 +949,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
1013 break; 949 break;
1014 default: 950 default:
1015 sprintf(msg, "Can't discover the TAD target"); 951 sprintf(msg, "Can't discover the TAD target");
1016 edac_mc_handle_ce_no_info(mci, msg);
1017 return -EINVAL; 952 return -EINVAL;
1018 } 953 }
1019 *channel_mask = 1 << base_ch; 954 *channel_mask = 1 << base_ch;
@@ -1027,7 +962,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
1027 break; 962 break;
1028 default: 963 default:
1029 sprintf(msg, "Invalid mirror set. Can't decode addr"); 964 sprintf(msg, "Invalid mirror set. Can't decode addr");
1030 edac_mc_handle_ce_no_info(mci, msg);
1031 return -EINVAL; 965 return -EINVAL;
1032 } 966 }
1033 } else 967 } else
@@ -1055,7 +989,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
1055 if (offset > addr) { 989 if (offset > addr) {
1056 sprintf(msg, "Can't calculate ch addr: TAD offset 0x%08Lx is too high for addr 0x%08Lx!", 990 sprintf(msg, "Can't calculate ch addr: TAD offset 0x%08Lx is too high for addr 0x%08Lx!",
1057 offset, addr); 991 offset, addr);
1058 edac_mc_handle_ce_no_info(mci, msg);
1059 return -EINVAL; 992 return -EINVAL;
1060 } 993 }
1061 addr -= offset; 994 addr -= offset;
@@ -1095,7 +1028,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
1095 if (n_rir == MAX_RIR_RANGES) { 1028 if (n_rir == MAX_RIR_RANGES) {
1096 sprintf(msg, "Can't discover the memory rank for ch addr 0x%08Lx", 1029 sprintf(msg, "Can't discover the memory rank for ch addr 0x%08Lx",
1097 ch_addr); 1030 ch_addr);
1098 edac_mc_handle_ce_no_info(mci, msg);
1099 return -EINVAL; 1031 return -EINVAL;
1100 } 1032 }
1101 rir_way = RIR_WAY(reg); 1033 rir_way = RIR_WAY(reg);
@@ -1409,7 +1341,8 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
1409{ 1341{
1410 struct mem_ctl_info *new_mci; 1342 struct mem_ctl_info *new_mci;
1411 struct sbridge_pvt *pvt = mci->pvt_info; 1343 struct sbridge_pvt *pvt = mci->pvt_info;
1412 char *type, *optype, *msg, *recoverable_msg; 1344 enum hw_event_mc_err_type tp_event;
1345 char *type, *optype, msg[256];
1413 bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); 1346 bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
1414 bool overflow = GET_BITFIELD(m->status, 62, 62); 1347 bool overflow = GET_BITFIELD(m->status, 62, 62);
1415 bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); 1348 bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@@ -1421,13 +1354,21 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
1421 u32 optypenum = GET_BITFIELD(m->status, 4, 6); 1354 u32 optypenum = GET_BITFIELD(m->status, 4, 6);
1422 long channel_mask, first_channel; 1355 long channel_mask, first_channel;
1423 u8 rank, socket; 1356 u8 rank, socket;
1424 int csrow, rc, dimm; 1357 int rc, dimm;
1425 char *area_type = "Unknown"; 1358 char *area_type = NULL;
1426 1359
1427 if (ripv) 1360 if (uncorrected_error) {
1428 type = "NON_FATAL"; 1361 if (ripv) {
1429 else 1362 type = "FATAL";
1430 type = "FATAL"; 1363 tp_event = HW_EVENT_ERR_FATAL;
1364 } else {
1365 type = "NON_FATAL";
1366 tp_event = HW_EVENT_ERR_UNCORRECTED;
1367 }
1368 } else {
1369 type = "CORRECTED";
1370 tp_event = HW_EVENT_ERR_CORRECTED;
1371 }
1431 1372
1432 /* 1373 /*
1433 * According with Table 15-9 of the Intel Architecture spec vol 3A, 1374 * According with Table 15-9 of the Intel Architecture spec vol 3A,
@@ -1445,19 +1386,19 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
1445 } else { 1386 } else {
1446 switch (optypenum) { 1387 switch (optypenum) {
1447 case 0: 1388 case 0:
1448 optype = "generic undef request"; 1389 optype = "generic undef request error";
1449 break; 1390 break;
1450 case 1: 1391 case 1:
1451 optype = "memory read"; 1392 optype = "memory read error";
1452 break; 1393 break;
1453 case 2: 1394 case 2:
1454 optype = "memory write"; 1395 optype = "memory write error";
1455 break; 1396 break;
1456 case 3: 1397 case 3:
1457 optype = "addr/cmd"; 1398 optype = "addr/cmd error";
1458 break; 1399 break;
1459 case 4: 1400 case 4:
1460 optype = "memory scrubbing"; 1401 optype = "memory scrubbing error";
1461 break; 1402 break;
1462 default: 1403 default:
1463 optype = "reserved"; 1404 optype = "reserved";
@@ -1466,13 +1407,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
1466 } 1407 }
1467 1408
1468 rc = get_memory_error_data(mci, m->addr, &socket, 1409 rc = get_memory_error_data(mci, m->addr, &socket,
1469 &channel_mask, &rank, area_type); 1410 &channel_mask, &rank, &area_type, msg);
1470 if (rc < 0) 1411 if (rc < 0)
1471 return; 1412 goto err_parsing;
1472 new_mci = get_mci_for_node_id(socket); 1413 new_mci = get_mci_for_node_id(socket);
1473 if (!new_mci) { 1414 if (!new_mci) {
1474 edac_mc_handle_ce_no_info(mci, "Error: socket got corrupted!"); 1415 strcpy(msg, "Error: socket got corrupted!");
1475 return; 1416 goto err_parsing;
1476 } 1417 }
1477 mci = new_mci; 1418 mci = new_mci;
1478 pvt = mci->pvt_info; 1419 pvt = mci->pvt_info;
@@ -1486,45 +1427,39 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
1486 else 1427 else
1487 dimm = 2; 1428 dimm = 2;
1488 1429
1489 csrow = pvt->csrow_map[first_channel][dimm];
1490
1491 if (uncorrected_error && recoverable)
1492 recoverable_msg = " recoverable";
1493 else
1494 recoverable_msg = "";
1495 1430
1496 /* 1431 /*
1497 * FIXME: What should we do with "channel" information on mcelog? 1432 * FIXME: On some memory configurations (mirror, lockstep), the
1498 * Probably, we can just discard it, as the channel information 1433 * Memory Controller can't point the error to a single DIMM. The
1499 * comes from the get_memory_error_data() address decoding 1434 * EDAC core should be handling the channel mask, in order to point
1435 * to the group of dimm's where the error may be happening.
1500 */ 1436 */
1501 msg = kasprintf(GFP_ATOMIC, 1437 snprintf(msg, sizeof(msg),
1502 "%d %s error(s): %s on %s area %s%s: cpu=%d Err=%04x:%04x (ch=%d), " 1438 "count:%d%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d",
1503 "addr = 0x%08llx => socket=%d, Channel=%ld(mask=%ld), rank=%d\n", 1439 core_err_cnt,
1504 core_err_cnt, 1440 overflow ? " OVERFLOW" : "",
1505 area_type, 1441 (uncorrected_error && recoverable) ? " recoverable" : "",
1506 optype, 1442 area_type,
1507 type, 1443 mscod, errcode,
1508 recoverable_msg, 1444 socket,
1509 overflow ? "OVERFLOW" : "", 1445 channel_mask,
1510 m->cpu, 1446 rank);
1511 mscod, errcode,
1512 channel, /* 1111b means not specified */
1513 (long long) m->addr,
1514 socket,
1515 first_channel, /* This is the real channel on SB */
1516 channel_mask,
1517 rank);
1518 1447
1519 debugf0("%s", msg); 1448 debugf0("%s", msg);
1520 1449
1450 /* FIXME: need support for channel mask */
1451
1521 /* Call the helper to output message */ 1452 /* Call the helper to output message */
1522 if (uncorrected_error) 1453 edac_mc_handle_error(tp_event, mci,
1523 edac_mc_handle_fbd_ue(mci, csrow, 0, 0, msg); 1454 m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
1524 else 1455 channel, dimm, -1,
1525 edac_mc_handle_fbd_ce(mci, csrow, 0, msg); 1456 optype, msg, m);
1457 return;
1458err_parsing:
1459 edac_mc_handle_error(tp_event, mci, 0, 0, 0,
1460 -1, -1, -1,
1461 msg, "", m);
1526 1462
1527 kfree(msg);
1528} 1463}
1529 1464
1530/* 1465/*
@@ -1683,16 +1618,25 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev)
1683static int sbridge_register_mci(struct sbridge_dev *sbridge_dev) 1618static int sbridge_register_mci(struct sbridge_dev *sbridge_dev)
1684{ 1619{
1685 struct mem_ctl_info *mci; 1620 struct mem_ctl_info *mci;
1621 struct edac_mc_layer layers[2];
1686 struct sbridge_pvt *pvt; 1622 struct sbridge_pvt *pvt;
1687 int rc, channels, csrows; 1623 int rc;
1688 1624
1689 /* Check the number of active and not disabled channels */ 1625 /* Check the number of active and not disabled channels */
1690 rc = sbridge_get_active_channels(sbridge_dev->bus, &channels, &csrows); 1626 rc = check_if_ecc_is_active(sbridge_dev->bus);
1691 if (unlikely(rc < 0)) 1627 if (unlikely(rc < 0))
1692 return rc; 1628 return rc;
1693 1629
1694 /* allocate a new MC control structure */ 1630 /* allocate a new MC control structure */
1695 mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, sbridge_dev->mc); 1631 layers[0].type = EDAC_MC_LAYER_CHANNEL;
1632 layers[0].size = NUM_CHANNELS;
1633 layers[0].is_virt_csrow = false;
1634 layers[1].type = EDAC_MC_LAYER_SLOT;
1635 layers[1].size = MAX_DIMMS;
1636 layers[1].is_virt_csrow = true;
1637 mci = edac_mc_alloc(sbridge_dev->mc, ARRAY_SIZE(layers), layers,
1638 sizeof(*pvt));
1639
1696 if (unlikely(!mci)) 1640 if (unlikely(!mci))
1697 return -ENOMEM; 1641 return -ENOMEM;
1698 1642