about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Borislav Petkov <borislav.petkov@amd.com> 2012-08-30 12:01:36 -0400
committer: Borislav Petkov <bp@alien8.de> 2012-11-28 05:45:34 -0500
commit: 33ca0643c9a0ea50d0dc9bf0e9e9044502c7038c (patch)
tree: 70a65c1bcc5ea463c3afd38a02d3eccfff9e53ec
parent: c8d1adf092d8aa1ed947da789a99eee1130aa304 (diff)
amd64_edac: Reorganize error reporting path
Rewrite CE/UE paths so that they use the same code and drop additional code duplication in handle_ue. Add a struct err_info which collects required info for the error reporting. This, in turn, helps slimming all edac_mc_handle_error() calls down to one.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
-rw-r--r-- drivers/edac/amd64_edac.c 194
-rw-r--r-- drivers/edac/amd64_edac.h 19
2 files changed, 89 insertions, 124 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 987d6acd8f4e..d21efb246d43 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -709,10 +709,10 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,
709 709
710/* Map the Error address to a PAGE and PAGE OFFSET. */ 710/* Map the Error address to a PAGE and PAGE OFFSET. */
711static inline void error_address_to_page_and_offset(u64 error_address, 711static inline void error_address_to_page_and_offset(u64 error_address,
712 u32 *page, u32 *offset) 712 struct err_info *err)
713{ 713{
714 *page = (u32) (error_address >> PAGE_SHIFT); 714 err->page = (u32) (error_address >> PAGE_SHIFT);
715 *offset = ((u32) error_address) & ~PAGE_MASK; 715 err->offset = ((u32) error_address) & ~PAGE_MASK;
716} 716}
717 717
718/* 718/*
@@ -1023,59 +1023,44 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
1023} 1023}
1024 1024
1025static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, 1025static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
1026 u16 syndrome) 1026 struct err_info *err)
1027{ 1027{
1028 struct mem_ctl_info *src_mci;
1029 struct amd64_pvt *pvt = mci->pvt_info; 1028 struct amd64_pvt *pvt = mci->pvt_info;
1030 int channel, csrow;
1031 u32 page, offset;
1032 1029
1033 error_address_to_page_and_offset(sys_addr, &page, &offset); 1030 error_address_to_page_and_offset(sys_addr, err);
1034 1031
1035 /* 1032 /*
1036 * Find out which node the error address belongs to. This may be 1033 * Find out which node the error address belongs to. This may be
1037 * different from the node that detected the error. 1034 * different from the node that detected the error.
1038 */ 1035 */
1039 src_mci = find_mc_by_sys_addr(mci, sys_addr); 1036 err->src_mci = find_mc_by_sys_addr(mci, sys_addr);
1040 if (!src_mci) { 1037 if (!err->src_mci) {
1041 amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n", 1038 amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
1042 (unsigned long)sys_addr); 1039 (unsigned long)sys_addr);
1043 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, 1040 err->err_code = ERR_NODE;
1044 page, offset, syndrome,
1045 -1, -1, -1,
1046 "failed to map error addr to a node",
1047 "");
1048 return; 1041 return;
1049 } 1042 }
1050 1043
1051 /* Now map the sys_addr to a CSROW */ 1044 /* Now map the sys_addr to a CSROW */
1052 csrow = sys_addr_to_csrow(src_mci, sys_addr); 1045 err->csrow = sys_addr_to_csrow(err->src_mci, sys_addr);
1053 if (csrow < 0) { 1046 if (err->csrow < 0) {
1054 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, 1047 err->err_code = ERR_CSROW;
1055 page, offset, syndrome,
1056 -1, -1, -1,
1057 "failed to map error addr to a csrow",
1058 "");
1059 return; 1048 return;
1060 } 1049 }
1061 1050
1062 /* CHIPKILL enabled */ 1051 /* CHIPKILL enabled */
1063 if (pvt->nbcfg & NBCFG_CHIPKILL) { 1052 if (pvt->nbcfg & NBCFG_CHIPKILL) {
1064 channel = get_channel_from_ecc_syndrome(mci, syndrome); 1053 err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
1065 if (channel < 0) { 1054 if (err->channel < 0) {
1066 /* 1055 /*
1067 * Syndrome didn't map, so we don't know which of the 1056 * Syndrome didn't map, so we don't know which of the
1068 * 2 DIMMs is in error. So we need to ID 'both' of them 1057 * 2 DIMMs is in error. So we need to ID 'both' of them
1069 * as suspect. 1058 * as suspect.
1070 */ 1059 */
1071 amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - " 1060 amd64_mc_warn(err->src_mci, "unknown syndrome 0x%04x - "
1072 "possible error reporting race\n", 1061 "possible error reporting race\n",
1073 syndrome); 1062 err->syndrome);
1074 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, 1063 err->err_code = ERR_CHANNEL;
1075 page, offset, syndrome,
1076 csrow, -1, -1,
1077 "unknown syndrome - possible error reporting race",
1078 "");
1079 return; 1064 return;
1080 } 1065 }
1081 } else { 1066 } else {
@@ -1087,13 +1072,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
1087 * was obtained from email communication with someone at AMD. 1072 * was obtained from email communication with someone at AMD.
1088 * (Wish the email was placed in this comment - norsk) 1073 * (Wish the email was placed in this comment - norsk)
1089 */ 1074 */
1090 channel = ((sys_addr & BIT(3)) != 0); 1075 err->channel = ((sys_addr & BIT(3)) != 0);
1091 } 1076 }
1092
1093 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, 1,
1094 page, offset, syndrome,
1095 csrow, channel, -1,
1096 "", "");
1097} 1077}
1098 1078
1099static int ddr2_cs_size(unsigned i, bool dct_width) 1079static int ddr2_cs_size(unsigned i, bool dct_width)
@@ -1479,7 +1459,7 @@ static u64 f1x_swap_interleaved_region(struct amd64_pvt *pvt, u64 sys_addr)
1479 1459
1480/* For a given @dram_range, check if @sys_addr falls within it. */ 1460/* For a given @dram_range, check if @sys_addr falls within it. */
1481static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range, 1461static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
1482 u64 sys_addr, int *nid, int *chan_sel) 1462 u64 sys_addr, int *chan_sel)
1483{ 1463{
1484 int cs_found = -EINVAL; 1464 int cs_found = -EINVAL;
1485 u64 chan_addr; 1465 u64 chan_addr;
@@ -1552,15 +1532,14 @@ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
1552 1532
1553 cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel); 1533 cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel);
1554 1534
1555 if (cs_found >= 0) { 1535 if (cs_found >= 0)
1556 *nid = node_id;
1557 *chan_sel = channel; 1536 *chan_sel = channel;
1558 } 1537
1559 return cs_found; 1538 return cs_found;
1560} 1539}
1561 1540
1562static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, 1541static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
1563 int *node, int *chan_sel) 1542 int *chan_sel)
1564{ 1543{
1565 int cs_found = -EINVAL; 1544 int cs_found = -EINVAL;
1566 unsigned range; 1545 unsigned range;
@@ -1574,8 +1553,7 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
1574 (get_dram_limit(pvt, range) >= sys_addr)) { 1553 (get_dram_limit(pvt, range) >= sys_addr)) {
1575 1554
1576 cs_found = f1x_match_to_this_node(pvt, range, 1555 cs_found = f1x_match_to_this_node(pvt, range,
1577 sys_addr, node, 1556 sys_addr, chan_sel);
1578 chan_sel);
1579 if (cs_found >= 0) 1557 if (cs_found >= 0)
1580 break; 1558 break;
1581 } 1559 }
@@ -1591,22 +1569,15 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
1591 * (MCX_ADDR). 1569 * (MCX_ADDR).
1592 */ 1570 */
1593static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, 1571static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
1594 u16 syndrome) 1572 struct err_info *err)
1595{ 1573{
1596 struct amd64_pvt *pvt = mci->pvt_info; 1574 struct amd64_pvt *pvt = mci->pvt_info;
1597 u32 page, offset;
1598 int nid, csrow, chan = 0;
1599 1575
1600 error_address_to_page_and_offset(sys_addr, &page, &offset); 1576 error_address_to_page_and_offset(sys_addr, err);
1601 1577
1602 csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan); 1578 err->csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &err->channel);
1603 1579 if (err->csrow < 0) {
1604 if (csrow < 0) { 1580 err->err_code = ERR_CSROW;
1605 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
1606 page, offset, syndrome,
1607 -1, -1, -1,
1608 "failed to map error addr to a csrow",
1609 "");
1610 return; 1581 return;
1611 } 1582 }
1612 1583
@@ -1616,12 +1587,7 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
1616 * this point. 1587 * this point.
1617 */ 1588 */
1618 if (dct_ganging_enabled(pvt)) 1589 if (dct_ganging_enabled(pvt))
1619 chan = get_channel_from_ecc_syndrome(mci, syndrome); 1590 err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
1620
1621 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
1622 page, offset, syndrome,
1623 csrow, chan, -1,
1624 "", "");
1625} 1591}
1626 1592
1627/* 1593/*
@@ -1890,78 +1856,54 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
1890 return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz); 1856 return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
1891} 1857}
1892 1858
1893/* 1859static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err,
1894 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR 1860 u8 ecc_type)
1895 * ADDRESS and process.
1896 */
1897static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m)
1898{ 1861{
1899 struct amd64_pvt *pvt = mci->pvt_info; 1862 enum hw_event_mc_err_type err_type;
1900 u64 sys_addr; 1863 const char *string;
1901 u16 syndrome;
1902
1903 sys_addr = get_error_address(m);
1904 syndrome = extract_syndrome(m->status);
1905
1906 amd64_mc_err(mci, "CE ERROR_ADDRESS= 0x%llx\n", sys_addr);
1907 1864
1908 pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, syndrome); 1865 if (ecc_type == 2)
1909} 1866 err_type = HW_EVENT_ERR_CORRECTED;
1910 1867 else if (ecc_type == 1)
1911/* Handle any Un-correctable Errors (UEs) */ 1868 err_type = HW_EVENT_ERR_UNCORRECTED;
1912static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) 1869 else {
1913{ 1870 WARN(1, "Something is rotten in the state of Denmark.\n");
1914 struct mem_ctl_info *log_mci, *src_mci = NULL;
1915 int csrow;
1916 u64 sys_addr;
1917 u32 page, offset;
1918
1919 log_mci = mci;
1920
1921 sys_addr = get_error_address(m);
1922 error_address_to_page_and_offset(sys_addr, &page, &offset);
1923
1924 /*
1925 * Find out which node the error address belongs to. This may be
1926 * different from the node that detected the error.
1927 */
1928 src_mci = find_mc_by_sys_addr(mci, sys_addr);
1929 if (!src_mci) {
1930 amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n",
1931 (unsigned long)sys_addr);
1932 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
1933 page, offset, 0,
1934 -1, -1, -1,
1935 "ERROR ADDRESS NOT mapped to a MC",
1936 "");
1937 return; 1871 return;
1938 } 1872 }
1939 1873
1940 log_mci = src_mci; 1874 switch (err->err_code) {
1941 1875 case DECODE_OK:
1942 csrow = sys_addr_to_csrow(log_mci, sys_addr); 1876 string = "";
1943 if (csrow < 0) { 1877 break;
1944 amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n", 1878 case ERR_NODE:
1945 (unsigned long)sys_addr); 1879 string = "Failed to map error addr to a node";
1946 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, 1880 break;
1947 page, offset, 0, 1881 case ERR_CSROW:
1948 -1, -1, -1, 1882 string = "Failed to map error addr to a csrow";
1949 "ERROR ADDRESS NOT mapped to CS", 1883 break;
1950 ""); 1884 case ERR_CHANNEL:
1951 } else { 1885 string = "unknown syndrome - possible error reporting race";
1952 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, 1886 break;
1953 page, offset, 0, 1887 default:
1954 csrow, -1, -1, 1888 string = "WTF error";
1955 "", ""); 1889 break;
1956 } 1890 }
1891
1892 edac_mc_handle_error(err_type, mci, 1,
1893 err->page, err->offset, err->syndrome,
1894 err->csrow, err->channel, -1,
1895 string, "");
1957} 1896}
1958 1897
1959static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, 1898static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
1960 struct mce *m) 1899 struct mce *m)
1961{ 1900{
1901 struct amd64_pvt *pvt = mci->pvt_info;
1962 u8 ecc_type = (m->status >> 45) & 0x3; 1902 u8 ecc_type = (m->status >> 45) & 0x3;
1963 u8 xec = XEC(m->status, 0x1f); 1903 u8 xec = XEC(m->status, 0x1f);
1964 u16 ec = EC(m->status); 1904 u16 ec = EC(m->status);
1905 u64 sys_addr;
1906 struct err_info err;
1965 1907
1966 /* Bail out early if this was an 'observed' error */ 1908 /* Bail out early if this was an 'observed' error */
1967 if (PP(ec) == NBSL_PP_OBS) 1909 if (PP(ec) == NBSL_PP_OBS)
@@ -1971,10 +1913,16 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
1971 if (xec && xec != F10_NBSL_EXT_ERR_ECC) 1913 if (xec && xec != F10_NBSL_EXT_ERR_ECC)
1972 return; 1914 return;
1973 1915
1916 memset(&err, 0, sizeof(err));
1917
1918 sys_addr = get_error_address(m);
1919
1974 if (ecc_type == 2) 1920 if (ecc_type == 2)
1975 amd64_handle_ce(mci, m); 1921 err.syndrome = extract_syndrome(m->status);
1976 else if (ecc_type == 1) 1922
1977 amd64_handle_ue(mci, m); 1923 pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err);
1924
1925 __log_bus_error(mci, &err, ecc_type);
1978} 1926}
1979 1927
1980void amd64_decode_bus_error(int node_id, struct mce *m) 1928void amd64_decode_bus_error(int node_id, struct mce *m)
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index cf7981e1f063..abefab4722c2 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -376,6 +376,23 @@ struct amd64_pvt {
376 struct error_injection injection; 376 struct error_injection injection;
377}; 377};
378 378
379enum err_codes {
380 DECODE_OK = 0,
381 ERR_NODE = -1,
382 ERR_CSROW = -2,
383 ERR_CHANNEL = -3,
384};
385
386struct err_info {
387 int err_code;
388 struct mem_ctl_info *src_mci;
389 int csrow;
390 int channel;
391 u16 syndrome;
392 u32 page;
393 u32 offset;
394};
395
379static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i) 396static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
380{ 397{
381 u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8; 398 u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
@@ -449,7 +466,7 @@ static inline void amd64_remove_sysfs_inject_files(struct mem_ctl_info *mci)
449struct low_ops { 466struct low_ops {
450 int (*early_channel_count) (struct amd64_pvt *pvt); 467 int (*early_channel_count) (struct amd64_pvt *pvt);
451 void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, 468 void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr,
452 u16 syndrome); 469 struct err_info *);
453 int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, unsigned cs_mode); 470 int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, unsigned cs_mode);
454 int (*read_dct_pci_cfg) (struct amd64_pvt *pvt, int offset, 471 int (*read_dct_pci_cfg) (struct amd64_pvt *pvt, int offset,
455 u32 *val, const char *func); 472 u32 *val, const char *func);