author		Borislav Petkov <borislav.petkov@amd.com>	2012-08-30 12:01:36 -0400
committer	Borislav Petkov <bp@alien8.de>			2012-11-28 05:45:34 -0500
commit		33ca0643c9a0ea50d0dc9bf0e9e9044502c7038c (patch)
tree		70a65c1bcc5ea463c3afd38a02d3eccfff9e53ec /drivers/edac
parent		c8d1adf092d8aa1ed947da789a99eee1130aa304 (diff)
amd64_edac: Reorganize error reporting path
Rewrite the CE and UE paths so that they share the same code, dropping the
additional code duplication in handle_ue(). Add a struct err_info which
collects the information required for error reporting. This, in turn, lets
us slim all edac_mc_handle_error() calls down to a single one.
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r--	drivers/edac/amd64_edac.c	194
-rw-r--r--	drivers/edac/amd64_edac.h	 19
2 files changed, 89 insertions, 124 deletions
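In outline, the reorganization funnels both the correctable and uncorrectable paths through one reporting call: the per-family decoder fills a struct err_info, and a single logging helper turns its err_code into the message string handed to edac_mc_handle_error(). The stand-alone sketch below models that shape in plain user-space C; the kernel types are stubbed out and the helper names (map_sysaddr_to_csrow(), log_bus_error()) only mirror the patch, so treat it as an illustration rather than the driver code itself.

/*
 * Stand-alone illustration of the reorganized flow (user-space C, kernel
 * types stubbed out): the decoder fills struct err_info, and one logging
 * helper maps err_code to a message, replacing the per-branch calls.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

enum err_codes { DECODE_OK = 0, ERR_NODE = -1, ERR_CSROW = -2, ERR_CHANNEL = -3 };

/* Everything the final report needs, collected in one place. */
struct err_info {
	int err_code;
	int csrow;
	int channel;
	uint16_t syndrome;
	uint32_t page;
	uint32_t offset;
};

/* Stand-in for the per-family ops->map_sysaddr_to_csrow() hook. */
static void map_sysaddr_to_csrow(uint64_t sys_addr, struct err_info *err)
{
	err->page     = (uint32_t)(sys_addr >> 12);	/* PAGE_SHIFT assumed to be 12 */
	err->offset   = (uint32_t)sys_addr & 0xfff;
	err->csrow    = 2;				/* pretend the lookup succeeded */
	err->channel  = 0;
	err->err_code = DECODE_OK;			/* or ERR_NODE/ERR_CSROW/ERR_CHANNEL on failure */
}

/* Single reporting helper shared by the CE and UE paths. */
static void log_bus_error(const struct err_info *err, uint8_t ecc_type)
{
	const char *string;

	switch (err->err_code) {
	case DECODE_OK:
		string = "";
		break;
	case ERR_NODE:
		string = "Failed to map error addr to a node";
		break;
	case ERR_CSROW:
		string = "Failed to map error addr to a csrow";
		break;
	case ERR_CHANNEL:
		string = "unknown syndrome - possible error reporting race";
		break;
	default:
		string = "unexpected decode state";
		break;
	}

	/* In the driver this is the one remaining edac_mc_handle_error() call. */
	printf("%s: page=0x%x offset=0x%x syndrome=0x%04x csrow=%d channel=%d %s\n",
	       ecc_type == 2 ? "CE" : "UE", err->page, err->offset,
	       err->syndrome, err->csrow, err->channel, string);
}

int main(void)
{
	struct err_info err;

	memset(&err, 0, sizeof(err));		/* same zero-init the patch adds */
	map_sysaddr_to_csrow(0x12345678ULL, &err);
	log_bus_error(&err, 2);			/* 2 == correctable ECC in this MCA encoding */
	return 0;
}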
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 987d6acd8f4e..d21efb246d43 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -709,10 +709,10 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,
 
 /* Map the Error address to a PAGE and PAGE OFFSET. */
 static inline void error_address_to_page_and_offset(u64 error_address,
-						     u32 *page, u32 *offset)
+						     struct err_info *err)
 {
-	*page = (u32) (error_address >> PAGE_SHIFT);
-	*offset = ((u32) error_address) & ~PAGE_MASK;
+	err->page = (u32) (error_address >> PAGE_SHIFT);
+	err->offset = ((u32) error_address) & ~PAGE_MASK;
 }
 
 /*
@@ -1023,59 +1023,44 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
 }
 
 static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
-				    u16 syndrome)
+				    struct err_info *err)
 {
-	struct mem_ctl_info *src_mci;
 	struct amd64_pvt *pvt = mci->pvt_info;
-	int channel, csrow;
-	u32 page, offset;
 
-	error_address_to_page_and_offset(sys_addr, &page, &offset);
+	error_address_to_page_and_offset(sys_addr, err);
 
 	/*
	 * Find out which node the error address belongs to. This may be
	 * different from the node that detected the error.
	 */
-	src_mci = find_mc_by_sys_addr(mci, sys_addr);
-	if (!src_mci) {
+	err->src_mci = find_mc_by_sys_addr(mci, sys_addr);
+	if (!err->src_mci) {
 		amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
			     (unsigned long)sys_addr);
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-				     page, offset, syndrome,
-				     -1, -1, -1,
-				     "failed to map error addr to a node",
-				     "");
+		err->err_code = ERR_NODE;
 		return;
 	}
 
 	/* Now map the sys_addr to a CSROW */
-	csrow = sys_addr_to_csrow(src_mci, sys_addr);
-	if (csrow < 0) {
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-				     page, offset, syndrome,
-				     -1, -1, -1,
-				     "failed to map error addr to a csrow",
-				     "");
+	err->csrow = sys_addr_to_csrow(err->src_mci, sys_addr);
+	if (err->csrow < 0) {
+		err->err_code = ERR_CSROW;
 		return;
 	}
 
 	/* CHIPKILL enabled */
 	if (pvt->nbcfg & NBCFG_CHIPKILL) {
-		channel = get_channel_from_ecc_syndrome(mci, syndrome);
-		if (channel < 0) {
+		err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
+		if (err->channel < 0) {
			/*
			 * Syndrome didn't map, so we don't know which of the
			 * 2 DIMMs is in error. So we need to ID 'both' of them
			 * as suspect.
			 */
-			amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - "
+			amd64_mc_warn(err->src_mci, "unknown syndrome 0x%04x - "
				      "possible error reporting race\n",
-				      syndrome);
-			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-					     page, offset, syndrome,
-					     csrow, -1, -1,
-					     "unknown syndrome - possible error reporting race",
-					     "");
+				      err->syndrome);
+			err->err_code = ERR_CHANNEL;
			return;
		}
	} else {
@@ -1087,13 +1072,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
	 * was obtained from email communication with someone at AMD.
	 * (Wish the email was placed in this comment - norsk)
	 */
-		channel = ((sys_addr & BIT(3)) != 0);
+		err->channel = ((sys_addr & BIT(3)) != 0);
	}
-
-	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, 1,
-			     page, offset, syndrome,
-			     csrow, channel, -1,
-			     "", "");
 }
 
 static int ddr2_cs_size(unsigned i, bool dct_width)
@@ -1479,7 +1459,7 @@ static u64 f1x_swap_interleaved_region(struct amd64_pvt *pvt, u64 sys_addr)
 
 /* For a given @dram_range, check if @sys_addr falls within it. */
 static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
-				  u64 sys_addr, int *nid, int *chan_sel)
+				  u64 sys_addr, int *chan_sel)
 {
 	int cs_found = -EINVAL;
 	u64 chan_addr;
@@ -1552,15 +1532,14 @@ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
 
 	cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel);
 
-	if (cs_found >= 0) {
-		*nid = node_id;
+	if (cs_found >= 0)
 		*chan_sel = channel;
-	}
+
 	return cs_found;
 }
 
 static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
-				       int *node, int *chan_sel)
+				       int *chan_sel)
 {
 	int cs_found = -EINVAL;
 	unsigned range;
@@ -1574,8 +1553,7 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
		    (get_dram_limit(pvt, range) >= sys_addr)) {
 
			cs_found = f1x_match_to_this_node(pvt, range,
-							  sys_addr, node,
-							  chan_sel);
+							  sys_addr, chan_sel);
			if (cs_found >= 0)
				break;
		}
@@ -1591,22 +1569,15 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
  * (MCX_ADDR).
  */
 static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
-				     u16 syndrome)
+				     struct err_info *err)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
-	u32 page, offset;
-	int nid, csrow, chan = 0;
 
-	error_address_to_page_and_offset(sys_addr, &page, &offset);
+	error_address_to_page_and_offset(sys_addr, err);
 
-	csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan);
-
-	if (csrow < 0) {
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-				     page, offset, syndrome,
-				     -1, -1, -1,
-				     "failed to map error addr to a csrow",
-				     "");
+	err->csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &err->channel);
+	if (err->csrow < 0) {
+		err->err_code = ERR_CSROW;
 		return;
 	}
 
@@ -1616,12 +1587,7 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
	 * this point.
	 */
	if (dct_ganging_enabled(pvt))
-		chan = get_channel_from_ecc_syndrome(mci, syndrome);
-
-	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-			     page, offset, syndrome,
-			     csrow, chan, -1,
-			     "", "");
+		err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
 }
 
 /*
@@ -1890,78 +1856,54 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
 	return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
 }
 
-/*
- * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
- * ADDRESS and process.
- */
-static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m)
+static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err,
+			    u8 ecc_type)
 {
-	struct amd64_pvt *pvt = mci->pvt_info;
-	u64 sys_addr;
-	u16 syndrome;
-
-	sys_addr = get_error_address(m);
-	syndrome = extract_syndrome(m->status);
-
-	amd64_mc_err(mci, "CE ERROR_ADDRESS= 0x%llx\n", sys_addr);
+	enum hw_event_mc_err_type err_type;
+	const char *string;
 
-	pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, syndrome);
-}
-
-/* Handle any Un-correctable Errors (UEs) */
-static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m)
-{
-	struct mem_ctl_info *log_mci, *src_mci = NULL;
-	int csrow;
-	u64 sys_addr;
-	u32 page, offset;
-
-	log_mci = mci;
-
-	sys_addr = get_error_address(m);
-	error_address_to_page_and_offset(sys_addr, &page, &offset);
-
-	/*
-	 * Find out which node the error address belongs to. This may be
-	 * different from the node that detected the error.
-	 */
-	src_mci = find_mc_by_sys_addr(mci, sys_addr);
-	if (!src_mci) {
-		amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n",
-				(unsigned long)sys_addr);
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-				     page, offset, 0,
-				     -1, -1, -1,
-				     "ERROR ADDRESS NOT mapped to a MC",
-				     "");
+	if (ecc_type == 2)
+		err_type = HW_EVENT_ERR_CORRECTED;
+	else if (ecc_type == 1)
+		err_type = HW_EVENT_ERR_UNCORRECTED;
+	else {
+		WARN(1, "Something is rotten in the state of Denmark.\n");
 		return;
 	}
 
-	log_mci = src_mci;
-
-	csrow = sys_addr_to_csrow(log_mci, sys_addr);
-	if (csrow < 0) {
-		amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n",
-				(unsigned long)sys_addr);
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-				     page, offset, 0,
-				     -1, -1, -1,
-				     "ERROR ADDRESS NOT mapped to CS",
-				     "");
-	} else {
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-				     page, offset, 0,
-				     csrow, -1, -1,
-				     "", "");
+	switch (err->err_code) {
+	case DECODE_OK:
+		string = "";
+		break;
+	case ERR_NODE:
+		string = "Failed to map error addr to a node";
+		break;
+	case ERR_CSROW:
+		string = "Failed to map error addr to a csrow";
+		break;
+	case ERR_CHANNEL:
+		string = "unknown syndrome - possible error reporting race";
+		break;
+	default:
+		string = "WTF error";
+		break;
 	}
+
+	edac_mc_handle_error(err_type, mci, 1,
+			     err->page, err->offset, err->syndrome,
+			     err->csrow, err->channel, -1,
+			     string, "");
 }
 
 static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
					    struct mce *m)
 {
+	struct amd64_pvt *pvt = mci->pvt_info;
 	u8 ecc_type = (m->status >> 45) & 0x3;
 	u8 xec = XEC(m->status, 0x1f);
 	u16 ec = EC(m->status);
+	u64 sys_addr;
+	struct err_info err;
 
 	/* Bail out early if this was an 'observed' error */
 	if (PP(ec) == NBSL_PP_OBS)
@@ -1971,10 +1913,16 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
 	if (xec && xec != F10_NBSL_EXT_ERR_ECC)
 		return;
 
+	memset(&err, 0, sizeof(err));
+
+	sys_addr = get_error_address(m);
+
 	if (ecc_type == 2)
-		amd64_handle_ce(mci, m);
-	else if (ecc_type == 1)
-		amd64_handle_ue(mci, m);
+		err.syndrome = extract_syndrome(m->status);
+
+	pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err);
+
+	__log_bus_error(mci, &err, ecc_type);
 }
 
 void amd64_decode_bus_error(int node_id, struct mce *m)
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index cf7981e1f063..abefab4722c2 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -376,6 +376,23 @@ struct amd64_pvt {
 	struct error_injection injection;
 };
 
+enum err_codes {
+	DECODE_OK	=  0,
+	ERR_NODE	= -1,
+	ERR_CSROW	= -2,
+	ERR_CHANNEL	= -3,
+};
+
+struct err_info {
+	int err_code;
+	struct mem_ctl_info *src_mci;
+	int csrow;
+	int channel;
+	u16 syndrome;
+	u32 page;
+	u32 offset;
+};
+
 static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
 {
 	u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
@@ -449,7 +466,7 @@ static inline void amd64_remove_sysfs_inject_files(struct mem_ctl_info *mci)
 struct low_ops {
 	int (*early_channel_count)	(struct amd64_pvt *pvt);
 	void (*map_sysaddr_to_csrow)	(struct mem_ctl_info *mci, u64 sys_addr,
-					 u16 syndrome);
+					 struct err_info *);
 	int (*dbam_to_cs)		(struct amd64_pvt *pvt, u8 dct, unsigned cs_mode);
 	int (*read_dct_pci_cfg)		(struct amd64_pvt *pvt, int offset,
					 u32 *val, const char *func);