 arch/x86/kernel/cpu/mcheck/mce.c    |  12
 drivers/edac/Makefile               |   6
 drivers/edac/amd64_edac.c           | 328
 drivers/edac/amd64_edac.h           |  71
 drivers/edac/amd64_edac_dbg.c       |   2
 drivers/edac/amd64_edac_err_types.c | 161
 drivers/edac/edac_mce_amd.c         | 422
 drivers/edac/edac_mce_amd.h         |  69
 8 files changed, 619 insertions(+), 452 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62f..9bfe9d2ea615 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
 	set_bit(0, &mce_need_notify);
 }
 
+void __weak decode_mce(struct mce *m)
+{
+	return;
+}
+
 static void print_mce(struct mce *m)
 {
 	printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
 	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
 		m->cpuvendor, m->cpuid, m->time, m->socketid,
 		m->apicid);
+
+	decode_mce(m);
 }
 
 static void print_mce_head(void)
@@ -215,7 +222,10 @@ static void print_mce_head(void)
 static void print_mce_tail(void)
 {
 	printk(KERN_EMERG "This is not a software problem!\n"
-	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
+	       "Run through mcelog --ascii to decode and contact your hardware vendor\n"
+#endif
+	       );
 }
 
 #define PANIC_TIMEOUT 5 /* 5 seconds */
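
The decode_mce() stub added above is a weak symbol: it is linked in only when nothing else in the kernel image provides a strong decode_mce(), so the real decoder added in drivers/edac/edac_mce_amd.c below replaces it transparently once CONFIG_EDAC and CONFIG_CPU_SUP_AMD are enabled. A minimal, self-contained sketch of that override pattern follows; the names core.c, decoder.c, decode_event() and report_event() are made up for illustration, and in the kernel __weak simply expands to __attribute__((weak)).

	/* core.c: provides the overridable default hook */
	#include <stdio.h>

	void __attribute__((weak)) decode_event(unsigned int code)
	{
		/* default: nothing to do */
	}

	void report_event(unsigned int code)
	{
		printf("event %u\n", code);
		decode_event(code);	/* resolves to the strong version when one is linked in */
	}

	/* decoder.c: linking this object silently overrides the weak default */
	#include <stdio.h>

	void decode_event(unsigned int code)
	{
		printf("decoded: %u\n", code);
	}
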
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 98aa4a7db412..cfa033ce53a7 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -17,6 +17,10 @@ ifdef CONFIG_PCI
 edac_core-objs	+= edac_pci.o edac_pci_sysfs.o
 endif
 
+ifdef CONFIG_CPU_SUP_AMD
+edac_core-objs	+= edac_mce_amd.o
+endif
+
 obj-$(CONFIG_EDAC_AMD76X)		+= amd76x_edac.o
 obj-$(CONFIG_EDAC_CPC925)		+= cpc925_edac.o
 obj-$(CONFIG_EDAC_I5000)		+= i5000_edac.o
@@ -32,7 +36,7 @@ obj-$(CONFIG_EDAC_X38)			+= x38_edac.o
 obj-$(CONFIG_EDAC_I82860)		+= i82860_edac.o
 obj-$(CONFIG_EDAC_R82600)		+= r82600_edac.o
 
-amd64_edac_mod-y := amd64_edac_err_types.o amd64_edac.o
+amd64_edac_mod-y := amd64_edac.o
 amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o
 amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o
 
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e2a10bcba7a1..173dc4a84166 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -19,6 +19,63 @@ static struct mem_ctl_info *mci_lookup[MAX_NUMNODES];
 static struct amd64_pvt *pvt_lookup[MAX_NUMNODES];
 
 /*
+ * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
+ * for DDR2 DRAM mapping.
+ */
+u32 revf_quad_ddr2_shift[] = {
+	0,	/* 0000b NULL DIMM (128mb) */
+	28,	/* 0001b 256mb */
+	29,	/* 0010b 512mb */
+	29,	/* 0011b 512mb */
+	29,	/* 0100b 512mb */
+	30,	/* 0101b 1gb */
+	30,	/* 0110b 1gb */
+	31,	/* 0111b 2gb */
+	31,	/* 1000b 2gb */
+	32,	/* 1001b 4gb */
+	32,	/* 1010b 4gb */
+	33,	/* 1011b 8gb */
+	0,	/* 1100b future */
+	0,	/* 1101b future */
+	0,	/* 1110b future */
+	0	/* 1111b future */
+};
+
+/*
+ * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
+ * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
+ * or higher value'.
+ *
+ *FIXME: Produce a better mapping/linearisation.
+ */
+
+struct scrubrate scrubrates[] = {
+	{ 0x01, 1600000000UL},
+	{ 0x02, 800000000UL},
+	{ 0x03, 400000000UL},
+	{ 0x04, 200000000UL},
+	{ 0x05, 100000000UL},
+	{ 0x06, 50000000UL},
+	{ 0x07, 25000000UL},
+	{ 0x08, 12284069UL},
+	{ 0x09, 6274509UL},
+	{ 0x0A, 3121951UL},
+	{ 0x0B, 1560975UL},
+	{ 0x0C, 781440UL},
+	{ 0x0D, 390720UL},
+	{ 0x0E, 195300UL},
+	{ 0x0F, 97650UL},
+	{ 0x10, 48854UL},
+	{ 0x11, 24427UL},
+	{ 0x12, 12213UL},
+	{ 0x13, 6101UL},
+	{ 0x14, 3051UL},
+	{ 0x15, 1523UL},
+	{ 0x16, 761UL},
+	{ 0x00, 0UL},	/* scrubbing off */
+};
+
+/*
  * Memory scrubber control interface. For K8, memory scrubbing is handled by
  * hardware and can involve L2 cache, dcache as well as the main memory. With
  * F10, this is extended to L3 cache scrubbing on CPU models sporting that
@@ -693,7 +750,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow,
  * specific.
  */
 static u64 extract_error_address(struct mem_ctl_info *mci,
-				 struct amd64_error_info_regs *info)
+				 struct err_regs *info)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
 
@@ -1049,7 +1106,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt)
 
 /* extract the ERROR ADDRESS for the K8 CPUs */
 static u64 k8_get_error_address(struct mem_ctl_info *mci,
-				struct amd64_error_info_regs *info)
+				struct err_regs *info)
 {
 	return (((u64) (info->nbeah & 0xff)) << 32) +
 			(info->nbeal & ~0x03);
@@ -1092,7 +1149,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
 }
 
 static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
-				    struct amd64_error_info_regs *info,
+				    struct err_regs *info,
 				    u64 SystemAddress)
 {
 	struct mem_ctl_info *src_mci;
@@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
 	u32 page, offset;
 
 	/* Extract the syndrome parts and form a 16-bit syndrome */
-	syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
-	syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+	syndrome = HIGH_SYNDROME(info->nbsl) << 8;
+	syndrome |= LOW_SYNDROME(info->nbsh);
 
 	/* CHIPKILL enabled */
 	if (info->nbcfg & K8_NBCFG_CHIPKILL) {
@@ -1311,7 +1368,7 @@ static void amd64_teardown(struct amd64_pvt *pvt)
 }
 
 static u64 f10_get_error_address(struct mem_ctl_info *mci,
-				 struct amd64_error_info_regs *info)
+				 struct err_regs *info)
 {
 	return (((u64) (info->nbeah & 0xffff)) << 32) +
 			(info->nbeal & ~0x01);
@@ -1688,7 +1745,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
  * The @sys_addr is usually an error address received from the hardware.
  */
 static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
-				     struct amd64_error_info_regs *info,
+				     struct err_regs *info,
 				     u64 sys_addr)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
@@ -1701,8 +1758,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
 	if (csrow >= 0) {
 		error_address_to_page_and_offset(sys_addr, &page, &offset);
 
-		syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
-		syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+		syndrome = HIGH_SYNDROME(info->nbsl) << 8;
+		syndrome |= LOW_SYNDROME(info->nbsh);
 
 		/*
 		 * Is CHIPKILL on? If so, then we can attempt to use the
@@ -2045,7 +2102,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
  * - 0: if no valid error is indicated
  */
 static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
-				     struct amd64_error_info_regs *regs)
+				     struct err_regs *regs)
 {
 	struct amd64_pvt *pvt;
 	struct pci_dev *misc_f3_ctl;
@@ -2094,10 +2151,10 @@ err_reg:
  * - 0: if no error is found
  */
 static int amd64_get_error_info(struct mem_ctl_info *mci,
-				struct amd64_error_info_regs *info)
+				struct err_regs *info)
 {
 	struct amd64_pvt *pvt;
-	struct amd64_error_info_regs regs;
+	struct err_regs regs;
 
 	pvt = mci->pvt_info;
 
@@ -2152,48 +2209,12 @@ static int amd64_get_error_info(struct mem_ctl_info *mci,
 	return 1;
 }
 
-static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
-					 struct amd64_error_info_regs *info)
-{
-	u32 err_code;
-	u32 ec_tt;		/* error code transaction type (2b) */
-	u32 ec_ll;		/* error code cache level (2b) */
-
-	err_code = EXTRACT_ERROR_CODE(info->nbsl);
-	ec_ll = EXTRACT_LL_CODE(err_code);
-	ec_tt = EXTRACT_TT_CODE(err_code);
-
-	amd64_mc_printk(mci, KERN_ERR,
-		     "GART TLB event: transaction type(%s), "
-		     "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
-}
-
-static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
-				      struct amd64_error_info_regs *info)
-{
-	u32 err_code;
-	u32 ec_rrrr;		/* error code memory transaction (4b) */
-	u32 ec_tt;		/* error code transaction type (2b) */
-	u32 ec_ll;		/* error code cache level (2b) */
-
-	err_code = EXTRACT_ERROR_CODE(info->nbsl);
-	ec_ll = EXTRACT_LL_CODE(err_code);
-	ec_tt = EXTRACT_TT_CODE(err_code);
-	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
-
-	amd64_mc_printk(mci, KERN_ERR,
-		     "cache hierarchy error: memory transaction type(%s), "
-		     "transaction type(%s), cache level(%s)\n",
-		     rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
-}
-
-
 /*
  * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
  * ADDRESS and process.
  */
 static void amd64_handle_ce(struct mem_ctl_info *mci,
-			    struct amd64_error_info_regs *info)
+			    struct err_regs *info)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
 	u64 SystemAddress;
@@ -2216,7 +2237,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci,
 
 /* Handle any Un-correctable Errors (UEs) */
 static void amd64_handle_ue(struct mem_ctl_info *mci,
-			    struct amd64_error_info_regs *info)
+			    struct err_regs *info)
 {
 	int csrow;
 	u64 SystemAddress;
@@ -2261,59 +2282,24 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
 	}
 }
 
-static void amd64_decode_bus_error(struct mem_ctl_info *mci,
-				   struct amd64_error_info_regs *info)
+static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
+					    struct err_regs *info)
 {
-	u32 err_code, ext_ec;
-	u32 ec_pp;		/* error code participating processor (2p) */
-	u32 ec_to;		/* error code timed out (1b) */
+	u32 ec = ERROR_CODE(info->nbsl);
+	u32 xec = EXT_ERROR_CODE(info->nbsl);
+	int ecc_type = info->nbsh & (0x3 << 13);
-	u32 ec_rrrr;		/* error code memory transaction (4b) */
-	u32 ec_ii;		/* error code memory or I/O (2b) */
-	u32 ec_ll;		/* error code cache level (2b) */
 
-	ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
-	err_code = EXTRACT_ERROR_CODE(info->nbsl);
-
+	/* Bail early out if this was an 'observed' error */
+	if (PP(ec) == K8_NBSL_PP_OBS)
+		return;
-	ec_ll = EXTRACT_LL_CODE(err_code);
-	ec_ii = EXTRACT_II_CODE(err_code);
-	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
-	ec_to = EXTRACT_TO_CODE(err_code);
-	ec_pp = EXTRACT_PP_CODE(err_code);
-
-	amd64_mc_printk(mci, KERN_ERR,
-		"BUS ERROR:\n"
-		"  time-out(%s) mem or i/o(%s)\n"
-		"  participating processor(%s)\n"
-		"  memory transaction type(%s)\n"
-		"  cache level(%s) Error Found by: %s\n",
-		to_msgs[ec_to],
-		ii_msgs[ec_ii],
-		pp_msgs[ec_pp],
-		rrrr_msgs[ec_rrrr],
-		ll_msgs[ec_ll],
-		(info->nbsh & K8_NBSH_ERR_SCRUBER) ?
-			"Scrubber" : "Normal Operation");
-
-	/* If this was an 'observed' error, early out */
-	if (ec_pp == K8_NBSL_PP_OBS)
-		return;		/* We aren't the node involved */
-
-	/* Parse out the extended error code for ECC events */
-	switch (ext_ec) {
-	/* F10 changed to one Extended ECC error code */
-	case F10_NBSL_EXT_ERR_RES:	/* Reserved field */
-	case F10_NBSL_EXT_ERR_ECC:	/* F10 ECC ext err code */
-		break;
 
-	default:
-		amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
-					       "handling for this error\n");
+	/* Do only ECC errors */
+	if (xec && xec != F10_NBSL_EXT_ERR_ECC)
 		return;
-	}
 
-	if (info->nbsh & K8_NBSH_CECC)
+	if (ecc_type == 2)
 		amd64_handle_ce(mci, info);
-	else if (info->nbsh & K8_NBSH_UECC)
+	else if (ecc_type == 1)
 		amd64_handle_ue(mci, info);
 
 	/*
@@ -2324,139 +2310,26 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
 	 * catastrophic.
 	 */
 	if (info->nbsh & K8_NBSH_OVERFLOW)
-		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
-					  "Error Overflow set");
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
 }
 
-int amd64_process_error_info(struct mem_ctl_info *mci,
-			     struct amd64_error_info_regs *info,
-			     int handle_errors)
+void amd64_decode_bus_error(int node_id, struct err_regs *regs)
 {
-	struct amd64_pvt *pvt;
-	struct amd64_error_info_regs *regs;
-	u32 err_code, ext_ec;
-	int gart_tlb_error = 0;
-
-	pvt = mci->pvt_info;
-
-	/* If caller doesn't want us to process the error, return */
-	if (!handle_errors)
-		return 1;
-
-	regs = info;
-
-	debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
-	debugf1("  MC node(%d) Error-Address(0x%.8x-%.8x)\n",
-		pvt->mc_node_id, regs->nbeah, regs->nbeal);
-	debugf1("  nbsh(0x%.8x) nbsl(0x%.8x)\n",
-		regs->nbsh, regs->nbsl);
-	debugf1("  Valid Error=%s Overflow=%s\n",
-		(regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
-	debugf1("  Err Uncorrected=%s MCA Error Reporting=%s\n",
-		(regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_ERR_ENABLE) ?
-			"True" : "False");
-	debugf1("  MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
-		(regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_PCC) ?
-			"True" : "False");
-	debugf1("  CECC=%s UECC=%s Found by Scruber=%s\n",
-		(regs->nbsh & K8_NBSH_CECC) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_UECC) ?
-			"True" : "False",
-		(regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
-			"True" : "False");
-	debugf1("  CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
-		(regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
-		(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
-
-
-	err_code = EXTRACT_ERROR_CODE(regs->nbsl);
-
-	/* Determine which error type:
-	 *	1) GART errors - non-fatal, developmental events
-	 *	2) MEMORY errors
-	 *	3) BUS errors
-	 *	4) Unknown error
-	 */
-	if (TEST_TLB_ERROR(err_code)) {
-		/*
-		 * GART errors are intended to help graphics driver developers
-		 * to detect bad GART PTEs. It is recommended by AMD to disable
-		 * GART table walk error reporting by default[1] (currently
-		 * being disabled in mce_cpu_quirks()) and according to the
-		 * comment in mce_cpu_quirks(), such GART errors can be
-		 * incorrectly triggered. We may see these errors anyway and
-		 * unless requested by the user, they won't be reported.
-		 *
-		 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
-		 *     AMD NPT family 0Fh processors
-		 */
-		if (report_gart_errors == 0)
-			return 1;
-
-		/*
-		 * Only if GART error reporting is requested should we generate
-		 * any logs.
-		 */
-		gart_tlb_error = 1;
-
-		debugf1("GART TLB error\n");
-		amd64_decode_gart_tlb_error(mci, info);
-	} else if (TEST_MEM_ERROR(err_code)) {
-		debugf1("Memory/Cache error\n");
-		amd64_decode_mem_cache_error(mci, info);
-	} else if (TEST_BUS_ERROR(err_code)) {
-		debugf1("Bus (Link/DRAM) error\n");
-		amd64_decode_bus_error(mci, info);
-	} else {
-		/* shouldn't reach here! */
-		amd64_mc_printk(mci, KERN_WARNING,
-			     "%s(): unknown MCE error 0x%x\n", __func__,
-			     err_code);
-	}
-
-	ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
-	amd64_mc_printk(mci, KERN_ERR,
-		"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
+	struct mem_ctl_info *mci = mci_lookup[node_id];
 
-	if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
-	      ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
-	     (ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
-	    EXTRACT_LDT_LINK(info->nbsh)) {
-
-		amd64_mc_printk(mci, KERN_ERR,
-			"Error on hypertransport link: %s\n",
-			htlink_msgs[
-			EXTRACT_LDT_LINK(info->nbsh)]);
-	}
+	__amd64_decode_bus_error(mci, regs);
 
 	/*
 	 * Check the UE bit of the NB status high register, if set generate some
 	 * logs. If NOT a GART error, then process the event as a NO-INFO event.
 	 * If it was a GART error, skip that process.
+	 *
+	 * FIXME: this should go somewhere else, if at all.
 	 */
-	if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
-		amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
+	if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
+		edac_mc_handle_ue_no_info(mci, "UE bit is set");
-		if (!gart_tlb_error)
-			edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
-	}
-
-	if (regs->nbsh & K8_NBSH_PCC)
-		amd64_mc_printk(mci, KERN_CRIT,
-			"PCC (processor context corrupt) set\n");
 
-	return 1;
 }
-EXPORT_SYMBOL_GPL(amd64_process_error_info);
 
 /*
  * The main polling 'check' function, called FROM the edac core to perform the
@@ -2464,10 +2337,12 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info);
  */
 static void amd64_check(struct mem_ctl_info *mci)
 {
-	struct amd64_error_info_regs info;
+	struct err_regs regs;
 
-	if (amd64_get_error_info(mci, &info))
-		amd64_process_error_info(mci, &info, 1);
+	if (amd64_get_error_info(mci, &regs)) {
+		struct amd64_pvt *pvt = mci->pvt_info;
+		amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
+	}
 }
 
 /*
@@ -3163,6 +3038,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
 
 	mci_lookup[node_id] = mci;
 	pvt_lookup[node_id] = NULL;
+
+	/* register stuff with EDAC MCE */
+	if (report_gart_errors)
+		amd_report_gart_errors(true);
+
+	amd_register_ecc_decoder(amd64_decode_bus_error);
+
 	return 0;
 
 err_add_mc:
@@ -3229,6 +3111,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
 
 	mci_lookup[pvt->mc_node_id] = NULL;
 
+	/* unregister from EDAC MCE */
+	amd_report_gart_errors(false);
+	amd_unregister_ecc_decoder(amd64_decode_bus_error);
+
 	/* Free the EDAC CORE resources */
 	edac_mc_free(mci);
 }
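
The revf_quad_ddr2_shift[] table moved into this file encodes, for each 4-bit DDR2 chip-select size code read from F2x80 (K8) or F2x[1,0]80 (F10h and later), the log2 of the chip-select size in bytes; an entry of 0 marks a NULL DIMM or a reserved/future encoding. A hedged sketch of how such a table is typically consumed when sizing a csrow is shown below; the helper name ddr2_cs_size_pages() is illustrative only and not the driver's actual dbam_map_to_pages implementation, and it assumes a kernel context where PAGE_SHIFT is 12.

	/* Illustrative helper (not in the patch): DBAM size code -> number of 4 KB pages. */
	static unsigned long ddr2_cs_size_pages(unsigned int dbam_code)
	{
		unsigned int shift = revf_quad_ddr2_shift[dbam_code & 0xf];

		if (!shift)
			return 0;	/* NULL DIMM or reserved/future encoding */

		/* e.g. code 0101b -> shift 30 -> 1 GB -> 1UL << (30 - 12) = 262144 pages */
		return 1UL << (shift - PAGE_SHIFT);
	}
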
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index ba73015af8e4..8ea07e2715dc 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -72,6 +72,7 @@
 #include <linux/edac.h>
 #include <asm/msr.h>
 #include "edac_core.h"
+#include "edac_mce_amd.h"
 
 #define amd64_printk(level, fmt, arg...) \
 	edac_printk(level, "amd64", fmt, ##arg)
@@ -303,21 +304,9 @@ enum {
 #define K8_NBSL				0x48
 
 
-#define EXTRACT_HIGH_SYNDROME(x)	(((x) >> 24) & 0xff)
-#define EXTRACT_EXT_ERROR_CODE(x)	(((x) >> 16) & 0x1f)
-
 /* Family F10h: Normalized Extended Error Codes */
 #define F10_NBSL_EXT_ERR_RES		0x0
-#define F10_NBSL_EXT_ERR_CRC		0x1
-#define F10_NBSL_EXT_ERR_SYNC		0x2
-#define F10_NBSL_EXT_ERR_MST		0x3
-#define F10_NBSL_EXT_ERR_TGT		0x4
-#define F10_NBSL_EXT_ERR_GART		0x5
-#define F10_NBSL_EXT_ERR_RMW		0x6
-#define F10_NBSL_EXT_ERR_WDT		0x7
 #define F10_NBSL_EXT_ERR_ECC		0x8
-#define F10_NBSL_EXT_ERR_DEV		0x9
-#define F10_NBSL_EXT_ERR_LINK_DATA	0xA
 
 /* Next two are overloaded values */
 #define F10_NBSL_EXT_ERR_LINK_PROTO	0xB
@@ -348,17 +337,6 @@ enum {
 #define K8_NBSL_EXT_ERR_CHIPKILL_ECC	0x8
 #define K8_NBSL_EXT_ERR_DRAM_PARITY	0xD
 
-#define EXTRACT_ERROR_CODE(x)		((x) & 0xffff)
-#define	TEST_TLB_ERROR(x)		(((x) & 0xFFF0) == 0x0010)
-#define	TEST_MEM_ERROR(x)		(((x) & 0xFF00) == 0x0100)
-#define	TEST_BUS_ERROR(x)		(((x) & 0xF800) == 0x0800)
-#define	EXTRACT_TT_CODE(x)		(((x) >> 2) & 0x3)
-#define	EXTRACT_II_CODE(x)		(((x) >> 2) & 0x3)
-#define	EXTRACT_LL_CODE(x)		(((x) >> 0) & 0x3)
-#define	EXTRACT_RRRR_CODE(x)		(((x) >> 4) & 0xf)
-#define	EXTRACT_TO_CODE(x)		(((x) >> 8) & 0x1)
-#define	EXTRACT_PP_CODE(x)		(((x) >> 9) & 0x3)
-
 /*
  * The following are for BUS type errors AFTER values have been normalized by
  * shifting right
@@ -368,28 +346,7 @@ enum {
 #define K8_NBSL_PP_OBS			0x2
 #define K8_NBSL_PP_GENERIC		0x3
 
-
-#define K8_NBSH				0x4C
-
-#define K8_NBSH_VALID_BIT		BIT(31)
-#define K8_NBSH_OVERFLOW		BIT(30)
-#define K8_NBSH_UNCORRECTED_ERR		BIT(29)
-#define K8_NBSH_ERR_ENABLE		BIT(28)
-#define K8_NBSH_MISC_ERR_VALID		BIT(27)
-#define K8_NBSH_VALID_ERROR_ADDR	BIT(26)
-#define K8_NBSH_PCC			BIT(25)
-#define K8_NBSH_CECC			BIT(14)
-#define K8_NBSH_UECC			BIT(13)
-#define K8_NBSH_ERR_SCRUBER		BIT(8)
-#define K8_NBSH_CORE3			BIT(3)
-#define K8_NBSH_CORE2			BIT(2)
-#define K8_NBSH_CORE1			BIT(1)
-#define K8_NBSH_CORE0			BIT(0)
-
-#define EXTRACT_LDT_LINK(x)		(((x) >> 4) & 0x7)
 #define EXTRACT_ERR_CPU_MAP(x)		((x) & 0xF)
-#define EXTRACT_LOW_SYNDROME(x)		(((x) >> 15) & 0xff)
-
 
 #define K8_NBEAL			0x50
 #define K8_NBEAH			0x54
@@ -455,23 +412,6 @@ enum amd64_chipset_families {
 	F11_CPUS,
 };
 
-/*
- * Structure to hold:
- *
- * 1) dynamically read status and error address HW registers
- * 2) sysfs entered values
- * 3) MCE values
- *
- * Depends on entry into the modules
- */
-struct amd64_error_info_regs {
-	u32 nbcfg;
-	u32 nbsh;
-	u32 nbsl;
-	u32 nbeah;
-	u32 nbeal;
-};
-
 /* Error injection control structure */
 struct error_injection {
 	u32	 section;
@@ -542,7 +482,7 @@ struct amd64_pvt {
 	u32 online_spare;               /* On-Line spare Reg */
 
 	/* temp storage for when input is received from sysfs */
-	struct amd64_error_info_regs ctl_error_info;
+	struct err_regs ctl_error_info;
 
 	/* place to store error injection parameters prior to issue */
 	struct error_injection injection;
@@ -601,11 +541,11 @@ struct low_ops {
 	int (*early_channel_count)(struct amd64_pvt *pvt);
 
 	u64 (*get_error_address)(struct mem_ctl_info *mci,
-			struct amd64_error_info_regs *info);
+			struct err_regs *info);
 	void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram);
 	void (*read_dram_ctl_register)(struct amd64_pvt *pvt);
 	void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci,
-					struct amd64_error_info_regs *info,
+					struct err_regs *info,
 					u64 SystemAddr);
 	int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map);
 };
@@ -637,8 +577,5 @@ static inline struct low_ops *family_ops(int index)
 #define F10_MIN_SCRUB_RATE_BITS		0x5
 #define F11_MIN_SCRUB_RATE_BITS		0x6
 
-int amd64_process_error_info(struct mem_ctl_info *mci,
-			     struct amd64_error_info_regs *info,
-			     int handle_errors);
 int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
 			     u64 *hole_offset, u64 *hole_size);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 0a41b248a4ad..59cf2cf6e11e 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
 
 		/* Process the Mapping request */
 		/* TODO: Add race prevention */
-		amd64_process_error_info(mci, &pvt->ctl_error_info, 1);
+		amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
 
 	return count;
 }
diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c
deleted file mode 100644
index f212ff12a9d8..000000000000
--- a/drivers/edac/amd64_edac_err_types.c
+++ /dev/null
@@ -1,161 +0,0 @@
-#include "amd64_edac.h"
-
-/*
- * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
- * for DDR2 DRAM mapping.
- */
-u32 revf_quad_ddr2_shift[] = {
-	0,	/* 0000b NULL DIMM (128mb) */
-	28,	/* 0001b 256mb */
-	29,	/* 0010b 512mb */
-	29,	/* 0011b 512mb */
-	29,	/* 0100b 512mb */
-	30,	/* 0101b 1gb */
-	30,	/* 0110b 1gb */
-	31,	/* 0111b 2gb */
-	31,	/* 1000b 2gb */
-	32,	/* 1001b 4gb */
-	32,	/* 1010b 4gb */
-	33,	/* 1011b 8gb */
-	0,	/* 1100b future */
-	0,	/* 1101b future */
-	0,	/* 1110b future */
-	0	/* 1111b future */
-};
-
-/*
- * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
- * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
- * or higher value'.
- *
- *FIXME: Produce a better mapping/linearisation.
- */
-
-struct scrubrate scrubrates[] = {
-	{ 0x01, 1600000000UL},
-	{ 0x02, 800000000UL},
-	{ 0x03, 400000000UL},
-	{ 0x04, 200000000UL},
-	{ 0x05, 100000000UL},
-	{ 0x06, 50000000UL},
-	{ 0x07, 25000000UL},
-	{ 0x08, 12284069UL},
-	{ 0x09, 6274509UL},
-	{ 0x0A, 3121951UL},
-	{ 0x0B, 1560975UL},
-	{ 0x0C, 781440UL},
-	{ 0x0D, 390720UL},
-	{ 0x0E, 195300UL},
-	{ 0x0F, 97650UL},
-	{ 0x10, 48854UL},
-	{ 0x11, 24427UL},
-	{ 0x12, 12213UL},
-	{ 0x13, 6101UL},
-	{ 0x14, 3051UL},
-	{ 0x15, 1523UL},
-	{ 0x16, 761UL},
-	{ 0x00, 0UL},	/* scrubbing off */
-};
-
-/*
- * string representation for the different MCA reported error types, see F3x48
- * or MSR0000_0411.
- */
-const char *tt_msgs[] = {	/* transaction type */
-	"instruction",
-	"data",
-	"generic",
-	"reserved"
-};
-
-const char *ll_msgs[] = {	/* cache level */
-	"L0",
-	"L1",
-	"L2",
-	"L3/generic"
-};
-
-const char *rrrr_msgs[] = {
-	"generic",
-	"generic read",
-	"generic write",
-	"data read",
-	"data write",
-	"inst fetch",
-	"prefetch",
-	"evict",
-	"snoop",
-	"reserved RRRR= 9",
-	"reserved RRRR= 10",
-	"reserved RRRR= 11",
-	"reserved RRRR= 12",
-	"reserved RRRR= 13",
-	"reserved RRRR= 14",
-	"reserved RRRR= 15"
-};
-
-const char *pp_msgs[] = {	/* participating processor */
-	"local node originated (SRC)",
-	"local node responded to request (RES)",
-	"local node observed as 3rd party (OBS)",
-	"generic"
-};
-
-const char *to_msgs[] = {
-	"no timeout",
-	"timed out"
-};
-
-const char *ii_msgs[] = {	/* memory or i/o */
-	"mem access",
-	"reserved",
-	"i/o access",
-	"generic"
-};
-
-/* Map the 5 bits of Extended Error code to the string table. */
-const char *ext_msgs[] = {	/* extended error */
-	"K8 ECC error/F10 reserved",	/* 0_0000b */
-	"CRC error",			/* 0_0001b */
-	"sync error",			/* 0_0010b */
-	"mst abort",			/* 0_0011b */
-	"tgt abort",			/* 0_0100b */
-	"GART error",			/* 0_0101b */
-	"RMW error",			/* 0_0110b */
-	"Wdog timer error",		/* 0_0111b */
-	"F10-ECC/K8-Chipkill error",	/* 0_1000b */
-	"DEV Error",			/* 0_1001b */
-	"Link Data error",		/* 0_1010b */
-	"Link or L3 Protocol error",	/* 0_1011b */
-	"NB Array error",		/* 0_1100b */
-	"DRAM Parity error",		/* 0_1101b */
-	"Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */
-	"Res 0x0ff error",		/* 0_1111b */
-	"Res 0x100 error",		/* 1_0000b */
-	"Res 0x101 error",		/* 1_0001b */
-	"Res 0x102 error",		/* 1_0010b */
-	"Res 0x103 error",		/* 1_0011b */
-	"Res 0x104 error",		/* 1_0100b */
-	"Res 0x105 error",		/* 1_0101b */
-	"Res 0x106 error",		/* 1_0110b */
-	"Res 0x107 error",		/* 1_0111b */
-	"Res 0x108 error",		/* 1_1000b */
-	"Res 0x109 error",		/* 1_1001b */
-	"Res 0x10A error",		/* 1_1010b */
-	"Res 0x10B error",		/* 1_1011b */
-	"L3 Cache Data error",		/* 1_1100b */
-	"L3 CacheTag error",		/* 1_1101b */
-	"L3 Cache LRU error",		/* 1_1110b */
-	"Res 0x1FF error"		/* 1_1111b */
-};
-
-const char *htlink_msgs[] = {
-	"none",
-	"1",
-	"2",
-	"1 2",
-	"3",
-	"1 3",
-	"2 3",
-	"1 2 3"
-};
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
new file mode 100644
index 000000000000..c8ca7136dacc
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.c
@@ -0,0 +1,422 @@
+#include <linux/module.h>
+#include "edac_mce_amd.h"
+
+static bool report_gart_errors;
+static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
+
+void amd_report_gart_errors(bool v)
+{
+	report_gart_errors = v;
+}
+EXPORT_SYMBOL_GPL(amd_report_gart_errors);
+
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
+{
+	nb_bus_decoder = f;
+}
+EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
+
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
+{
+	if (nb_bus_decoder) {
+		WARN_ON(nb_bus_decoder != f);
+
+		nb_bus_decoder = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
+
+/*
+ * string representation for the different MCA reported error types, see F3x48
+ * or MSR0000_0411.
+ */
+const char *tt_msgs[] = {	/* transaction type */
+	"instruction",
+	"data",
+	"generic",
+	"reserved"
+};
+EXPORT_SYMBOL_GPL(tt_msgs);
+
+const char *ll_msgs[] = {	/* cache level */
+	"L0",
+	"L1",
+	"L2",
+	"L3/generic"
+};
+EXPORT_SYMBOL_GPL(ll_msgs);
+
+const char *rrrr_msgs[] = {
+	"generic",
+	"generic read",
+	"generic write",
+	"data read",
+	"data write",
+	"inst fetch",
+	"prefetch",
+	"evict",
+	"snoop",
+	"reserved RRRR= 9",
+	"reserved RRRR= 10",
+	"reserved RRRR= 11",
+	"reserved RRRR= 12",
+	"reserved RRRR= 13",
+	"reserved RRRR= 14",
+	"reserved RRRR= 15"
+};
+EXPORT_SYMBOL_GPL(rrrr_msgs);
+
+const char *pp_msgs[] = {	/* participating processor */
+	"local node originated (SRC)",
+	"local node responded to request (RES)",
+	"local node observed as 3rd party (OBS)",
+	"generic"
+};
+EXPORT_SYMBOL_GPL(pp_msgs);
+
+const char *to_msgs[] = {
+	"no timeout",
+	"timed out"
+};
+EXPORT_SYMBOL_GPL(to_msgs);
+
+const char *ii_msgs[] = {	/* memory or i/o */
+	"mem access",
+	"reserved",
+	"i/o access",
+	"generic"
+};
+EXPORT_SYMBOL_GPL(ii_msgs);
+
+/*
+ * Map the 4 or 5 (family-specific) bits of Extended Error code to the
+ * string table.
+ */
+const char *ext_msgs[] = {
+	"K8 ECC error",					/* 0_0000b */
+	"CRC error on link",				/* 0_0001b */
+	"Sync error packets on link",			/* 0_0010b */
+	"Master Abort during link operation",		/* 0_0011b */
+	"Target Abort during link operation",		/* 0_0100b */
+	"Invalid GART PTE entry during table walk",	/* 0_0101b */
+	"Unsupported atomic RMW command received",	/* 0_0110b */
+	"WDT error: NB transaction timeout",		/* 0_0111b */
+	"ECC/ChipKill ECC error",			/* 0_1000b */
+	"SVM DEV Error",				/* 0_1001b */
+	"Link Data error",				/* 0_1010b */
+	"Link/L3/Probe Filter Protocol error",		/* 0_1011b */
+	"NB Internal Arrays Parity error",		/* 0_1100b */
+	"DRAM Address/Control Parity error",		/* 0_1101b */
+	"Link Transmission error",			/* 0_1110b */
+	"GART/DEV Table Walk Data error"		/* 0_1111b */
+	"Res 0x100 error",				/* 1_0000b */
+	"Res 0x101 error",				/* 1_0001b */
+	"Res 0x102 error",				/* 1_0010b */
+	"Res 0x103 error",				/* 1_0011b */
+	"Res 0x104 error",				/* 1_0100b */
+	"Res 0x105 error",				/* 1_0101b */
+	"Res 0x106 error",				/* 1_0110b */
+	"Res 0x107 error",				/* 1_0111b */
+	"Res 0x108 error",				/* 1_1000b */
+	"Res 0x109 error",				/* 1_1001b */
+	"Res 0x10A error",				/* 1_1010b */
+	"Res 0x10B error",				/* 1_1011b */
+	"ECC error in L3 Cache Data",			/* 1_1100b */
+	"L3 Cache Tag error",				/* 1_1101b */
+	"L3 Cache LRU Parity error",			/* 1_1110b */
+	"Probe Filter error"				/* 1_1111b */
+};
+EXPORT_SYMBOL_GPL(ext_msgs);
+
+static void amd_decode_dc_mce(u64 mc0_status)
+{
+	u32 ec  = mc0_status & 0xffff;
+	u32 xec = (mc0_status >> 16) & 0xf;
+
+	pr_emerg(" Data Cache Error");
+
+	if (xec == 1 && TLB_ERROR(ec))
+		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
+	else if (xec == 0) {
+		if (mc0_status & (1ULL << 40))
+			pr_cont(" during Data Scrub.\n");
+		else if (TLB_ERROR(ec))
+			pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
+		else if (MEM_ERROR(ec)) {
+			u8 ll   = ec & 0x3;
+			u8 tt   = (ec >> 2) & 0x3;
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			/* see F10h BKDG (31116), Table 92. */
+			if (ll == 0x1) {
+				if (tt != 0x1)
+					goto wrong_dc_mce;
+
+				pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
+
+			} else if (ll == 0x2 && rrrr == 0x3)
+				pr_cont(" during L1 linefill from L2.\n");
+			else
+				goto wrong_dc_mce;
+		} else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
+			pr_cont(" during system linefill.\n");
+		else
+			goto wrong_dc_mce;
+	} else
+		goto wrong_dc_mce;
+
+	return;
+
+wrong_dc_mce:
+	pr_warning("Corrupted DC MCE info?\n");
+}
+
+static void amd_decode_ic_mce(u64 mc1_status)
+{
+	u32 ec  = mc1_status & 0xffff;
+	u32 xec = (mc1_status >> 16) & 0xf;
+
+	pr_emerg(" Instruction Cache Error");
+
+	if (xec == 1 && TLB_ERROR(ec))
+		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
+	else if (xec == 0) {
+		if (TLB_ERROR(ec))
+			pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
+		else if (BUS_ERROR(ec)) {
+			if (boot_cpu_data.x86 == 0xf &&
+			    (mc1_status & (1ULL << 58)))
+				pr_cont(" during system linefill.\n");
+			else
+				pr_cont(" during attempted NB data read.\n");
+		} else if (MEM_ERROR(ec)) {
+			u8 ll   = ec & 0x3;
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			if (ll == 0x2)
+				pr_cont(" during a linefill from L2.\n");
+			else if (ll == 0x1) {
+
+				switch (rrrr) {
+				case 0x5:
+					pr_cont(": Parity error during "
+					       "data load.\n");
+					break;
+
+				case 0x7:
+					pr_cont(": Copyback Parity/Victim"
+						" error.\n");
+					break;
+
+				case 0x8:
+					pr_cont(": Tag Snoop error.\n");
+					break;
+
+				default:
+					goto wrong_ic_mce;
+					break;
+				}
+			}
+		} else
+			goto wrong_ic_mce;
+	} else
+		goto wrong_ic_mce;
+
+	return;
+
+wrong_ic_mce:
+	pr_warning("Corrupted IC MCE info?\n");
+}
+
+static void amd_decode_bu_mce(u64 mc2_status)
+{
+	u32 ec  = mc2_status & 0xffff;
+	u32 xec = (mc2_status >> 16) & 0xf;
+
+	pr_emerg(" Bus Unit Error");
+
+	if (xec == 0x1)
+		pr_cont(" in the write data buffers.\n");
+	else if (xec == 0x3)
+		pr_cont(" in the victim data buffers.\n");
+	else if (xec == 0x2 && MEM_ERROR(ec))
+		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
+	else if (xec == 0x0) {
+		if (TLB_ERROR(ec))
+			pr_cont(": %s error in a Page Descriptor Cache or "
+				"Guest TLB.\n", TT_MSG(ec));
+		else if (BUS_ERROR(ec))
+			pr_cont(": %s/ECC error in data read from NB: %s.\n",
+				RRRR_MSG(ec), PP_MSG(ec));
+		else if (MEM_ERROR(ec)) {
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			if (rrrr >= 0x7)
+				pr_cont(": %s error during data copyback.\n",
+					RRRR_MSG(ec));
+			else if (rrrr <= 0x1)
+				pr_cont(": %s parity/ECC error during data "
+					"access from L2.\n", RRRR_MSG(ec));
+			else
+				goto wrong_bu_mce;
+		} else
+			goto wrong_bu_mce;
+	} else
+		goto wrong_bu_mce;
+
+	return;
+
+wrong_bu_mce:
+	pr_warning("Corrupted BU MCE info?\n");
+}
+
+static void amd_decode_ls_mce(u64 mc3_status)
+{
+	u32 ec  = mc3_status & 0xffff;
+	u32 xec = (mc3_status >> 16) & 0xf;
+
+	pr_emerg(" Load Store Error");
+
+	if (xec == 0x0) {
+		u8 rrrr = (ec >> 4) & 0xf;
+
+		if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
+			goto wrong_ls_mce;
+
+		pr_cont(" during %s.\n", RRRR_MSG(ec));
+	}
+	return;
+
+wrong_ls_mce:
+	pr_warning("Corrupted LS MCE info?\n");
+}
+
+void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
+{
+	u32 ec  = ERROR_CODE(regs->nbsl);
+	u32 xec = EXT_ERROR_CODE(regs->nbsl);
+
+	if (!handle_errors)
+		return;
+
+	pr_emerg(" Northbridge Error, node %d", node_id);
+
+	/*
+	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
+	 * value encoding has changed so interpret those differently
+	 */
+	if ((boot_cpu_data.x86 == 0x10) &&
+	    (boot_cpu_data.x86_model > 8)) {
+		if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
+			pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
+	} else {
+		pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
+	}
+
+
+	pr_emerg("%s.\n", EXT_ERR_MSG(xec));
+
+	if (BUS_ERROR(ec) && nb_bus_decoder)
+		nb_bus_decoder(node_id, regs);
+}
+EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
+
+static void amd_decode_fr_mce(u64 mc5_status)
+{
+	/* we have only one error signature so match all fields at once. */
+	if ((mc5_status & 0xffff) == 0x0f0f)
+		pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
+	else
+		pr_warning("Corrupted FR MCE info?\n");
+}
+
+static inline void amd_decode_err_code(unsigned int ec)
+{
+	if (TLB_ERROR(ec)) {
+		/*
+		 * GART errors are intended to help graphics driver developers
+		 * to detect bad GART PTEs. It is recommended by AMD to disable
+		 * GART table walk error reporting by default[1] (currently
+		 * being disabled in mce_cpu_quirks()) and according to the
+		 * comment in mce_cpu_quirks(), such GART errors can be
+		 * incorrectly triggered. We may see these errors anyway and
+		 * unless requested by the user, they won't be reported.
+		 *
+		 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
+		 *     AMD NPT family 0Fh processors
+		 */
+		if (!report_gart_errors)
+			return;
+
+		pr_emerg(" Transaction: %s, Cache Level %s\n",
+			 TT_MSG(ec), LL_MSG(ec));
+	} else if (MEM_ERROR(ec)) {
+		pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
+			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+	} else if (BUS_ERROR(ec)) {
+		pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
+			 "Participating Processor: %s\n",
+			  RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
+			  PP_MSG(ec));
+	} else
+		pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
+}
+
+void decode_mce(struct mce *m)
+{
+	struct err_regs regs;
+	int node, ecc;
+
+	pr_emerg("MC%d_STATUS: ", m->bank);
+
+	pr_cont("%sorrected error, report: %s, MiscV: %svalid, "
+		 "CPU context corrupt: %s",
+		 ((m->status & MCI_STATUS_UC) ? "Unc"  : "C"),
+		 ((m->status & MCI_STATUS_EN) ? "yes"  : "no"),
+		 ((m->status & MCI_STATUS_MISCV) ? ""  : "in"),
+		 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
+
+	/* do the two bits[14:13] together */
+	ecc = m->status & (3ULL << 45);
+	if (ecc)
+		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
+
+	pr_cont("\n");
+
+	switch (m->bank) {
+	case 0:
+		amd_decode_dc_mce(m->status);
+		break;
+
+	case 1:
+		amd_decode_ic_mce(m->status);
+		break;
+
+	case 2:
+		amd_decode_bu_mce(m->status);
+		break;
+
+	case 3:
+		amd_decode_ls_mce(m->status);
+		break;
+
+	case 4:
+		regs.nbsl  = (u32) m->status;
+		regs.nbsh  = (u32)(m->status >> 32);
+		regs.nbeal = (u32) m->addr;
+		regs.nbeah = (u32)(m->addr >> 32);
+		node       = per_cpu(cpu_llc_id, m->extcpu);
+
+		amd_decode_nb_mce(node, &regs, 1);
+		break;
+
+	case 5:
+		amd_decode_fr_mce(m->status);
+		break;
+
+	default:
+		break;
+	}
+
+	amd_decode_err_code(m->status & 0xffff);
+}
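
The new module's decode_mce() above overrides the weak hook in mce.c, classifies the reporting bank, and for northbridge bus errors hands off to whatever callback a driver registered with amd_register_ecc_decoder(); the amd64_edac hunks earlier register amd64_decode_bus_error() this way. Below is a hedged sketch of a hypothetical consumer module using only the functions exported here (my_nb_decoder and the module names are illustrative, not part of the patch):

	#include <linux/module.h>
	#include "edac_mce_amd.h"

	/* Hypothetical consumer, mirroring what amd64_edac does in its init/remove paths. */
	static void my_nb_decoder(int node_id, struct err_regs *regs)
	{
		pr_info("node %d: nbsh=0x%08x nbsl=0x%08x\n",
			node_id, regs->nbsh, regs->nbsl);
	}

	static int __init my_decoder_init(void)
	{
		amd_report_gart_errors(false);	/* keep GART table-walk noise suppressed */
		amd_register_ecc_decoder(my_nb_decoder);
		return 0;
	}

	static void __exit my_decoder_exit(void)
	{
		amd_unregister_ecc_decoder(my_nb_decoder);
	}

	module_init(my_decoder_init);
	module_exit(my_decoder_exit);
	MODULE_LICENSE("GPL");
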
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
new file mode 100644
index 000000000000..df23ee065f79
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.h
@@ -0,0 +1,69 @@
+#ifndef _EDAC_MCE_AMD_H
+#define _EDAC_MCE_AMD_H
+
+#include <asm/mce.h>
+
+#define ERROR_CODE(x)			((x) & 0xffff)
+#define EXT_ERROR_CODE(x)		(((x) >> 16) & 0x1f)
+#define EXT_ERR_MSG(x)			ext_msgs[EXT_ERROR_CODE(x)]
+
+#define LOW_SYNDROME(x)			(((x) >> 15) & 0xff)
+#define HIGH_SYNDROME(x)		(((x) >> 24) & 0xff)
+
+#define TLB_ERROR(x)			(((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x)			(((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x)			(((x) & 0xF800) == 0x0800)
+
+#define TT(x)				(((x) >> 2) & 0x3)
+#define TT_MSG(x)			tt_msgs[TT(x)]
+#define II(x)				(((x) >> 2) & 0x3)
+#define II_MSG(x)			ii_msgs[II(x)]
+#define LL(x)				(((x) >> 0) & 0x3)
+#define LL_MSG(x)			ll_msgs[LL(x)]
+#define RRRR(x)				(((x) >> 4) & 0xf)
+#define RRRR_MSG(x)			rrrr_msgs[RRRR(x)]
+#define TO(x)				(((x) >> 8) & 0x1)
+#define TO_MSG(x)			to_msgs[TO(x)]
+#define PP(x)				(((x) >> 9) & 0x3)
+#define PP_MSG(x)			pp_msgs[PP(x)]
+
+#define K8_NBSH				0x4C
+
+#define K8_NBSH_VALID_BIT		BIT(31)
+#define K8_NBSH_OVERFLOW		BIT(30)
+#define K8_NBSH_UC_ERR			BIT(29)
+#define K8_NBSH_ERR_EN			BIT(28)
+#define K8_NBSH_MISCV			BIT(27)
+#define K8_NBSH_VALID_ERROR_ADDR	BIT(26)
+#define K8_NBSH_PCC			BIT(25)
+#define K8_NBSH_ERR_CPU_VAL		BIT(24)
+#define K8_NBSH_CECC			BIT(14)
+#define K8_NBSH_UECC			BIT(13)
+#define K8_NBSH_ERR_SCRUBER		BIT(8)
+
+extern const char *tt_msgs[];
+extern const char *ll_msgs[];
+extern const char *rrrr_msgs[];
+extern const char *pp_msgs[];
+extern const char *to_msgs[];
+extern const char *ii_msgs[];
+extern const char *ext_msgs[];
+
+/*
+ * relevant NB regs
+ */
+struct err_regs {
+	u32 nbcfg;
+	u32 nbsh;
+	u32 nbsl;
+	u32 nbeah;
+	u32 nbeal;
+};
+
+
+void amd_report_gart_errors(bool);
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
+void amd_decode_nb_mce(int, struct err_regs *, int);
+
+#endif /* _EDAC_MCE_AMD_H */
