about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-09-14 20:38:38 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-14 20:38:38 -0400
commitf65ac45e20b03081ed64f41ce91bb982f8ac258d (patch)
tree615e966b6c792ccd840f994f38591ff5d3d85f72
parent4142e0d1def2c0176c27fd2e810243045a62eb6d (diff)
parent22223c9b417be5fd0ab2cf9ad17eb7bd1e19f7b9 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: x86, mce: do not compile mcelog message on AMD EDAC, AMD: decode FR MCEs EDAC, AMD: decode load store MCEs EDAC, AMD: decode bus unit MCEs EDAC, AMD: decode instruction cache MCEs EDAC, AMD: decode data cache MCEs EDAC, AMD: carve out decoding of MCi_STATUS ErrorCode EDAC, AMD: carve out MCi_STATUS decoding x86, mce: pass mce info to EDAC for decoding amd64_edac: cleanup amd64_decode_bus_error amd64_edac: remove memory and GART TLB error decoders amd64_edac: cleanup/complete NB MCE decoding amd64_edac: cleanup amd64_process_error_info EDAC: beef up ErrorCodeExt error signatures EDAC: move MCE error descriptions to EDAC core
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c12
-rw-r--r--drivers/edac/Makefile6
-rw-r--r--drivers/edac/amd64_edac.c328
-rw-r--r--drivers/edac/amd64_edac.h71
-rw-r--r--drivers/edac/amd64_edac_dbg.c2
-rw-r--r--drivers/edac/amd64_edac_err_types.c161
-rw-r--r--drivers/edac/edac_mce_amd.c422
-rw-r--r--drivers/edac/edac_mce_amd.h69
8 files changed, 619 insertions(+), 452 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62f..9bfe9d2ea615 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
183 set_bit(0, &mce_need_notify); 183 set_bit(0, &mce_need_notify);
184} 184}
185 185
186void __weak decode_mce(struct mce *m)
187{
188 return;
189}
190
186static void print_mce(struct mce *m) 191static void print_mce(struct mce *m)
187{ 192{
188 printk(KERN_EMERG 193 printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 210 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 211 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 212 m->apicid);
213
214 decode_mce(m);
208} 215}
209 216
210static void print_mce_head(void) 217static void print_mce_head(void)
@@ -215,7 +222,10 @@ static void print_mce_head(void)
215static void print_mce_tail(void) 222static void print_mce_tail(void)
216{ 223{
217 printk(KERN_EMERG "This is not a software problem!\n" 224 printk(KERN_EMERG "This is not a software problem!\n"
218 "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 225#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
226 "Run through mcelog --ascii to decode and contact your hardware vendor\n"
227#endif
228 );
219} 229}
220 230
221#define PANIC_TIMEOUT 5 /* 5 seconds */ 231#define PANIC_TIMEOUT 5 /* 5 seconds */
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 98aa4a7db412..cfa033ce53a7 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -17,6 +17,10 @@ ifdef CONFIG_PCI
17edac_core-objs += edac_pci.o edac_pci_sysfs.o 17edac_core-objs += edac_pci.o edac_pci_sysfs.o
18endif 18endif
19 19
20ifdef CONFIG_CPU_SUP_AMD
21edac_core-objs += edac_mce_amd.o
22endif
23
20obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o 24obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o
21obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o 25obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o
22obj-$(CONFIG_EDAC_I5000) += i5000_edac.o 26obj-$(CONFIG_EDAC_I5000) += i5000_edac.o
@@ -32,7 +36,7 @@ obj-$(CONFIG_EDAC_X38) += x38_edac.o
32obj-$(CONFIG_EDAC_I82860) += i82860_edac.o 36obj-$(CONFIG_EDAC_I82860) += i82860_edac.o
33obj-$(CONFIG_EDAC_R82600) += r82600_edac.o 37obj-$(CONFIG_EDAC_R82600) += r82600_edac.o
34 38
35amd64_edac_mod-y := amd64_edac_err_types.o amd64_edac.o 39amd64_edac_mod-y := amd64_edac.o
36amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o 40amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o
37amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o 41amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o
38 42
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e2a10bcba7a1..173dc4a84166 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -19,6 +19,63 @@ static struct mem_ctl_info *mci_lookup[MAX_NUMNODES];
19static struct amd64_pvt *pvt_lookup[MAX_NUMNODES]; 19static struct amd64_pvt *pvt_lookup[MAX_NUMNODES];
20 20
21/* 21/*
22 * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
23 * for DDR2 DRAM mapping.
24 */
25u32 revf_quad_ddr2_shift[] = {
26 0, /* 0000b NULL DIMM (128mb) */
27 28, /* 0001b 256mb */
28 29, /* 0010b 512mb */
29 29, /* 0011b 512mb */
30 29, /* 0100b 512mb */
31 30, /* 0101b 1gb */
32 30, /* 0110b 1gb */
33 31, /* 0111b 2gb */
34 31, /* 1000b 2gb */
35 32, /* 1001b 4gb */
36 32, /* 1010b 4gb */
37 33, /* 1011b 8gb */
38 0, /* 1100b future */
39 0, /* 1101b future */
40 0, /* 1110b future */
41 0 /* 1111b future */
42};
43
44/*
45 * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
46 * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
47 * or higher value'.
48 *
49 *FIXME: Produce a better mapping/linearisation.
50 */
51
52struct scrubrate scrubrates[] = {
53 { 0x01, 1600000000UL},
54 { 0x02, 800000000UL},
55 { 0x03, 400000000UL},
56 { 0x04, 200000000UL},
57 { 0x05, 100000000UL},
58 { 0x06, 50000000UL},
59 { 0x07, 25000000UL},
60 { 0x08, 12284069UL},
61 { 0x09, 6274509UL},
62 { 0x0A, 3121951UL},
63 { 0x0B, 1560975UL},
64 { 0x0C, 781440UL},
65 { 0x0D, 390720UL},
66 { 0x0E, 195300UL},
67 { 0x0F, 97650UL},
68 { 0x10, 48854UL},
69 { 0x11, 24427UL},
70 { 0x12, 12213UL},
71 { 0x13, 6101UL},
72 { 0x14, 3051UL},
73 { 0x15, 1523UL},
74 { 0x16, 761UL},
75 { 0x00, 0UL}, /* scrubbing off */
76};
77
78/*
22 * Memory scrubber control interface. For K8, memory scrubbing is handled by 79 * Memory scrubber control interface. For K8, memory scrubbing is handled by
23 * hardware and can involve L2 cache, dcache as well as the main memory. With 80 * hardware and can involve L2 cache, dcache as well as the main memory. With
24 * F10, this is extended to L3 cache scrubbing on CPU models sporting that 81 * F10, this is extended to L3 cache scrubbing on CPU models sporting that
@@ -693,7 +750,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow,
693 * specific. 750 * specific.
694 */ 751 */
695static u64 extract_error_address(struct mem_ctl_info *mci, 752static u64 extract_error_address(struct mem_ctl_info *mci,
696 struct amd64_error_info_regs *info) 753 struct err_regs *info)
697{ 754{
698 struct amd64_pvt *pvt = mci->pvt_info; 755 struct amd64_pvt *pvt = mci->pvt_info;
699 756
@@ -1049,7 +1106,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt)
1049 1106
1050/* extract the ERROR ADDRESS for the K8 CPUs */ 1107/* extract the ERROR ADDRESS for the K8 CPUs */
1051static u64 k8_get_error_address(struct mem_ctl_info *mci, 1108static u64 k8_get_error_address(struct mem_ctl_info *mci,
1052 struct amd64_error_info_regs *info) 1109 struct err_regs *info)
1053{ 1110{
1054 return (((u64) (info->nbeah & 0xff)) << 32) + 1111 return (((u64) (info->nbeah & 0xff)) << 32) +
1055 (info->nbeal & ~0x03); 1112 (info->nbeal & ~0x03);
@@ -1092,7 +1149,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
1092} 1149}
1093 1150
1094static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, 1151static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
1095 struct amd64_error_info_regs *info, 1152 struct err_regs *info,
1096 u64 SystemAddress) 1153 u64 SystemAddress)
1097{ 1154{
1098 struct mem_ctl_info *src_mci; 1155 struct mem_ctl_info *src_mci;
@@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
1101 u32 page, offset; 1158 u32 page, offset;
1102 1159
1103 /* Extract the syndrome parts and form a 16-bit syndrome */ 1160 /* Extract the syndrome parts and form a 16-bit syndrome */
1104 syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; 1161 syndrome = HIGH_SYNDROME(info->nbsl) << 8;
1105 syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); 1162 syndrome |= LOW_SYNDROME(info->nbsh);
1106 1163
1107 /* CHIPKILL enabled */ 1164 /* CHIPKILL enabled */
1108 if (info->nbcfg & K8_NBCFG_CHIPKILL) { 1165 if (info->nbcfg & K8_NBCFG_CHIPKILL) {
@@ -1311,7 +1368,7 @@ static void amd64_teardown(struct amd64_pvt *pvt)
1311} 1368}
1312 1369
1313static u64 f10_get_error_address(struct mem_ctl_info *mci, 1370static u64 f10_get_error_address(struct mem_ctl_info *mci,
1314 struct amd64_error_info_regs *info) 1371 struct err_regs *info)
1315{ 1372{
1316 return (((u64) (info->nbeah & 0xffff)) << 32) + 1373 return (((u64) (info->nbeah & 0xffff)) << 32) +
1317 (info->nbeal & ~0x01); 1374 (info->nbeal & ~0x01);
@@ -1688,7 +1745,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
1688 * The @sys_addr is usually an error address received from the hardware. 1745 * The @sys_addr is usually an error address received from the hardware.
1689 */ 1746 */
1690static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, 1747static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
1691 struct amd64_error_info_regs *info, 1748 struct err_regs *info,
1692 u64 sys_addr) 1749 u64 sys_addr)
1693{ 1750{
1694 struct amd64_pvt *pvt = mci->pvt_info; 1751 struct amd64_pvt *pvt = mci->pvt_info;
@@ -1701,8 +1758,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
1701 if (csrow >= 0) { 1758 if (csrow >= 0) {
1702 error_address_to_page_and_offset(sys_addr, &page, &offset); 1759 error_address_to_page_and_offset(sys_addr, &page, &offset);
1703 1760
1704 syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; 1761 syndrome = HIGH_SYNDROME(info->nbsl) << 8;
1705 syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); 1762 syndrome |= LOW_SYNDROME(info->nbsh);
1706 1763
1707 /* 1764 /*
1708 * Is CHIPKILL on? If so, then we can attempt to use the 1765 * Is CHIPKILL on? If so, then we can attempt to use the
@@ -2045,7 +2102,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
2045 * - 0: if no valid error is indicated 2102 * - 0: if no valid error is indicated
2046 */ 2103 */
2047static int amd64_get_error_info_regs(struct mem_ctl_info *mci, 2104static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
2048 struct amd64_error_info_regs *regs) 2105 struct err_regs *regs)
2049{ 2106{
2050 struct amd64_pvt *pvt; 2107 struct amd64_pvt *pvt;
2051 struct pci_dev *misc_f3_ctl; 2108 struct pci_dev *misc_f3_ctl;
@@ -2094,10 +2151,10 @@ err_reg:
2094 * - 0: if no error is found 2151 * - 0: if no error is found
2095 */ 2152 */
2096static int amd64_get_error_info(struct mem_ctl_info *mci, 2153static int amd64_get_error_info(struct mem_ctl_info *mci,
2097 struct amd64_error_info_regs *info) 2154 struct err_regs *info)
2098{ 2155{
2099 struct amd64_pvt *pvt; 2156 struct amd64_pvt *pvt;
2100 struct amd64_error_info_regs regs; 2157 struct err_regs regs;
2101 2158
2102 pvt = mci->pvt_info; 2159 pvt = mci->pvt_info;
2103 2160
@@ -2152,48 +2209,12 @@ static int amd64_get_error_info(struct mem_ctl_info *mci,
2152 return 1; 2209 return 1;
2153} 2210}
2154 2211
2155static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
2156 struct amd64_error_info_regs *info)
2157{
2158 u32 err_code;
2159 u32 ec_tt; /* error code transaction type (2b) */
2160 u32 ec_ll; /* error code cache level (2b) */
2161
2162 err_code = EXTRACT_ERROR_CODE(info->nbsl);
2163 ec_ll = EXTRACT_LL_CODE(err_code);
2164 ec_tt = EXTRACT_TT_CODE(err_code);
2165
2166 amd64_mc_printk(mci, KERN_ERR,
2167 "GART TLB event: transaction type(%s), "
2168 "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
2169}
2170
2171static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
2172 struct amd64_error_info_regs *info)
2173{
2174 u32 err_code;
2175 u32 ec_rrrr; /* error code memory transaction (4b) */
2176 u32 ec_tt; /* error code transaction type (2b) */
2177 u32 ec_ll; /* error code cache level (2b) */
2178
2179 err_code = EXTRACT_ERROR_CODE(info->nbsl);
2180 ec_ll = EXTRACT_LL_CODE(err_code);
2181 ec_tt = EXTRACT_TT_CODE(err_code);
2182 ec_rrrr = EXTRACT_RRRR_CODE(err_code);
2183
2184 amd64_mc_printk(mci, KERN_ERR,
2185 "cache hierarchy error: memory transaction type(%s), "
2186 "transaction type(%s), cache level(%s)\n",
2187 rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
2188}
2189
2190
2191/* 2212/*
2192 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR 2213 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
2193 * ADDRESS and process. 2214 * ADDRESS and process.
2194 */ 2215 */
2195static void amd64_handle_ce(struct mem_ctl_info *mci, 2216static void amd64_handle_ce(struct mem_ctl_info *mci,
2196 struct amd64_error_info_regs *info) 2217 struct err_regs *info)
2197{ 2218{
2198 struct amd64_pvt *pvt = mci->pvt_info; 2219 struct amd64_pvt *pvt = mci->pvt_info;
2199 u64 SystemAddress; 2220 u64 SystemAddress;
@@ -2216,7 +2237,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci,
2216 2237
2217/* Handle any Un-correctable Errors (UEs) */ 2238/* Handle any Un-correctable Errors (UEs) */
2218static void amd64_handle_ue(struct mem_ctl_info *mci, 2239static void amd64_handle_ue(struct mem_ctl_info *mci,
2219 struct amd64_error_info_regs *info) 2240 struct err_regs *info)
2220{ 2241{
2221 int csrow; 2242 int csrow;
2222 u64 SystemAddress; 2243 u64 SystemAddress;
@@ -2261,59 +2282,24 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
2261 } 2282 }
2262} 2283}
2263 2284
2264static void amd64_decode_bus_error(struct mem_ctl_info *mci, 2285static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
2265 struct amd64_error_info_regs *info) 2286 struct err_regs *info)
2266{ 2287{
2267 u32 err_code, ext_ec; 2288 u32 ec = ERROR_CODE(info->nbsl);
2268 u32 ec_pp; /* error code participating processor (2p) */ 2289 u32 xec = EXT_ERROR_CODE(info->nbsl);
2269 u32 ec_to; /* error code timed out (1b) */ 2290 int ecc_type = info->nbsh & (0x3 << 13);
2270 u32 ec_rrrr; /* error code memory transaction (4b) */
2271 u32 ec_ii; /* error code memory or I/O (2b) */
2272 u32 ec_ll; /* error code cache level (2b) */
2273 2291
2274 ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl); 2292 /* Bail early out if this was an 'observed' error */
2275 err_code = EXTRACT_ERROR_CODE(info->nbsl); 2293 if (PP(ec) == K8_NBSL_PP_OBS)
2276 2294 return;
2277 ec_ll = EXTRACT_LL_CODE(err_code);
2278 ec_ii = EXTRACT_II_CODE(err_code);
2279 ec_rrrr = EXTRACT_RRRR_CODE(err_code);
2280 ec_to = EXTRACT_TO_CODE(err_code);
2281 ec_pp = EXTRACT_PP_CODE(err_code);
2282
2283 amd64_mc_printk(mci, KERN_ERR,
2284 "BUS ERROR:\n"
2285 " time-out(%s) mem or i/o(%s)\n"
2286 " participating processor(%s)\n"
2287 " memory transaction type(%s)\n"
2288 " cache level(%s) Error Found by: %s\n",
2289 to_msgs[ec_to],
2290 ii_msgs[ec_ii],
2291 pp_msgs[ec_pp],
2292 rrrr_msgs[ec_rrrr],
2293 ll_msgs[ec_ll],
2294 (info->nbsh & K8_NBSH_ERR_SCRUBER) ?
2295 "Scrubber" : "Normal Operation");
2296
2297 /* If this was an 'observed' error, early out */
2298 if (ec_pp == K8_NBSL_PP_OBS)
2299 return; /* We aren't the node involved */
2300
2301 /* Parse out the extended error code for ECC events */
2302 switch (ext_ec) {
2303 /* F10 changed to one Extended ECC error code */
2304 case F10_NBSL_EXT_ERR_RES: /* Reserved field */
2305 case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */
2306 break;
2307 2295
2308 default: 2296 /* Do only ECC errors */
2309 amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error " 2297 if (xec && xec != F10_NBSL_EXT_ERR_ECC)
2310 "handling for this error\n");
2311 return; 2298 return;
2312 }
2313 2299
2314 if (info->nbsh & K8_NBSH_CECC) 2300 if (ecc_type == 2)
2315 amd64_handle_ce(mci, info); 2301 amd64_handle_ce(mci, info);
2316 else if (info->nbsh & K8_NBSH_UECC) 2302 else if (ecc_type == 1)
2317 amd64_handle_ue(mci, info); 2303 amd64_handle_ue(mci, info);
2318 2304
2319 /* 2305 /*
@@ -2324,139 +2310,26 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
2324 * catastrophic. 2310 * catastrophic.
2325 */ 2311 */
2326 if (info->nbsh & K8_NBSH_OVERFLOW) 2312 if (info->nbsh & K8_NBSH_OVERFLOW)
2327 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR 2313 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
2328 "Error Overflow set");
2329} 2314}
2330 2315
2331int amd64_process_error_info(struct mem_ctl_info *mci, 2316void amd64_decode_bus_error(int node_id, struct err_regs *regs)
2332 struct amd64_error_info_regs *info,
2333 int handle_errors)
2334{ 2317{
2335 struct amd64_pvt *pvt; 2318 struct mem_ctl_info *mci = mci_lookup[node_id];
2336 struct amd64_error_info_regs *regs;
2337 u32 err_code, ext_ec;
2338 int gart_tlb_error = 0;
2339
2340 pvt = mci->pvt_info;
2341
2342 /* If caller doesn't want us to process the error, return */
2343 if (!handle_errors)
2344 return 1;
2345
2346 regs = info;
2347
2348 debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
2349 debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
2350 pvt->mc_node_id, regs->nbeah, regs->nbeal);
2351 debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n",
2352 regs->nbsh, regs->nbsl);
2353 debugf1(" Valid Error=%s Overflow=%s\n",
2354 (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
2355 (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
2356 debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n",
2357 (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
2358 "True" : "False",
2359 (regs->nbsh & K8_NBSH_ERR_ENABLE) ?
2360 "True" : "False");
2361 debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
2362 (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
2363 "True" : "False",
2364 (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
2365 "True" : "False",
2366 (regs->nbsh & K8_NBSH_PCC) ?
2367 "True" : "False");
2368 debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n",
2369 (regs->nbsh & K8_NBSH_CECC) ?
2370 "True" : "False",
2371 (regs->nbsh & K8_NBSH_UECC) ?
2372 "True" : "False",
2373 (regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
2374 "True" : "False");
2375 debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
2376 (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
2377 (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
2378 (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
2379 (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
2380
2381
2382 err_code = EXTRACT_ERROR_CODE(regs->nbsl);
2383
2384 /* Determine which error type:
2385 * 1) GART errors - non-fatal, developmental events
2386 * 2) MEMORY errors
2387 * 3) BUS errors
2388 * 4) Unknown error
2389 */
2390 if (TEST_TLB_ERROR(err_code)) {
2391 /*
2392 * GART errors are intended to help graphics driver developers
2393 * to detect bad GART PTEs. It is recommended by AMD to disable
2394 * GART table walk error reporting by default[1] (currently
2395 * being disabled in mce_cpu_quirks()) and according to the
2396 * comment in mce_cpu_quirks(), such GART errors can be
2397 * incorrectly triggered. We may see these errors anyway and
2398 * unless requested by the user, they won't be reported.
2399 *
2400 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
2401 * AMD NPT family 0Fh processors
2402 */
2403 if (report_gart_errors == 0)
2404 return 1;
2405
2406 /*
2407 * Only if GART error reporting is requested should we generate
2408 * any logs.
2409 */
2410 gart_tlb_error = 1;
2411
2412 debugf1("GART TLB error\n");
2413 amd64_decode_gart_tlb_error(mci, info);
2414 } else if (TEST_MEM_ERROR(err_code)) {
2415 debugf1("Memory/Cache error\n");
2416 amd64_decode_mem_cache_error(mci, info);
2417 } else if (TEST_BUS_ERROR(err_code)) {
2418 debugf1("Bus (Link/DRAM) error\n");
2419 amd64_decode_bus_error(mci, info);
2420 } else {
2421 /* shouldn't reach here! */
2422 amd64_mc_printk(mci, KERN_WARNING,
2423 "%s(): unknown MCE error 0x%x\n", __func__,
2424 err_code);
2425 }
2426
2427 ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
2428 amd64_mc_printk(mci, KERN_ERR,
2429 "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
2430 2319
2431 if (((ext_ec >= F10_NBSL_EXT_ERR_CRC && 2320 __amd64_decode_bus_error(mci, regs);
2432 ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
2433 (ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
2434 EXTRACT_LDT_LINK(info->nbsh)) {
2435
2436 amd64_mc_printk(mci, KERN_ERR,
2437 "Error on hypertransport link: %s\n",
2438 htlink_msgs[
2439 EXTRACT_LDT_LINK(info->nbsh)]);
2440 }
2441 2321
2442 /* 2322 /*
2443 * Check the UE bit of the NB status high register, if set generate some 2323 * Check the UE bit of the NB status high register, if set generate some
2444 * logs. If NOT a GART error, then process the event as a NO-INFO event. 2324 * logs. If NOT a GART error, then process the event as a NO-INFO event.
2445 * If it was a GART error, skip that process. 2325 * If it was a GART error, skip that process.
2326 *
2327 * FIXME: this should go somewhere else, if at all.
2446 */ 2328 */
2447 if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) { 2329 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2448 amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n"); 2330 edac_mc_handle_ue_no_info(mci, "UE bit is set");
2449 if (!gart_tlb_error)
2450 edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
2451 }
2452
2453 if (regs->nbsh & K8_NBSH_PCC)
2454 amd64_mc_printk(mci, KERN_CRIT,
2455 "PCC (processor context corrupt) set\n");
2456 2331
2457 return 1;
2458} 2332}
2459EXPORT_SYMBOL_GPL(amd64_process_error_info);
2460 2333
2461/* 2334/*
2462 * The main polling 'check' function, called FROM the edac core to perform the 2335 * The main polling 'check' function, called FROM the edac core to perform the
@@ -2464,10 +2337,12 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info);
2464 */ 2337 */
2465static void amd64_check(struct mem_ctl_info *mci) 2338static void amd64_check(struct mem_ctl_info *mci)
2466{ 2339{
2467 struct amd64_error_info_regs info; 2340 struct err_regs regs;
2468 2341
2469 if (amd64_get_error_info(mci, &info)) 2342 if (amd64_get_error_info(mci, &regs)) {
2470 amd64_process_error_info(mci, &info, 1); 2343 struct amd64_pvt *pvt = mci->pvt_info;
2344 amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
2345 }
2471} 2346}
2472 2347
2473/* 2348/*
@@ -3163,6 +3038,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
3163 3038
3164 mci_lookup[node_id] = mci; 3039 mci_lookup[node_id] = mci;
3165 pvt_lookup[node_id] = NULL; 3040 pvt_lookup[node_id] = NULL;
3041
3042 /* register stuff with EDAC MCE */
3043 if (report_gart_errors)
3044 amd_report_gart_errors(true);
3045
3046 amd_register_ecc_decoder(amd64_decode_bus_error);
3047
3166 return 0; 3048 return 0;
3167 3049
3168err_add_mc: 3050err_add_mc:
@@ -3229,6 +3111,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
3229 3111
3230 mci_lookup[pvt->mc_node_id] = NULL; 3112 mci_lookup[pvt->mc_node_id] = NULL;
3231 3113
3114 /* unregister from EDAC MCE */
3115 amd_report_gart_errors(false);
3116 amd_unregister_ecc_decoder(amd64_decode_bus_error);
3117
3232 /* Free the EDAC CORE resources */ 3118 /* Free the EDAC CORE resources */
3233 edac_mc_free(mci); 3119 edac_mc_free(mci);
3234} 3120}
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index ba73015af8e4..8ea07e2715dc 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -72,6 +72,7 @@
72#include <linux/edac.h> 72#include <linux/edac.h>
73#include <asm/msr.h> 73#include <asm/msr.h>
74#include "edac_core.h" 74#include "edac_core.h"
75#include "edac_mce_amd.h"
75 76
76#define amd64_printk(level, fmt, arg...) \ 77#define amd64_printk(level, fmt, arg...) \
77 edac_printk(level, "amd64", fmt, ##arg) 78 edac_printk(level, "amd64", fmt, ##arg)
@@ -303,21 +304,9 @@ enum {
303#define K8_NBSL 0x48 304#define K8_NBSL 0x48
304 305
305 306
306#define EXTRACT_HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
307#define EXTRACT_EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
308
309/* Family F10h: Normalized Extended Error Codes */ 307/* Family F10h: Normalized Extended Error Codes */
310#define F10_NBSL_EXT_ERR_RES 0x0 308#define F10_NBSL_EXT_ERR_RES 0x0
311#define F10_NBSL_EXT_ERR_CRC 0x1
312#define F10_NBSL_EXT_ERR_SYNC 0x2
313#define F10_NBSL_EXT_ERR_MST 0x3
314#define F10_NBSL_EXT_ERR_TGT 0x4
315#define F10_NBSL_EXT_ERR_GART 0x5
316#define F10_NBSL_EXT_ERR_RMW 0x6
317#define F10_NBSL_EXT_ERR_WDT 0x7
318#define F10_NBSL_EXT_ERR_ECC 0x8 309#define F10_NBSL_EXT_ERR_ECC 0x8
319#define F10_NBSL_EXT_ERR_DEV 0x9
320#define F10_NBSL_EXT_ERR_LINK_DATA 0xA
321 310
322/* Next two are overloaded values */ 311/* Next two are overloaded values */
323#define F10_NBSL_EXT_ERR_LINK_PROTO 0xB 312#define F10_NBSL_EXT_ERR_LINK_PROTO 0xB
@@ -348,17 +337,6 @@ enum {
348#define K8_NBSL_EXT_ERR_CHIPKILL_ECC 0x8 337#define K8_NBSL_EXT_ERR_CHIPKILL_ECC 0x8
349#define K8_NBSL_EXT_ERR_DRAM_PARITY 0xD 338#define K8_NBSL_EXT_ERR_DRAM_PARITY 0xD
350 339
351#define EXTRACT_ERROR_CODE(x) ((x) & 0xffff)
352#define TEST_TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
353#define TEST_MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
354#define TEST_BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
355#define EXTRACT_TT_CODE(x) (((x) >> 2) & 0x3)
356#define EXTRACT_II_CODE(x) (((x) >> 2) & 0x3)
357#define EXTRACT_LL_CODE(x) (((x) >> 0) & 0x3)
358#define EXTRACT_RRRR_CODE(x) (((x) >> 4) & 0xf)
359#define EXTRACT_TO_CODE(x) (((x) >> 8) & 0x1)
360#define EXTRACT_PP_CODE(x) (((x) >> 9) & 0x3)
361
362/* 340/*
363 * The following are for BUS type errors AFTER values have been normalized by 341 * The following are for BUS type errors AFTER values have been normalized by
364 * shifting right 342 * shifting right
@@ -368,28 +346,7 @@ enum {
368#define K8_NBSL_PP_OBS 0x2 346#define K8_NBSL_PP_OBS 0x2
369#define K8_NBSL_PP_GENERIC 0x3 347#define K8_NBSL_PP_GENERIC 0x3
370 348
371
372#define K8_NBSH 0x4C
373
374#define K8_NBSH_VALID_BIT BIT(31)
375#define K8_NBSH_OVERFLOW BIT(30)
376#define K8_NBSH_UNCORRECTED_ERR BIT(29)
377#define K8_NBSH_ERR_ENABLE BIT(28)
378#define K8_NBSH_MISC_ERR_VALID BIT(27)
379#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
380#define K8_NBSH_PCC BIT(25)
381#define K8_NBSH_CECC BIT(14)
382#define K8_NBSH_UECC BIT(13)
383#define K8_NBSH_ERR_SCRUBER BIT(8)
384#define K8_NBSH_CORE3 BIT(3)
385#define K8_NBSH_CORE2 BIT(2)
386#define K8_NBSH_CORE1 BIT(1)
387#define K8_NBSH_CORE0 BIT(0)
388
389#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7)
390#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) 349#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
391#define EXTRACT_LOW_SYNDROME(x) (((x) >> 15) & 0xff)
392
393 350
394#define K8_NBEAL 0x50 351#define K8_NBEAL 0x50
395#define K8_NBEAH 0x54 352#define K8_NBEAH 0x54
@@ -455,23 +412,6 @@ enum amd64_chipset_families {
455 F11_CPUS, 412 F11_CPUS,
456}; 413};
457 414
458/*
459 * Structure to hold:
460 *
461 * 1) dynamically read status and error address HW registers
462 * 2) sysfs entered values
463 * 3) MCE values
464 *
465 * Depends on entry into the modules
466 */
467struct amd64_error_info_regs {
468 u32 nbcfg;
469 u32 nbsh;
470 u32 nbsl;
471 u32 nbeah;
472 u32 nbeal;
473};
474
475/* Error injection control structure */ 415/* Error injection control structure */
476struct error_injection { 416struct error_injection {
477 u32 section; 417 u32 section;
@@ -542,7 +482,7 @@ struct amd64_pvt {
542 u32 online_spare; /* On-Line spare Reg */ 482 u32 online_spare; /* On-Line spare Reg */
543 483
544 /* temp storage for when input is received from sysfs */ 484 /* temp storage for when input is received from sysfs */
545 struct amd64_error_info_regs ctl_error_info; 485 struct err_regs ctl_error_info;
546 486
547 /* place to store error injection parameters prior to issue */ 487 /* place to store error injection parameters prior to issue */
548 struct error_injection injection; 488 struct error_injection injection;
@@ -601,11 +541,11 @@ struct low_ops {
601 int (*early_channel_count)(struct amd64_pvt *pvt); 541 int (*early_channel_count)(struct amd64_pvt *pvt);
602 542
603 u64 (*get_error_address)(struct mem_ctl_info *mci, 543 u64 (*get_error_address)(struct mem_ctl_info *mci,
604 struct amd64_error_info_regs *info); 544 struct err_regs *info);
605 void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram); 545 void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram);
606 void (*read_dram_ctl_register)(struct amd64_pvt *pvt); 546 void (*read_dram_ctl_register)(struct amd64_pvt *pvt);
607 void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci, 547 void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci,
608 struct amd64_error_info_regs *info, 548 struct err_regs *info,
609 u64 SystemAddr); 549 u64 SystemAddr);
610 int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map); 550 int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map);
611}; 551};
@@ -637,8 +577,5 @@ static inline struct low_ops *family_ops(int index)
637#define F10_MIN_SCRUB_RATE_BITS 0x5 577#define F10_MIN_SCRUB_RATE_BITS 0x5
638#define F11_MIN_SCRUB_RATE_BITS 0x6 578#define F11_MIN_SCRUB_RATE_BITS 0x6
639 579
640int amd64_process_error_info(struct mem_ctl_info *mci,
641 struct amd64_error_info_regs *info,
642 int handle_errors);
643int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, 580int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
644 u64 *hole_offset, u64 *hole_size); 581 u64 *hole_offset, u64 *hole_size);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 0a41b248a4ad..59cf2cf6e11e 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
24 24
25 /* Process the Mapping request */ 25 /* Process the Mapping request */
26 /* TODO: Add race prevention */ 26 /* TODO: Add race prevention */
27 amd64_process_error_info(mci, &pvt->ctl_error_info, 1); 27 amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
28 28
29 return count; 29 return count;
30 } 30 }
diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c
deleted file mode 100644
index f212ff12a9d8..000000000000
--- a/drivers/edac/amd64_edac_err_types.c
+++ /dev/null
@@ -1,161 +0,0 @@
1#include "amd64_edac.h"
2
3/*
4 * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
5 * for DDR2 DRAM mapping.
6 */
7u32 revf_quad_ddr2_shift[] = {
8 0, /* 0000b NULL DIMM (128mb) */
9 28, /* 0001b 256mb */
10 29, /* 0010b 512mb */
11 29, /* 0011b 512mb */
12 29, /* 0100b 512mb */
13 30, /* 0101b 1gb */
14 30, /* 0110b 1gb */
15 31, /* 0111b 2gb */
16 31, /* 1000b 2gb */
17 32, /* 1001b 4gb */
18 32, /* 1010b 4gb */
19 33, /* 1011b 8gb */
20 0, /* 1100b future */
21 0, /* 1101b future */
22 0, /* 1110b future */
23 0 /* 1111b future */
24};
25
26/*
27 * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
28 * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
29 * or higher value'.
30 *
31 *FIXME: Produce a better mapping/linearisation.
32 */
33
34struct scrubrate scrubrates[] = {
35 { 0x01, 1600000000UL},
36 { 0x02, 800000000UL},
37 { 0x03, 400000000UL},
38 { 0x04, 200000000UL},
39 { 0x05, 100000000UL},
40 { 0x06, 50000000UL},
41 { 0x07, 25000000UL},
42 { 0x08, 12284069UL},
43 { 0x09, 6274509UL},
44 { 0x0A, 3121951UL},
45 { 0x0B, 1560975UL},
46 { 0x0C, 781440UL},
47 { 0x0D, 390720UL},
48 { 0x0E, 195300UL},
49 { 0x0F, 97650UL},
50 { 0x10, 48854UL},
51 { 0x11, 24427UL},
52 { 0x12, 12213UL},
53 { 0x13, 6101UL},
54 { 0x14, 3051UL},
55 { 0x15, 1523UL},
56 { 0x16, 761UL},
57 { 0x00, 0UL}, /* scrubbing off */
58};
59
60/*
61 * string representation for the different MCA reported error types, see F3x48
62 * or MSR0000_0411.
63 */
64const char *tt_msgs[] = { /* transaction type */
65 "instruction",
66 "data",
67 "generic",
68 "reserved"
69};
70
71const char *ll_msgs[] = { /* cache level */
72 "L0",
73 "L1",
74 "L2",
75 "L3/generic"
76};
77
78const char *rrrr_msgs[] = {
79 "generic",
80 "generic read",
81 "generic write",
82 "data read",
83 "data write",
84 "inst fetch",
85 "prefetch",
86 "evict",
87 "snoop",
88 "reserved RRRR= 9",
89 "reserved RRRR= 10",
90 "reserved RRRR= 11",
91 "reserved RRRR= 12",
92 "reserved RRRR= 13",
93 "reserved RRRR= 14",
94 "reserved RRRR= 15"
95};
96
97const char *pp_msgs[] = { /* participating processor */
98 "local node originated (SRC)",
99 "local node responded to request (RES)",
100 "local node observed as 3rd party (OBS)",
101 "generic"
102};
103
104const char *to_msgs[] = {
105 "no timeout",
106 "timed out"
107};
108
109const char *ii_msgs[] = { /* memory or i/o */
110 "mem access",
111 "reserved",
112 "i/o access",
113 "generic"
114};
115
116/* Map the 5 bits of Extended Error code to the string table. */
117const char *ext_msgs[] = { /* extended error */
118 "K8 ECC error/F10 reserved", /* 0_0000b */
119 "CRC error", /* 0_0001b */
120 "sync error", /* 0_0010b */
121 "mst abort", /* 0_0011b */
122 "tgt abort", /* 0_0100b */
123 "GART error", /* 0_0101b */
124 "RMW error", /* 0_0110b */
125 "Wdog timer error", /* 0_0111b */
126 "F10-ECC/K8-Chipkill error", /* 0_1000b */
127 "DEV Error", /* 0_1001b */
128 "Link Data error", /* 0_1010b */
129 "Link or L3 Protocol error", /* 0_1011b */
130 "NB Array error", /* 0_1100b */
131 "DRAM Parity error", /* 0_1101b */
132 "Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */
133 "Res 0x0ff error", /* 0_1111b */
134 "Res 0x100 error", /* 1_0000b */
135 "Res 0x101 error", /* 1_0001b */
136 "Res 0x102 error", /* 1_0010b */
137 "Res 0x103 error", /* 1_0011b */
138 "Res 0x104 error", /* 1_0100b */
139 "Res 0x105 error", /* 1_0101b */
140 "Res 0x106 error", /* 1_0110b */
141 "Res 0x107 error", /* 1_0111b */
142 "Res 0x108 error", /* 1_1000b */
143 "Res 0x109 error", /* 1_1001b */
144 "Res 0x10A error", /* 1_1010b */
145 "Res 0x10B error", /* 1_1011b */
146 "L3 Cache Data error", /* 1_1100b */
147 "L3 CacheTag error", /* 1_1101b */
148 "L3 Cache LRU error", /* 1_1110b */
149 "Res 0x1FF error" /* 1_1111b */
150};
151
152const char *htlink_msgs[] = {
153 "none",
154 "1",
155 "2",
156 "1 2",
157 "3",
158 "1 3",
159 "2 3",
160 "1 2 3"
161};
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
new file mode 100644
index 000000000000..c8ca7136dacc
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.c
@@ -0,0 +1,422 @@
1#include <linux/module.h>
2#include "edac_mce_amd.h"
3
4static bool report_gart_errors;
5static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
6
7void amd_report_gart_errors(bool v)
8{
9 report_gart_errors = v;
10}
11EXPORT_SYMBOL_GPL(amd_report_gart_errors);
12
13void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
14{
15 nb_bus_decoder = f;
16}
17EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
18
19void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
20{
21 if (nb_bus_decoder) {
22 WARN_ON(nb_bus_decoder != f);
23
24 nb_bus_decoder = NULL;
25 }
26}
27EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
28
29/*
30 * string representation for the different MCA reported error types, see F3x48
31 * or MSR0000_0411.
32 */
33const char *tt_msgs[] = { /* transaction type */
34 "instruction",
35 "data",
36 "generic",
37 "reserved"
38};
39EXPORT_SYMBOL_GPL(tt_msgs);
40
41const char *ll_msgs[] = { /* cache level */
42 "L0",
43 "L1",
44 "L2",
45 "L3/generic"
46};
47EXPORT_SYMBOL_GPL(ll_msgs);
48
49const char *rrrr_msgs[] = {
50 "generic",
51 "generic read",
52 "generic write",
53 "data read",
54 "data write",
55 "inst fetch",
56 "prefetch",
57 "evict",
58 "snoop",
59 "reserved RRRR= 9",
60 "reserved RRRR= 10",
61 "reserved RRRR= 11",
62 "reserved RRRR= 12",
63 "reserved RRRR= 13",
64 "reserved RRRR= 14",
65 "reserved RRRR= 15"
66};
67EXPORT_SYMBOL_GPL(rrrr_msgs);
68
69const char *pp_msgs[] = { /* participating processor */
70 "local node originated (SRC)",
71 "local node responded to request (RES)",
72 "local node observed as 3rd party (OBS)",
73 "generic"
74};
75EXPORT_SYMBOL_GPL(pp_msgs);
76
77const char *to_msgs[] = {
78 "no timeout",
79 "timed out"
80};
81EXPORT_SYMBOL_GPL(to_msgs);
82
83const char *ii_msgs[] = { /* memory or i/o */
84 "mem access",
85 "reserved",
86 "i/o access",
87 "generic"
88};
89EXPORT_SYMBOL_GPL(ii_msgs);
90
91/*
92 * Map the 4 or 5 (family-specific) bits of Extended Error code to the
93 * string table.
94 */
95const char *ext_msgs[] = {
96 "K8 ECC error", /* 0_0000b */
97 "CRC error on link", /* 0_0001b */
98 "Sync error packets on link", /* 0_0010b */
99 "Master Abort during link operation", /* 0_0011b */
100 "Target Abort during link operation", /* 0_0100b */
101 "Invalid GART PTE entry during table walk", /* 0_0101b */
102 "Unsupported atomic RMW command received", /* 0_0110b */
103 "WDT error: NB transaction timeout", /* 0_0111b */
104 "ECC/ChipKill ECC error", /* 0_1000b */
105 "SVM DEV Error", /* 0_1001b */
106 "Link Data error", /* 0_1010b */
107 "Link/L3/Probe Filter Protocol error", /* 0_1011b */
108 "NB Internal Arrays Parity error", /* 0_1100b */
109 "DRAM Address/Control Parity error", /* 0_1101b */
110 "Link Transmission error", /* 0_1110b */
111 "GART/DEV Table Walk Data error" /* 0_1111b */
112 "Res 0x100 error", /* 1_0000b */
113 "Res 0x101 error", /* 1_0001b */
114 "Res 0x102 error", /* 1_0010b */
115 "Res 0x103 error", /* 1_0011b */
116 "Res 0x104 error", /* 1_0100b */
117 "Res 0x105 error", /* 1_0101b */
118 "Res 0x106 error", /* 1_0110b */
119 "Res 0x107 error", /* 1_0111b */
120 "Res 0x108 error", /* 1_1000b */
121 "Res 0x109 error", /* 1_1001b */
122 "Res 0x10A error", /* 1_1010b */
123 "Res 0x10B error", /* 1_1011b */
124 "ECC error in L3 Cache Data", /* 1_1100b */
125 "L3 Cache Tag error", /* 1_1101b */
126 "L3 Cache LRU Parity error", /* 1_1110b */
127 "Probe Filter error" /* 1_1111b */
128};
129EXPORT_SYMBOL_GPL(ext_msgs);
130
131static void amd_decode_dc_mce(u64 mc0_status)
132{
133 u32 ec = mc0_status & 0xffff;
134 u32 xec = (mc0_status >> 16) & 0xf;
135
136 pr_emerg(" Data Cache Error");
137
138 if (xec == 1 && TLB_ERROR(ec))
139 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
140 else if (xec == 0) {
141 if (mc0_status & (1ULL << 40))
142 pr_cont(" during Data Scrub.\n");
143 else if (TLB_ERROR(ec))
144 pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
145 else if (MEM_ERROR(ec)) {
146 u8 ll = ec & 0x3;
147 u8 tt = (ec >> 2) & 0x3;
148 u8 rrrr = (ec >> 4) & 0xf;
149
150 /* see F10h BKDG (31116), Table 92. */
151 if (ll == 0x1) {
152 if (tt != 0x1)
153 goto wrong_dc_mce;
154
155 pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
156
157 } else if (ll == 0x2 && rrrr == 0x3)
158 pr_cont(" during L1 linefill from L2.\n");
159 else
160 goto wrong_dc_mce;
161 } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
162 pr_cont(" during system linefill.\n");
163 else
164 goto wrong_dc_mce;
165 } else
166 goto wrong_dc_mce;
167
168 return;
169
170wrong_dc_mce:
171 pr_warning("Corrupted DC MCE info?\n");
172}
173
174static void amd_decode_ic_mce(u64 mc1_status)
175{
176 u32 ec = mc1_status & 0xffff;
177 u32 xec = (mc1_status >> 16) & 0xf;
178
179 pr_emerg(" Instruction Cache Error");
180
181 if (xec == 1 && TLB_ERROR(ec))
182 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
183 else if (xec == 0) {
184 if (TLB_ERROR(ec))
185 pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
186 else if (BUS_ERROR(ec)) {
187 if (boot_cpu_data.x86 == 0xf &&
188 (mc1_status & (1ULL << 58)))
189 pr_cont(" during system linefill.\n");
190 else
191 pr_cont(" during attempted NB data read.\n");
192 } else if (MEM_ERROR(ec)) {
193 u8 ll = ec & 0x3;
194 u8 rrrr = (ec >> 4) & 0xf;
195
196 if (ll == 0x2)
197 pr_cont(" during a linefill from L2.\n");
198 else if (ll == 0x1) {
199
200 switch (rrrr) {
201 case 0x5:
202 pr_cont(": Parity error during "
203 "data load.\n");
204 break;
205
206 case 0x7:
207 pr_cont(": Copyback Parity/Victim"
208 " error.\n");
209 break;
210
211 case 0x8:
212 pr_cont(": Tag Snoop error.\n");
213 break;
214
215 default:
216 goto wrong_ic_mce;
217 break;
218 }
219 }
220 } else
221 goto wrong_ic_mce;
222 } else
223 goto wrong_ic_mce;
224
225 return;
226
227wrong_ic_mce:
228 pr_warning("Corrupted IC MCE info?\n");
229}
230
231static void amd_decode_bu_mce(u64 mc2_status)
232{
233 u32 ec = mc2_status & 0xffff;
234 u32 xec = (mc2_status >> 16) & 0xf;
235
236 pr_emerg(" Bus Unit Error");
237
238 if (xec == 0x1)
239 pr_cont(" in the write data buffers.\n");
240 else if (xec == 0x3)
241 pr_cont(" in the victim data buffers.\n");
242 else if (xec == 0x2 && MEM_ERROR(ec))
243 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
244 else if (xec == 0x0) {
245 if (TLB_ERROR(ec))
246 pr_cont(": %s error in a Page Descriptor Cache or "
247 "Guest TLB.\n", TT_MSG(ec));
248 else if (BUS_ERROR(ec))
249 pr_cont(": %s/ECC error in data read from NB: %s.\n",
250 RRRR_MSG(ec), PP_MSG(ec));
251 else if (MEM_ERROR(ec)) {
252 u8 rrrr = (ec >> 4) & 0xf;
253
254 if (rrrr >= 0x7)
255 pr_cont(": %s error during data copyback.\n",
256 RRRR_MSG(ec));
257 else if (rrrr <= 0x1)
258 pr_cont(": %s parity/ECC error during data "
259 "access from L2.\n", RRRR_MSG(ec));
260 else
261 goto wrong_bu_mce;
262 } else
263 goto wrong_bu_mce;
264 } else
265 goto wrong_bu_mce;
266
267 return;
268
269wrong_bu_mce:
270 pr_warning("Corrupted BU MCE info?\n");
271}
272
273static void amd_decode_ls_mce(u64 mc3_status)
274{
275 u32 ec = mc3_status & 0xffff;
276 u32 xec = (mc3_status >> 16) & 0xf;
277
278 pr_emerg(" Load Store Error");
279
280 if (xec == 0x0) {
281 u8 rrrr = (ec >> 4) & 0xf;
282
283 if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
284 goto wrong_ls_mce;
285
286 pr_cont(" during %s.\n", RRRR_MSG(ec));
287 }
288 return;
289
290wrong_ls_mce:
291 pr_warning("Corrupted LS MCE info?\n");
292}
293
294void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
295{
296 u32 ec = ERROR_CODE(regs->nbsl);
297 u32 xec = EXT_ERROR_CODE(regs->nbsl);
298
299 if (!handle_errors)
300 return;
301
302 pr_emerg(" Northbridge Error, node %d", node_id);
303
304 /*
305 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
306 * value encoding has changed so interpret those differently
307 */
308 if ((boot_cpu_data.x86 == 0x10) &&
309 (boot_cpu_data.x86_model > 8)) {
310 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
311 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
312 } else {
313 pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
314 }
315
316
317 pr_emerg("%s.\n", EXT_ERR_MSG(xec));
318
319 if (BUS_ERROR(ec) && nb_bus_decoder)
320 nb_bus_decoder(node_id, regs);
321}
322EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
323
324static void amd_decode_fr_mce(u64 mc5_status)
325{
326 /* we have only one error signature so match all fields at once. */
327 if ((mc5_status & 0xffff) == 0x0f0f)
328 pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
329 else
330 pr_warning("Corrupted FR MCE info?\n");
331}
332
333static inline void amd_decode_err_code(unsigned int ec)
334{
335 if (TLB_ERROR(ec)) {
336 /*
337 * GART errors are intended to help graphics driver developers
338 * to detect bad GART PTEs. It is recommended by AMD to disable
339 * GART table walk error reporting by default[1] (currently
340 * being disabled in mce_cpu_quirks()) and according to the
341 * comment in mce_cpu_quirks(), such GART errors can be
342 * incorrectly triggered. We may see these errors anyway and
343 * unless requested by the user, they won't be reported.
344 *
345 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
346 * AMD NPT family 0Fh processors
347 */
348 if (!report_gart_errors)
349 return;
350
351 pr_emerg(" Transaction: %s, Cache Level %s\n",
352 TT_MSG(ec), LL_MSG(ec));
353 } else if (MEM_ERROR(ec)) {
354 pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
355 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
356 } else if (BUS_ERROR(ec)) {
357 pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
358 "Participating Processor: %s\n",
359 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
360 PP_MSG(ec));
361 } else
362 pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
363}
364
365void decode_mce(struct mce *m)
366{
367 struct err_regs regs;
368 int node, ecc;
369
370 pr_emerg("MC%d_STATUS: ", m->bank);
371
372 pr_cont("%sorrected error, report: %s, MiscV: %svalid, "
373 "CPU context corrupt: %s",
374 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
375 ((m->status & MCI_STATUS_EN) ? "yes" : "no"),
376 ((m->status & MCI_STATUS_MISCV) ? "" : "in"),
377 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
378
379 /* do the two bits[14:13] together */
380 ecc = m->status & (3ULL << 45);
381 if (ecc)
382 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
383
384 pr_cont("\n");
385
386 switch (m->bank) {
387 case 0:
388 amd_decode_dc_mce(m->status);
389 break;
390
391 case 1:
392 amd_decode_ic_mce(m->status);
393 break;
394
395 case 2:
396 amd_decode_bu_mce(m->status);
397 break;
398
399 case 3:
400 amd_decode_ls_mce(m->status);
401 break;
402
403 case 4:
404 regs.nbsl = (u32) m->status;
405 regs.nbsh = (u32)(m->status >> 32);
406 regs.nbeal = (u32) m->addr;
407 regs.nbeah = (u32)(m->addr >> 32);
408 node = per_cpu(cpu_llc_id, m->extcpu);
409
410 amd_decode_nb_mce(node, &regs, 1);
411 break;
412
413 case 5:
414 amd_decode_fr_mce(m->status);
415 break;
416
417 default:
418 break;
419 }
420
421 amd_decode_err_code(m->status & 0xffff);
422}
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
new file mode 100644
index 000000000000..df23ee065f79
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.h
@@ -0,0 +1,69 @@
1#ifndef _EDAC_MCE_AMD_H
2#define _EDAC_MCE_AMD_H
3
4#include <asm/mce.h>
5
6#define ERROR_CODE(x) ((x) & 0xffff)
7#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
8#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
9
10#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
11#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
12
13#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
14#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
15#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
16
17#define TT(x) (((x) >> 2) & 0x3)
18#define TT_MSG(x) tt_msgs[TT(x)]
19#define II(x) (((x) >> 2) & 0x3)
20#define II_MSG(x) ii_msgs[II(x)]
21#define LL(x) (((x) >> 0) & 0x3)
22#define LL_MSG(x) ll_msgs[LL(x)]
23#define RRRR(x) (((x) >> 4) & 0xf)
24#define RRRR_MSG(x) rrrr_msgs[RRRR(x)]
25#define TO(x) (((x) >> 8) & 0x1)
26#define TO_MSG(x) to_msgs[TO(x)]
27#define PP(x) (((x) >> 9) & 0x3)
28#define PP_MSG(x) pp_msgs[PP(x)]
29
30#define K8_NBSH 0x4C
31
32#define K8_NBSH_VALID_BIT BIT(31)
33#define K8_NBSH_OVERFLOW BIT(30)
34#define K8_NBSH_UC_ERR BIT(29)
35#define K8_NBSH_ERR_EN BIT(28)
36#define K8_NBSH_MISCV BIT(27)
37#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
38#define K8_NBSH_PCC BIT(25)
39#define K8_NBSH_ERR_CPU_VAL BIT(24)
40#define K8_NBSH_CECC BIT(14)
41#define K8_NBSH_UECC BIT(13)
42#define K8_NBSH_ERR_SCRUBER BIT(8)
43
44extern const char *tt_msgs[];
45extern const char *ll_msgs[];
46extern const char *rrrr_msgs[];
47extern const char *pp_msgs[];
48extern const char *to_msgs[];
49extern const char *ii_msgs[];
50extern const char *ext_msgs[];
51
52/*
53 * relevant NB regs
54 */
55struct err_regs {
56 u32 nbcfg;
57 u32 nbsh;
58 u32 nbsl;
59 u32 nbeah;
60 u32 nbeal;
61};
62
63
64void amd_report_gart_errors(bool);
65void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
66void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
67void amd_decode_nb_mce(int, struct err_regs *, int);
68
69#endif /* _EDAC_MCE_AMD_H */