aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c7
-rw-r--r--drivers/edac/amd64_edac.c98
-rw-r--r--drivers/edac/amd64_edac.h36
-rw-r--r--drivers/edac/amd64_edac_dbg.c2
-rw-r--r--drivers/edac/edac_mce_amd.c115
-rw-r--r--drivers/edac/edac_mce_amd.h38
6 files changed, 185 insertions, 111 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62f..b82866f6adf5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
183 set_bit(0, &mce_need_notify); 183 set_bit(0, &mce_need_notify);
184} 184}
185 185
186void __weak decode_mce(struct mce *m)
187{
188 return;
189}
190
186static void print_mce(struct mce *m) 191static void print_mce(struct mce *m)
187{ 192{
188 printk(KERN_EMERG 193 printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 210 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 211 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 212 m->apicid);
213
214 decode_mce(m);
208} 215}
209 216
210static void print_mce_head(void) 217static void print_mce_head(void)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 82f48ee90f11..2080b1e2e8a2 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2282,8 +2282,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
2282 } 2282 }
2283} 2283}
2284 2284
2285static void amd64_decode_bus_error(struct mem_ctl_info *mci, 2285static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
2286 struct err_regs *info, int ecc_type) 2286 struct err_regs *info, int ecc_type)
2287{ 2287{
2288 u32 ec = ERROR_CODE(info->nbsl); 2288 u32 ec = ERROR_CODE(info->nbsl);
2289 u32 xec = EXT_ERROR_CODE(info->nbsl); 2289 u32 xec = EXT_ERROR_CODE(info->nbsl);
@@ -2316,86 +2316,23 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
2316 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); 2316 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
2317} 2317}
2318 2318
2319void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, 2319void amd64_decode_bus_error(int node_id, struct err_regs *regs,
2320 int handle_errors) 2320 int ecc_type)
2321{ 2321{
2322 struct amd64_pvt *pvt = mci->pvt_info; 2322 struct mem_ctl_info *mci = mci_lookup[node_id];
2323 int ecc;
2324 u32 ec = ERROR_CODE(regs->nbsl);
2325 u32 xec = EXT_ERROR_CODE(regs->nbsl);
2326
2327 if (!handle_errors)
2328 return;
2329
2330 pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
2331
2332 /*
2333 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
2334 * value encoding has changed so interpret those differently
2335 */
2336 if ((boot_cpu_data.x86 == 0x10) &&
2337 (boot_cpu_data.x86_model > 8)) {
2338 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
2339 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
2340 } else {
2341 pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
2342 }
2343
2344 pr_emerg(" Error: %sorrected",
2345 ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
2346 pr_cont(", Report Error: %s",
2347 ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
2348 pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
2349 ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
2350 ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
2351
2352 /* do the two bits[14:13] together */
2353 ecc = regs->nbsh & (0x3 << 13);
2354 if (ecc)
2355 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
2356
2357 pr_cont("\n");
2358
2359 if (TLB_ERROR(ec)) {
2360 /*
2361 * GART errors are intended to help graphics driver developers
2362 * to detect bad GART PTEs. It is recommended by AMD to disable
2363 * GART table walk error reporting by default[1] (currently
2364 * being disabled in mce_cpu_quirks()) and according to the
2365 * comment in mce_cpu_quirks(), such GART errors can be
2366 * incorrectly triggered. We may see these errors anyway and
2367 * unless requested by the user, they won't be reported.
2368 *
2369 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
2370 * AMD NPT family 0Fh processors
2371 */
2372 if (!report_gart_errors)
2373 return;
2374
2375 pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
2376 TT_MSG(ec), LL_MSG(ec));
2377 } else if (MEM_ERROR(ec)) {
2378 pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
2379 " Cache Level: %s",
2380 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
2381 } else if (BUS_ERROR(ec)) {
2382 pr_emerg(" Bus (Link/DRAM) error\n");
2383 amd64_decode_bus_error(mci, regs, ecc);
2384 } else {
2385 /* shouldn't reach here! */
2386 amd64_mc_printk(mci, KERN_WARNING,
2387 "%s(): unknown MCE error 0x%x\n", __func__, ec);
2388 }
2389 2323
2390 pr_emerg("%s.\n", EXT_ERR_MSG(xec)); 2324 __amd64_decode_bus_error(mci, regs, ecc_type);
2391 2325
2392 /* 2326 /*
2393 * Check the UE bit of the NB status high register, if set generate some 2327 * Check the UE bit of the NB status high register, if set generate some
2394 * logs. If NOT a GART error, then process the event as a NO-INFO event. 2328 * logs. If NOT a GART error, then process the event as a NO-INFO event.
2395 * If it was a GART error, skip that process. 2329 * If it was a GART error, skip that process.
2330 *
2331 * FIXME: this should go somewhere else, if at all.
2396 */ 2332 */
2397 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) 2333 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2398 edac_mc_handle_ue_no_info(mci, "UE bit is set"); 2334 edac_mc_handle_ue_no_info(mci, "UE bit is set");
2335
2399} 2336}
2400 2337
2401/* 2338/*
@@ -2406,8 +2343,10 @@ static void amd64_check(struct mem_ctl_info *mci)
2406{ 2343{
2407 struct err_regs regs; 2344 struct err_regs regs;
2408 2345
2409 if (amd64_get_error_info(mci, &regs)) 2346 if (amd64_get_error_info(mci, &regs)) {
2410 amd64_decode_nb_mce(mci, &regs, 1); 2347 struct amd64_pvt *pvt = mci->pvt_info;
2348 amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
2349 }
2411} 2350}
2412 2351
2413/* 2352/*
@@ -3103,6 +3042,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
3103 3042
3104 mci_lookup[node_id] = mci; 3043 mci_lookup[node_id] = mci;
3105 pvt_lookup[node_id] = NULL; 3044 pvt_lookup[node_id] = NULL;
3045
3046 /* register stuff with EDAC MCE */
3047 if (report_gart_errors)
3048 amd_report_gart_errors(true);
3049
3050 amd_register_ecc_decoder(amd64_decode_bus_error);
3051
3106 return 0; 3052 return 0;
3107 3053
3108err_add_mc: 3054err_add_mc:
@@ -3169,6 +3115,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
3169 3115
3170 mci_lookup[pvt->mc_node_id] = NULL; 3116 mci_lookup[pvt->mc_node_id] = NULL;
3171 3117
3118 /* unregister from EDAC MCE */
3119 amd_report_gart_errors(false);
3120 amd_unregister_ecc_decoder(amd64_decode_bus_error);
3121
3172 /* Free the EDAC CORE resources */ 3122 /* Free the EDAC CORE resources */
3173 edac_mc_free(mci); 3123 edac_mc_free(mci);
3174} 3124}
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index ecab0c9fd14e..8ea07e2715dc 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -346,24 +346,8 @@ enum {
346#define K8_NBSL_PP_OBS 0x2 346#define K8_NBSL_PP_OBS 0x2
347#define K8_NBSL_PP_GENERIC 0x3 347#define K8_NBSL_PP_GENERIC 0x3
348 348
349
350#define K8_NBSH 0x4C
351
352#define K8_NBSH_VALID_BIT BIT(31)
353#define K8_NBSH_OVERFLOW BIT(30)
354#define K8_NBSH_UC_ERR BIT(29)
355#define K8_NBSH_ERR_EN BIT(28)
356#define K8_NBSH_MISCV BIT(27)
357#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
358#define K8_NBSH_PCC BIT(25)
359#define K8_NBSH_ERR_CPU_VAL BIT(24)
360#define K8_NBSH_CECC BIT(14)
361#define K8_NBSH_UECC BIT(13)
362#define K8_NBSH_ERR_SCRUBER BIT(8)
363
364#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) 349#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
365 350
366
367#define K8_NBEAL 0x50 351#define K8_NBEAL 0x50
368#define K8_NBEAH 0x54 352#define K8_NBEAH 0x54
369#define K8_SCRCTRL 0x58 353#define K8_SCRCTRL 0x58
@@ -428,23 +412,6 @@ enum amd64_chipset_families {
428 F11_CPUS, 412 F11_CPUS,
429}; 413};
430 414
431/*
432 * Structure to hold:
433 *
434 * 1) dynamically read status and error address HW registers
435 * 2) sysfs entered values
436 * 3) MCE values
437 *
438 * Depends on entry into the modules
439 */
440struct err_regs {
441 u32 nbcfg;
442 u32 nbsh;
443 u32 nbsl;
444 u32 nbeah;
445 u32 nbeal;
446};
447
448/* Error injection control structure */ 415/* Error injection control structure */
449struct error_injection { 416struct error_injection {
450 u32 section; 417 u32 section;
@@ -610,8 +577,5 @@ static inline struct low_ops *family_ops(int index)
610#define F10_MIN_SCRUB_RATE_BITS 0x5 577#define F10_MIN_SCRUB_RATE_BITS 0x5
611#define F11_MIN_SCRUB_RATE_BITS 0x6 578#define F11_MIN_SCRUB_RATE_BITS 0x6
612 579
613void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
614 int handle_errors);
615
616int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, 580int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
617 u64 *hole_offset, u64 *hole_size); 581 u64 *hole_offset, u64 *hole_size);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index bcb4e2eba3dc..59cf2cf6e11e 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
24 24
25 /* Process the Mapping request */ 25 /* Process the Mapping request */
26 /* TODO: Add race prevention */ 26 /* TODO: Add race prevention */
27 amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); 27 amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
28 28
29 return count; 29 return count;
30 } 30 }
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 918567e8cfd5..444c2cc4472d 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -1,6 +1,31 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include "edac_mce_amd.h" 2#include "edac_mce_amd.h"
3 3
4static bool report_gart_errors;
5static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type);
6
7void amd_report_gart_errors(bool v)
8{
9 report_gart_errors = v;
10}
11EXPORT_SYMBOL_GPL(amd_report_gart_errors);
12
13void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int))
14{
15 nb_bus_decoder = f;
16}
17EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
18
19void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int))
20{
21 if (nb_bus_decoder) {
22 WARN_ON(nb_bus_decoder != f);
23
24 nb_bus_decoder = NULL;
25 }
26}
27EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
28
4/* 29/*
5 * string representation for the different MCA reported error types, see F3x48 30 * string representation for the different MCA reported error types, see F3x48
6 * or MSR0000_0411. 31 * or MSR0000_0411.
@@ -102,3 +127,93 @@ const char *ext_msgs[] = {
102 "Probe Filter error" /* 1_1111b */ 127 "Probe Filter error" /* 1_1111b */
103}; 128};
104EXPORT_SYMBOL_GPL(ext_msgs); 129EXPORT_SYMBOL_GPL(ext_msgs);
130
131void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
132{
133 int ecc;
134 u32 ec = ERROR_CODE(regs->nbsl);
135 u32 xec = EXT_ERROR_CODE(regs->nbsl);
136
137 if (!handle_errors)
138 return;
139
140 pr_emerg(" Northbridge Error, node %d", node_id);
141
142 /*
143 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
144 * value encoding has changed so interpret those differently
145 */
146 if ((boot_cpu_data.x86 == 0x10) &&
147 (boot_cpu_data.x86_model > 8)) {
148 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
149 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
150 } else {
151 pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
152 }
153
154 pr_emerg(" Error: %sorrected",
155 ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
156 pr_cont(", Report Error: %s",
157 ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
158 pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
159 ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
160 ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
161
162 /* do the two bits[14:13] together */
163 ecc = regs->nbsh & (0x3 << 13);
164 if (ecc)
165 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
166
167 pr_cont("\n");
168
169 if (TLB_ERROR(ec)) {
170 /*
171 * GART errors are intended to help graphics driver developers
172 * to detect bad GART PTEs. It is recommended by AMD to disable
173 * GART table walk error reporting by default[1] (currently
174 * being disabled in mce_cpu_quirks()) and according to the
175 * comment in mce_cpu_quirks(), such GART errors can be
176 * incorrectly triggered. We may see these errors anyway and
177 * unless requested by the user, they won't be reported.
178 *
179 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
180 * AMD NPT family 0Fh processors
181 */
182 if (!report_gart_errors)
183 return;
184
185 pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
186 TT_MSG(ec), LL_MSG(ec));
187 } else if (MEM_ERROR(ec)) {
188 pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
189 " Cache Level: %s",
190 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
191 } else if (BUS_ERROR(ec)) {
192 pr_emerg(" Bus (Link/DRAM) error\n");
193 if (nb_bus_decoder)
194 nb_bus_decoder(node_id, regs, ecc);
195 } else {
196 /* shouldn't reach here! */
197 pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec);
198 }
199
200 pr_emerg("%s.\n", EXT_ERR_MSG(xec));
201}
202EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
203
204void decode_mce(struct mce *m)
205{
206 struct err_regs regs;
207 int node;
208
209 if (m->bank != 4)
210 return;
211
212 regs.nbsl = (u32) m->status;
213 regs.nbsh = (u32)(m->status >> 32);
214 regs.nbeal = (u32) m->addr;
215 regs.nbeah = (u32)(m->addr >> 32);
216 node = topology_cpu_node_id(m->extcpu);
217
218 amd_decode_nb_mce(node, &regs, 1);
219}
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
index 39971cdabb51..9114dc62782b 100644
--- a/drivers/edac/edac_mce_amd.h
+++ b/drivers/edac/edac_mce_amd.h
@@ -1,3 +1,8 @@
1#ifndef _EDAC_MCE_AMD_H
2#define _EDAC_MCE_AMD_H
3
4#include <asm/mce.h>
5
1#define ERROR_CODE(x) ((x) & 0xffff) 6#define ERROR_CODE(x) ((x) & 0xffff)
2#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) 7#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
3#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] 8#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
@@ -22,6 +27,20 @@
22#define PP(x) (((x) >> 9) & 0x3) 27#define PP(x) (((x) >> 9) & 0x3)
23#define PP_MSG(x) pp_msgs[PP(x)] 28#define PP_MSG(x) pp_msgs[PP(x)]
24 29
30#define K8_NBSH 0x4C
31
32#define K8_NBSH_VALID_BIT BIT(31)
33#define K8_NBSH_OVERFLOW BIT(30)
34#define K8_NBSH_UC_ERR BIT(29)
35#define K8_NBSH_ERR_EN BIT(28)
36#define K8_NBSH_MISCV BIT(27)
37#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
38#define K8_NBSH_PCC BIT(25)
39#define K8_NBSH_ERR_CPU_VAL BIT(24)
40#define K8_NBSH_CECC BIT(14)
41#define K8_NBSH_UECC BIT(13)
42#define K8_NBSH_ERR_SCRUBER BIT(8)
43
25extern const char *tt_msgs[]; 44extern const char *tt_msgs[];
26extern const char *ll_msgs[]; 45extern const char *ll_msgs[];
27extern const char *rrrr_msgs[]; 46extern const char *rrrr_msgs[];
@@ -29,3 +48,22 @@ extern const char *pp_msgs[];
29extern const char *to_msgs[]; 48extern const char *to_msgs[];
30extern const char *ii_msgs[]; 49extern const char *ii_msgs[];
31extern const char *ext_msgs[]; 50extern const char *ext_msgs[];
51
52/*
53 * relevant NB regs
54 */
55struct err_regs {
56 u32 nbcfg;
57 u32 nbsh;
58 u32 nbsl;
59 u32 nbeah;
60 u32 nbeal;
61};
62
63
64void amd_report_gart_errors(bool);
65void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int));
66void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int));
67void amd_decode_nb_mce(int, struct err_regs *, int);
68
69#endif /* _EDAC_MCE_AMD_H */