aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
author Borislav Petkov <borislav.petkov@amd.com> 2009-07-24 07:51:42 -0400
committer Borislav Petkov <borislav.petkov@amd.com> 2009-09-14 12:59:17 -0400
commit 549d042df240dfb4203bab40ad44f9336751b7d6 (patch)
tree af357ed8eaf06c26f19d458686b6c7ea4e425a05 /drivers/edac
parent ecaf5606de65cdd04de5f526185fe28fb0df654e (diff)
x86, mce: pass mce info to EDAC for decoding
Move NB decoder along with required defines to EDAC MCE core. Add registration routines for further decoding of the MCE info in the AMD64 EDAC module.

CC: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r-- drivers/edac/amd64_edac.c 98
-rw-r--r-- drivers/edac/amd64_edac.h 36
-rw-r--r-- drivers/edac/amd64_edac_dbg.c 2
-rw-r--r-- drivers/edac/edac_mce_amd.c 115
-rw-r--r-- drivers/edac/edac_mce_amd.h 38
5 files changed, 178 insertions, 111 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 82f48ee90f11..2080b1e2e8a2 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2282,8 +2282,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
2282 } 2282 }
2283} 2283}
2284 2284
2285static void amd64_decode_bus_error(struct mem_ctl_info *mci, 2285static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
2286 struct err_regs *info, int ecc_type) 2286 struct err_regs *info, int ecc_type)
2287{ 2287{
2288 u32 ec = ERROR_CODE(info->nbsl); 2288 u32 ec = ERROR_CODE(info->nbsl);
2289 u32 xec = EXT_ERROR_CODE(info->nbsl); 2289 u32 xec = EXT_ERROR_CODE(info->nbsl);
@@ -2316,86 +2316,23 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
2316 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); 2316 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
2317} 2317}
2318 2318
2319void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, 2319void amd64_decode_bus_error(int node_id, struct err_regs *regs,
2320 int handle_errors) 2320 int ecc_type)
2321{ 2321{
2322 struct amd64_pvt *pvt = mci->pvt_info; 2322 struct mem_ctl_info *mci = mci_lookup[node_id];
2323 int ecc;
2324 u32 ec = ERROR_CODE(regs->nbsl);
2325 u32 xec = EXT_ERROR_CODE(regs->nbsl);
2326
2327 if (!handle_errors)
2328 return;
2329
2330 pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
2331
2332 /*
2333 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
2334 * value encoding has changed so interpret those differently
2335 */
2336 if ((boot_cpu_data.x86 == 0x10) &&
2337 (boot_cpu_data.x86_model > 8)) {
2338 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
2339 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
2340 } else {
2341 pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
2342 }
2343
2344 pr_emerg(" Error: %sorrected",
2345 ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
2346 pr_cont(", Report Error: %s",
2347 ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
2348 pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
2349 ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
2350 ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
2351
2352 /* do the two bits[14:13] together */
2353 ecc = regs->nbsh & (0x3 << 13);
2354 if (ecc)
2355 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
2356
2357 pr_cont("\n");
2358
2359 if (TLB_ERROR(ec)) {
2360 /*
2361 * GART errors are intended to help graphics driver developers
2362 * to detect bad GART PTEs. It is recommended by AMD to disable
2363 * GART table walk error reporting by default[1] (currently
2364 * being disabled in mce_cpu_quirks()) and according to the
2365 * comment in mce_cpu_quirks(), such GART errors can be
2366 * incorrectly triggered. We may see these errors anyway and
2367 * unless requested by the user, they won't be reported.
2368 *
2369 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
2370 * AMD NPT family 0Fh processors
2371 */
2372 if (!report_gart_errors)
2373 return;
2374
2375 pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
2376 TT_MSG(ec), LL_MSG(ec));
2377 } else if (MEM_ERROR(ec)) {
2378 pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
2379 " Cache Level: %s",
2380 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
2381 } else if (BUS_ERROR(ec)) {
2382 pr_emerg(" Bus (Link/DRAM) error\n");
2383 amd64_decode_bus_error(mci, regs, ecc);
2384 } else {
2385 /* shouldn't reach here! */
2386 amd64_mc_printk(mci, KERN_WARNING,
2387 "%s(): unknown MCE error 0x%x\n", __func__, ec);
2388 }
2389 2323
2390 pr_emerg("%s.\n", EXT_ERR_MSG(xec)); 2324 __amd64_decode_bus_error(mci, regs, ecc_type);
2391 2325
2392 /* 2326 /*
2393 * Check the UE bit of the NB status high register, if set generate some 2327 * Check the UE bit of the NB status high register, if set generate some
2394 * logs. If NOT a GART error, then process the event as a NO-INFO event. 2328 * logs. If NOT a GART error, then process the event as a NO-INFO event.
2395 * If it was a GART error, skip that process. 2329 * If it was a GART error, skip that process.
2330 *
2331 * FIXME: this should go somewhere else, if at all.
2396 */ 2332 */
2397 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) 2333 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2398 edac_mc_handle_ue_no_info(mci, "UE bit is set"); 2334 edac_mc_handle_ue_no_info(mci, "UE bit is set");
2335
2399} 2336}
2400 2337
2401/* 2338/*
@@ -2406,8 +2343,10 @@ static void amd64_check(struct mem_ctl_info *mci)
2406{ 2343{
2407 struct err_regs regs; 2344 struct err_regs regs;
2408 2345
2409 if (amd64_get_error_info(mci, &regs)) 2346 if (amd64_get_error_info(mci, &regs)) {
2410 amd64_decode_nb_mce(mci, &regs, 1); 2347 struct amd64_pvt *pvt = mci->pvt_info;
2348 amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
2349 }
2411} 2350}
2412 2351
2413/* 2352/*
@@ -3103,6 +3042,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
3103 3042
3104 mci_lookup[node_id] = mci; 3043 mci_lookup[node_id] = mci;
3105 pvt_lookup[node_id] = NULL; 3044 pvt_lookup[node_id] = NULL;
3045
3046 /* register stuff with EDAC MCE */
3047 if (report_gart_errors)
3048 amd_report_gart_errors(true);
3049
3050 amd_register_ecc_decoder(amd64_decode_bus_error);
3051
3106 return 0; 3052 return 0;
3107 3053
3108err_add_mc: 3054err_add_mc:
@@ -3169,6 +3115,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
3169 3115
3170 mci_lookup[pvt->mc_node_id] = NULL; 3116 mci_lookup[pvt->mc_node_id] = NULL;
3171 3117
3118 /* unregister from EDAC MCE */
3119 amd_report_gart_errors(false);
3120 amd_unregister_ecc_decoder(amd64_decode_bus_error);
3121
3172 /* Free the EDAC CORE resources */ 3122 /* Free the EDAC CORE resources */
3173 edac_mc_free(mci); 3123 edac_mc_free(mci);
3174} 3124}
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index ecab0c9fd14e..8ea07e2715dc 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -346,24 +346,8 @@ enum {
346#define K8_NBSL_PP_OBS 0x2 346#define K8_NBSL_PP_OBS 0x2
347#define K8_NBSL_PP_GENERIC 0x3 347#define K8_NBSL_PP_GENERIC 0x3
348 348
349
350#define K8_NBSH 0x4C
351
352#define K8_NBSH_VALID_BIT BIT(31)
353#define K8_NBSH_OVERFLOW BIT(30)
354#define K8_NBSH_UC_ERR BIT(29)
355#define K8_NBSH_ERR_EN BIT(28)
356#define K8_NBSH_MISCV BIT(27)
357#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
358#define K8_NBSH_PCC BIT(25)
359#define K8_NBSH_ERR_CPU_VAL BIT(24)
360#define K8_NBSH_CECC BIT(14)
361#define K8_NBSH_UECC BIT(13)
362#define K8_NBSH_ERR_SCRUBER BIT(8)
363
364#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) 349#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
365 350
366
367#define K8_NBEAL 0x50 351#define K8_NBEAL 0x50
368#define K8_NBEAH 0x54 352#define K8_NBEAH 0x54
369#define K8_SCRCTRL 0x58 353#define K8_SCRCTRL 0x58
@@ -428,23 +412,6 @@ enum amd64_chipset_families {
428 F11_CPUS, 412 F11_CPUS,
429}; 413};
430 414
431/*
432 * Structure to hold:
433 *
434 * 1) dynamically read status and error address HW registers
435 * 2) sysfs entered values
436 * 3) MCE values
437 *
438 * Depends on entry into the modules
439 */
440struct err_regs {
441 u32 nbcfg;
442 u32 nbsh;
443 u32 nbsl;
444 u32 nbeah;
445 u32 nbeal;
446};
447
448/* Error injection control structure */ 415/* Error injection control structure */
449struct error_injection { 416struct error_injection {
450 u32 section; 417 u32 section;
@@ -610,8 +577,5 @@ static inline struct low_ops *family_ops(int index)
610#define F10_MIN_SCRUB_RATE_BITS 0x5 577#define F10_MIN_SCRUB_RATE_BITS 0x5
611#define F11_MIN_SCRUB_RATE_BITS 0x6 578#define F11_MIN_SCRUB_RATE_BITS 0x6
612 579
613void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
614 int handle_errors);
615
616int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, 580int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
617 u64 *hole_offset, u64 *hole_size); 581 u64 *hole_offset, u64 *hole_size);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index bcb4e2eba3dc..59cf2cf6e11e 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
24 24
25 /* Process the Mapping request */ 25 /* Process the Mapping request */
26 /* TODO: Add race prevention */ 26 /* TODO: Add race prevention */
27 amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); 27 amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
28 28
29 return count; 29 return count;
30 } 30 }
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 918567e8cfd5..444c2cc4472d 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -1,6 +1,31 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include "edac_mce_amd.h" 2#include "edac_mce_amd.h"
3 3
4static bool report_gart_errors;
5static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type);
6
7void amd_report_gart_errors(bool v)
8{
9 report_gart_errors = v;
10}
11EXPORT_SYMBOL_GPL(amd_report_gart_errors);
12
13void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int))
14{
15 nb_bus_decoder = f;
16}
17EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
18
19void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int))
20{
21 if (nb_bus_decoder) {
22 WARN_ON(nb_bus_decoder != f);
23
24 nb_bus_decoder = NULL;
25 }
26}
27EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
28
4/* 29/*
5 * string representation for the different MCA reported error types, see F3x48 30 * string representation for the different MCA reported error types, see F3x48
6 * or MSR0000_0411. 31 * or MSR0000_0411.
@@ -102,3 +127,93 @@ const char *ext_msgs[] = {
102 "Probe Filter error" /* 1_1111b */ 127 "Probe Filter error" /* 1_1111b */
103}; 128};
104EXPORT_SYMBOL_GPL(ext_msgs); 129EXPORT_SYMBOL_GPL(ext_msgs);
130
131void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
132{
133 int ecc;
134 u32 ec = ERROR_CODE(regs->nbsl);
135 u32 xec = EXT_ERROR_CODE(regs->nbsl);
136
137 if (!handle_errors)
138 return;
139
140 pr_emerg(" Northbridge Error, node %d", node_id);
141
142 /*
143 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
144 * value encoding has changed so interpret those differently
145 */
146 if ((boot_cpu_data.x86 == 0x10) &&
147 (boot_cpu_data.x86_model > 8)) {
148 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
149 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
150 } else {
151 pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
152 }
153
154 pr_emerg(" Error: %sorrected",
155 ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
156 pr_cont(", Report Error: %s",
157 ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
158 pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
159 ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
160 ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
161
162 /* do the two bits[14:13] together */
163 ecc = regs->nbsh & (0x3 << 13);
164 if (ecc)
165 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
166
167 pr_cont("\n");
168
169 if (TLB_ERROR(ec)) {
170 /*
171 * GART errors are intended to help graphics driver developers
172 * to detect bad GART PTEs. It is recommended by AMD to disable
173 * GART table walk error reporting by default[1] (currently
174 * being disabled in mce_cpu_quirks()) and according to the
175 * comment in mce_cpu_quirks(), such GART errors can be
176 * incorrectly triggered. We may see these errors anyway and
177 * unless requested by the user, they won't be reported.
178 *
179 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
180 * AMD NPT family 0Fh processors
181 */
182 if (!report_gart_errors)
183 return;
184
185 pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
186 TT_MSG(ec), LL_MSG(ec));
187 } else if (MEM_ERROR(ec)) {
188 pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
189 " Cache Level: %s",
190 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
191 } else if (BUS_ERROR(ec)) {
192 pr_emerg(" Bus (Link/DRAM) error\n");
193 if (nb_bus_decoder)
194 nb_bus_decoder(node_id, regs, ecc);
195 } else {
196 /* shouldn't reach here! */
197 pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec);
198 }
199
200 pr_emerg("%s.\n", EXT_ERR_MSG(xec));
201}
202EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
203
204void decode_mce(struct mce *m)
205{
206 struct err_regs regs;
207 int node;
208
209 if (m->bank != 4)
210 return;
211
212 regs.nbsl = (u32) m->status;
213 regs.nbsh = (u32)(m->status >> 32);
214 regs.nbeal = (u32) m->addr;
215 regs.nbeah = (u32)(m->addr >> 32);
216 node = topology_cpu_node_id(m->extcpu);
217
218 amd_decode_nb_mce(node, &regs, 1);
219}
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
index 39971cdabb51..9114dc62782b 100644
--- a/drivers/edac/edac_mce_amd.h
+++ b/drivers/edac/edac_mce_amd.h
@@ -1,3 +1,8 @@
1#ifndef _EDAC_MCE_AMD_H
2#define _EDAC_MCE_AMD_H
3
4#include <asm/mce.h>
5
1#define ERROR_CODE(x) ((x) & 0xffff) 6#define ERROR_CODE(x) ((x) & 0xffff)
2#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) 7#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
3#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] 8#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
@@ -22,6 +27,20 @@
22#define PP(x) (((x) >> 9) & 0x3) 27#define PP(x) (((x) >> 9) & 0x3)
23#define PP_MSG(x) pp_msgs[PP(x)] 28#define PP_MSG(x) pp_msgs[PP(x)]
24 29
30#define K8_NBSH 0x4C
31
32#define K8_NBSH_VALID_BIT BIT(31)
33#define K8_NBSH_OVERFLOW BIT(30)
34#define K8_NBSH_UC_ERR BIT(29)
35#define K8_NBSH_ERR_EN BIT(28)
36#define K8_NBSH_MISCV BIT(27)
37#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
38#define K8_NBSH_PCC BIT(25)
39#define K8_NBSH_ERR_CPU_VAL BIT(24)
40#define K8_NBSH_CECC BIT(14)
41#define K8_NBSH_UECC BIT(13)
42#define K8_NBSH_ERR_SCRUBER BIT(8)
43
25extern const char *tt_msgs[]; 44extern const char *tt_msgs[];
26extern const char *ll_msgs[]; 45extern const char *ll_msgs[];
27extern const char *rrrr_msgs[]; 46extern const char *rrrr_msgs[];
@@ -29,3 +48,22 @@ extern const char *pp_msgs[];
29extern const char *to_msgs[]; 48extern const char *to_msgs[];
30extern const char *ii_msgs[]; 49extern const char *ii_msgs[];
31extern const char *ext_msgs[]; 50extern const char *ext_msgs[];
51
52/*
53 * relevant NB regs
54 */
55struct err_regs {
56 u32 nbcfg;
57 u32 nbsh;
58 u32 nbsl;
59 u32 nbeah;
60 u32 nbeal;
61};
62
63
64void amd_report_gart_errors(bool);
65void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int));
66void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int));
67void amd_decode_nb_mce(int, struct err_regs *, int);
68
69#endif /* _EDAC_MCE_AMD_H */