aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
author Borislav Petkov <borislav.petkov@amd.com> 2010-09-01 08:45:20 -0400
committer Borislav Petkov <bp@amd64.org> 2010-10-21 08:47:58 -0400
commit 7cfd4a87441f5ca3018fdd1f7ad67e8a73a05dc2 (patch)
tree a74a1bb40d2ef3b5a66551562cdf775b63a27c8f /drivers/edac
parent 6337583d7dc0dced36ab98dd63de2389c95c22d9 (diff)
EDAC, MCE: Pass complete MCE info to decoders
... instead of only the MCi_STATUS info, for improved handling of certain types of errors later.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r-- drivers/edac/amd64_edac.c 13
-rw-r--r-- drivers/edac/amd64_edac_dbg.c 10
-rw-r--r-- drivers/edac/edac_mce_amd.c 74
-rw-r--r-- drivers/edac/edac_mce_amd.h 6
4 files changed, 56 insertions, 47 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e7d5d6b5dcf6..76f7cc0ee149 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
2073 amd64_handle_ue(mci, info); 2073 amd64_handle_ue(mci, info);
2074} 2074}
2075 2075
2076void amd64_decode_bus_error(int node_id, struct err_regs *regs) 2076void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg)
2077{ 2077{
2078 struct mem_ctl_info *mci = mci_lookup[node_id]; 2078 struct mem_ctl_info *mci = mci_lookup[node_id];
2079 struct err_regs regs;
2079 2080
2080 __amd64_decode_bus_error(mci, regs); 2081 regs.nbsl = (u32) m->status;
2082 regs.nbsh = (u32)(m->status >> 32);
2083 regs.nbeal = (u32) m->addr;
2084 regs.nbeah = (u32)(m->addr >> 32);
2085 regs.nbcfg = nbcfg;
2086
2087 __amd64_decode_bus_error(mci, &regs);
2081 2088
2082 /* 2089 /*
2083 * Check the UE bit of the NB status high register, if set generate some 2090 * Check the UE bit of the NB status high register, if set generate some
@@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
2086 * 2093 *
2087 * FIXME: this should go somewhere else, if at all. 2094 * FIXME: this should go somewhere else, if at all.
2088 */ 2095 */
2089 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) 2096 if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2090 edac_mc_handle_ue_no_info(mci, "UE bit is set"); 2097 edac_mc_handle_ue_no_info(mci, "UE bit is set");
2091 2098
2092} 2099}
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 22ef3fecf569..f6d5695de5b6 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -10,11 +10,14 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
10 size_t count) 10 size_t count)
11{ 11{
12 struct amd64_pvt *pvt = mci->pvt_info; 12 struct amd64_pvt *pvt = mci->pvt_info;
13 unsigned long long value; 13 u64 value;
14 int ret = 0; 14 int ret = 0;
15 struct mce m;
15 16
16 ret = strict_strtoull(data, 16, &value); 17 ret = strict_strtoull(data, 16, &value);
17 if (ret != -EINVAL) { 18 if (ret != -EINVAL) {
19 struct err_regs *regs = &pvt->ctl_error_info;
20
18 debugf0("received NBEA= 0x%llx\n", value); 21 debugf0("received NBEA= 0x%llx\n", value);
19 22
20 /* place the value into the virtual error packet */ 23 /* place the value into the virtual error packet */
@@ -22,9 +25,12 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
22 value >>= 32; 25 value >>= 32;
23 pvt->ctl_error_info.nbeah = (u32) value; 26 pvt->ctl_error_info.nbeah = (u32) value;
24 27
28 m.addr = value;
29 m.status = regs->nbsl | ((u64)regs->nbsh << 32);
30
25 /* Process the Mapping request */ 31 /* Process the Mapping request */
26 /* TODO: Add race prevention */ 32 /* TODO: Add race prevention */
27 amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info); 33 amd_decode_nb_mce(pvt->mc_node_id, &m, regs->nbcfg);
28 34
29 return count; 35 return count;
30 } 36 }
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index d0e850eea50a..6cfa881888bc 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -2,7 +2,7 @@
2#include "edac_mce_amd.h" 2#include "edac_mce_amd.h"
3 3
4static bool report_gart_errors; 4static bool report_gart_errors;
5static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); 5static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
6 6
7void amd_report_gart_errors(bool v) 7void amd_report_gart_errors(bool v)
8{ 8{
@@ -10,13 +10,13 @@ void amd_report_gart_errors(bool v)
10} 10}
11EXPORT_SYMBOL_GPL(amd_report_gart_errors); 11EXPORT_SYMBOL_GPL(amd_report_gart_errors);
12 12
13void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) 13void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
14{ 14{
15 nb_bus_decoder = f; 15 nb_bus_decoder = f;
16} 16}
17EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 17EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
18 18
19void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)) 19void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
20{ 20{
21 if (nb_bus_decoder) { 21 if (nb_bus_decoder) {
22 WARN_ON(nb_bus_decoder != f); 22 WARN_ON(nb_bus_decoder != f);
@@ -97,17 +97,17 @@ const char *ext_msgs[] = {
97}; 97};
98EXPORT_SYMBOL_GPL(ext_msgs); 98EXPORT_SYMBOL_GPL(ext_msgs);
99 99
100static void amd_decode_dc_mce(u64 mc0_status) 100static void amd_decode_dc_mce(struct mce *m)
101{ 101{
102 u32 ec = mc0_status & 0xffff; 102 u32 ec = m->status & 0xffff;
103 u32 xec = (mc0_status >> 16) & 0xf; 103 u32 xec = (m->status >> 16) & 0xf;
104 104
105 pr_emerg(HW_ERR "Data Cache Error: "); 105 pr_emerg(HW_ERR "Data Cache Error: ");
106 106
107 if (xec == 1 && TLB_ERROR(ec)) 107 if (xec == 1 && TLB_ERROR(ec))
108 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); 108 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
109 else if (xec == 0) { 109 else if (xec == 0) {
110 if (mc0_status & (1ULL << 40)) 110 if (m->status & (1ULL << 40))
111 pr_cont(" during Data Scrub.\n"); 111 pr_cont(" during Data Scrub.\n");
112 else if (TLB_ERROR(ec)) 112 else if (TLB_ERROR(ec))
113 pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); 113 pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
@@ -140,10 +140,10 @@ wrong_dc_mce:
140 pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); 140 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
141} 141}
142 142
143static void amd_decode_ic_mce(u64 mc1_status) 143static void amd_decode_ic_mce(struct mce *m)
144{ 144{
145 u32 ec = mc1_status & 0xffff; 145 u32 ec = m->status & 0xffff;
146 u32 xec = (mc1_status >> 16) & 0xf; 146 u32 xec = (m->status >> 16) & 0xf;
147 147
148 pr_emerg(HW_ERR "Instruction Cache Error"); 148 pr_emerg(HW_ERR "Instruction Cache Error");
149 149
@@ -154,7 +154,7 @@ static void amd_decode_ic_mce(u64 mc1_status)
154 pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); 154 pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
155 else if (BUS_ERROR(ec)) { 155 else if (BUS_ERROR(ec)) {
156 if (boot_cpu_data.x86 == 0xf && 156 if (boot_cpu_data.x86 == 0xf &&
157 (mc1_status & (1ULL << 58))) 157 (m->status & BIT(58)))
158 pr_cont(" during system linefill.\n"); 158 pr_cont(" during system linefill.\n");
159 else 159 else
160 pr_cont(" during attempted NB data read.\n"); 160 pr_cont(" during attempted NB data read.\n");
@@ -197,10 +197,10 @@ wrong_ic_mce:
197 pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); 197 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
198} 198}
199 199
200static void amd_decode_bu_mce(u64 mc2_status) 200static void amd_decode_bu_mce(struct mce *m)
201{ 201{
202 u32 ec = mc2_status & 0xffff; 202 u32 ec = m->status & 0xffff;
203 u32 xec = (mc2_status >> 16) & 0xf; 203 u32 xec = (m->status >> 16) & 0xf;
204 204
205 pr_emerg(HW_ERR "Bus Unit Error"); 205 pr_emerg(HW_ERR "Bus Unit Error");
206 206
@@ -239,10 +239,10 @@ wrong_bu_mce:
239 pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); 239 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
240} 240}
241 241
242static void amd_decode_ls_mce(u64 mc3_status) 242static void amd_decode_ls_mce(struct mce *m)
243{ 243{
244 u32 ec = mc3_status & 0xffff; 244 u32 ec = m->status & 0xffff;
245 u32 xec = (mc3_status >> 16) & 0xf; 245 u32 xec = (m->status >> 16) & 0xf;
246 246
247 pr_emerg(HW_ERR "Load Store Error"); 247 pr_emerg(HW_ERR "Load Store Error");
248 248
@@ -260,9 +260,11 @@ wrong_ls_mce:
260 pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); 260 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
261} 261}
262 262
263void amd_decode_nb_mce(int node_id, struct err_regs *regs) 263void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
264{ 264{
265 u32 ec = ERROR_CODE(regs->nbsl); 265 u32 ec = m->status & 0xffff;
266 u32 nbsh = (u32)(m->status >> 32);
267 u32 nbsl = (u32)m->status;
266 268
267 /* 269 /*
268 * GART TLB error reporting is disabled by default. Bail out early. 270 * GART TLB error reporting is disabled by default. Bail out early.
@@ -278,10 +280,10 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs)
278 */ 280 */
279 if ((boot_cpu_data.x86 == 0x10) && 281 if ((boot_cpu_data.x86 == 0x10) &&
280 (boot_cpu_data.x86_model > 7)) { 282 (boot_cpu_data.x86_model > 7)) {
281 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) 283 if (nbsh & K8_NBSH_ERR_CPU_VAL)
282 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); 284 pr_cont(", core: %u\n", (u8)(nbsh & 0xf));
283 } else { 285 } else {
284 u8 assoc_cpus = regs->nbsh & 0xf; 286 u8 assoc_cpus = nbsh & 0xf;
285 287
286 if (assoc_cpus > 0) 288 if (assoc_cpus > 0)
287 pr_cont(", core: %d", fls(assoc_cpus) - 1); 289 pr_cont(", core: %d", fls(assoc_cpus) - 1);
@@ -289,17 +291,17 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs)
289 pr_cont("\n"); 291 pr_cont("\n");
290 } 292 }
291 293
292 pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(regs->nbsl)); 294 pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl));
293 295
294 if (BUS_ERROR(ec) && nb_bus_decoder) 296 if (BUS_ERROR(ec) && nb_bus_decoder)
295 nb_bus_decoder(node_id, regs); 297 nb_bus_decoder(node_id, m, nbcfg);
296} 298}
297EXPORT_SYMBOL_GPL(amd_decode_nb_mce); 299EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
298 300
299static void amd_decode_fr_mce(u64 mc5_status) 301static void amd_decode_fr_mce(struct mce *m)
300{ 302{
301 /* we have only one error signature so match all fields at once. */ 303 /* we have only one error signature so match all fields at once. */
302 if ((mc5_status & 0xffff) == 0x0f0f) 304 if ((m->status & 0xffff) == 0x0f0f)
303 pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); 305 pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n");
304 else 306 else
305 pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); 307 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
@@ -326,7 +328,6 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
326 void *data) 328 void *data)
327{ 329{
328 struct mce *m = (struct mce *)data; 330 struct mce *m = (struct mce *)data;
329 struct err_regs regs;
330 int node, ecc; 331 int node, ecc;
331 332
332 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); 333 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
@@ -346,33 +347,28 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
346 347
347 switch (m->bank) { 348 switch (m->bank) {
348 case 0: 349 case 0:
349 amd_decode_dc_mce(m->status); 350 amd_decode_dc_mce(m);
350 break; 351 break;
351 352
352 case 1: 353 case 1:
353 amd_decode_ic_mce(m->status); 354 amd_decode_ic_mce(m);
354 break; 355 break;
355 356
356 case 2: 357 case 2:
357 amd_decode_bu_mce(m->status); 358 amd_decode_bu_mce(m);
358 break; 359 break;
359 360
360 case 3: 361 case 3:
361 amd_decode_ls_mce(m->status); 362 amd_decode_ls_mce(m);
362 break; 363 break;
363 364
364 case 4: 365 case 4:
365 regs.nbsl = (u32) m->status; 366 node = amd_get_nb_id(m->extcpu);
366 regs.nbsh = (u32)(m->status >> 32); 367 amd_decode_nb_mce(node, m, 0);
367 regs.nbeal = (u32) m->addr;
368 regs.nbeah = (u32)(m->addr >> 32);
369 node = amd_get_nb_id(m->extcpu);
370
371 amd_decode_nb_mce(node, &regs);
372 break; 368 break;
373 369
374 case 5: 370 case 5:
375 amd_decode_fr_mce(m->status); 371 amd_decode_fr_mce(m);
376 break; 372 break;
377 373
378 default: 374 default:
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
index 2ee499d7f898..0fba0e76c25f 100644
--- a/drivers/edac/edac_mce_amd.h
+++ b/drivers/edac/edac_mce_amd.h
@@ -63,8 +63,8 @@ struct err_regs {
63 63
64 64
65void amd_report_gart_errors(bool); 65void amd_report_gart_errors(bool);
66void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); 66void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
67void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); 67void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
68void amd_decode_nb_mce(int, struct err_regs *); 68void amd_decode_nb_mce(int, struct mce *, u32);
69 69
70#endif /* _EDAC_MCE_AMD_H */ 70#endif /* _EDAC_MCE_AMD_H */