diff options
| author | Borislav Petkov <borislav.petkov@amd.com> | 2010-09-01 08:45:20 -0400 |
|---|---|---|
| committer | Borislav Petkov <bp@amd64.org> | 2010-10-21 08:47:58 -0400 |
| commit | 7cfd4a87441f5ca3018fdd1f7ad67e8a73a05dc2 (patch) | |
| tree | a74a1bb40d2ef3b5a66551562cdf775b63a27c8f | |
| parent | 6337583d7dc0dced36ab98dd63de2389c95c22d9 (diff) | |
EDAC, MCE: Pass complete MCE info to decoders
... instead of the MCi_STATUS info only for improved handling of certain
types of errors later.
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
| -rw-r--r-- | drivers/edac/amd64_edac.c | 13 | ||||
| -rw-r--r-- | drivers/edac/amd64_edac_dbg.c | 10 | ||||
| -rw-r--r-- | drivers/edac/edac_mce_amd.c | 74 | ||||
| -rw-r--r-- | drivers/edac/edac_mce_amd.h | 6 |
4 files changed, 56 insertions, 47 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e7d5d6b5dcf6..76f7cc0ee149 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c | |||
| @@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, | |||
| 2073 | amd64_handle_ue(mci, info); | 2073 | amd64_handle_ue(mci, info); |
| 2074 | } | 2074 | } |
| 2075 | 2075 | ||
| 2076 | void amd64_decode_bus_error(int node_id, struct err_regs *regs) | 2076 | void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg) |
| 2077 | { | 2077 | { |
| 2078 | struct mem_ctl_info *mci = mci_lookup[node_id]; | 2078 | struct mem_ctl_info *mci = mci_lookup[node_id]; |
| 2079 | struct err_regs regs; | ||
| 2079 | 2080 | ||
| 2080 | __amd64_decode_bus_error(mci, regs); | 2081 | regs.nbsl = (u32) m->status; |
| 2082 | regs.nbsh = (u32)(m->status >> 32); | ||
| 2083 | regs.nbeal = (u32) m->addr; | ||
| 2084 | regs.nbeah = (u32)(m->addr >> 32); | ||
| 2085 | regs.nbcfg = nbcfg; | ||
| 2086 | |||
| 2080 | __amd64_decode_bus_error(mci, regs); | 2087 | __amd64_decode_bus_error(mci, &regs); |
| 2081 | 2088 | ||
| 2082 | /* | 2089 | /* |
| 2083 | * Check the UE bit of the NB status high register, if set generate some | 2090 | * Check the UE bit of the NB status high register, if set generate some |
| @@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs) | |||
| 2086 | * | 2093 | * |
| 2087 | * FIXME: this should go somewhere else, if at all. | 2094 | * FIXME: this should go somewhere else, if at all. |
| 2088 | */ | 2095 | */ |
| 2089 | if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) | 2096 | if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors) |
| 2090 | edac_mc_handle_ue_no_info(mci, "UE bit is set"); | 2097 | edac_mc_handle_ue_no_info(mci, "UE bit is set"); |
| 2091 | 2098 | ||
| 2092 | } | 2099 | } |
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index 22ef3fecf569..f6d5695de5b6 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c | |||
| @@ -10,11 +10,14 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, | |||
| 10 | size_t count) | 10 | size_t count) |
| 11 | { | 11 | { |
| 12 | struct amd64_pvt *pvt = mci->pvt_info; | 12 | struct amd64_pvt *pvt = mci->pvt_info; |
| 13 | unsigned long long value; | 13 | u64 value; |
| 14 | int ret = 0; | 14 | int ret = 0; |
| 15 | struct mce m; | ||
| 15 | 16 | ||
| 16 | ret = strict_strtoull(data, 16, &value); | 17 | ret = strict_strtoull(data, 16, &value); |
| 17 | if (ret != -EINVAL) { | 18 | if (ret != -EINVAL) { |
| 19 | struct err_regs *regs = &pvt->ctl_error_info; | ||
| 20 | |||
| 18 | debugf0("received NBEA= 0x%llx\n", value); | 21 | debugf0("received NBEA= 0x%llx\n", value); |
| 19 | 22 | ||
| 20 | /* place the value into the virtual error packet */ | 23 | /* place the value into the virtual error packet */ |
| @@ -22,9 +25,12 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, | |||
| 22 | value >>= 32; | 25 | value >>= 32; |
| 23 | pvt->ctl_error_info.nbeah = (u32) value; | 26 | pvt->ctl_error_info.nbeah = (u32) value; |
| 24 | 27 | ||
| 28 | m.addr = value; | ||
| 29 | m.status = regs->nbsl | ((u64)regs->nbsh << 32); | ||
| 30 | |||
| 25 | /* Process the Mapping request */ | 31 | /* Process the Mapping request */ |
| 26 | /* TODO: Add race prevention */ | 32 | /* TODO: Add race prevention */ |
| 27 | amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info); | 33 | amd_decode_nb_mce(pvt->mc_node_id, &m, regs->nbcfg); |
| 28 | 34 | ||
| 29 | return count; | 35 | return count; |
| 30 | } | 36 | } |
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index d0e850eea50a..6cfa881888bc 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | #include "edac_mce_amd.h" | 2 | #include "edac_mce_amd.h" |
| 3 | 3 | ||
| 4 | static bool report_gart_errors; | 4 | static bool report_gart_errors; |
| 5 | static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); | 5 | static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); |
| 6 | 6 | ||
| 7 | void amd_report_gart_errors(bool v) | 7 | void amd_report_gart_errors(bool v) |
| 8 | { | 8 | { |
| @@ -10,13 +10,13 @@ void amd_report_gart_errors(bool v) | |||
| 10 | } | 10 | } |
| 11 | EXPORT_SYMBOL_GPL(amd_report_gart_errors); | 11 | EXPORT_SYMBOL_GPL(amd_report_gart_errors); |
| 12 | 12 | ||
| 13 | void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) | 13 | void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) |
| 14 | { | 14 | { |
| 15 | nb_bus_decoder = f; | 15 | nb_bus_decoder = f; |
| 16 | } | 16 | } |
| 17 | EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); | 17 | EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); |
| 18 | 18 | ||
| 19 | void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)) | 19 | void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) |
| 20 | { | 20 | { |
| 21 | if (nb_bus_decoder) { | 21 | if (nb_bus_decoder) { |
| 22 | WARN_ON(nb_bus_decoder != f); | 22 | WARN_ON(nb_bus_decoder != f); |
| @@ -97,17 +97,17 @@ const char *ext_msgs[] = { | |||
| 97 | }; | 97 | }; |
| 98 | EXPORT_SYMBOL_GPL(ext_msgs); | 98 | EXPORT_SYMBOL_GPL(ext_msgs); |
| 99 | 99 | ||
| 100 | static void amd_decode_dc_mce(u64 mc0_status) | 100 | static void amd_decode_dc_mce(struct mce *m) |
| 101 | { | 101 | { |
| 102 | u32 ec = mc0_status & 0xffff; | 102 | u32 ec = m->status & 0xffff; |
| 103 | u32 xec = (mc0_status >> 16) & 0xf; | 103 | u32 xec = (m->status >> 16) & 0xf; |
| 104 | 104 | ||
| 105 | pr_emerg(HW_ERR "Data Cache Error: "); | 105 | pr_emerg(HW_ERR "Data Cache Error: "); |
| 106 | 106 | ||
| 107 | if (xec == 1 && TLB_ERROR(ec)) | 107 | if (xec == 1 && TLB_ERROR(ec)) |
| 108 | pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); | 108 | pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); |
| 109 | else if (xec == 0) { | 109 | else if (xec == 0) { |
| 110 | if (mc0_status & (1ULL << 40)) | 110 | if (m->status & (1ULL << 40)) |
| 111 | pr_cont(" during Data Scrub.\n"); | 111 | pr_cont(" during Data Scrub.\n"); |
| 112 | else if (TLB_ERROR(ec)) | 112 | else if (TLB_ERROR(ec)) |
| 113 | pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); | 113 | pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); |
| @@ -140,10 +140,10 @@ wrong_dc_mce: | |||
| 140 | pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); | 140 | pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); |
| 141 | } | 141 | } |
| 142 | 142 | ||
| 143 | static void amd_decode_ic_mce(u64 mc1_status) | 143 | static void amd_decode_ic_mce(struct mce *m) |
| 144 | { | 144 | { |
| 145 | u32 ec = mc1_status & 0xffff; | 145 | u32 ec = m->status & 0xffff; |
| 146 | u32 xec = (mc1_status >> 16) & 0xf; | 146 | u32 xec = (m->status >> 16) & 0xf; |
| 147 | 147 | ||
| 148 | pr_emerg(HW_ERR "Instruction Cache Error"); | 148 | pr_emerg(HW_ERR "Instruction Cache Error"); |
| 149 | 149 | ||
| @@ -154,7 +154,7 @@ static void amd_decode_ic_mce(u64 mc1_status) | |||
| 154 | pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); | 154 | pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); |
| 155 | else if (BUS_ERROR(ec)) { | 155 | else if (BUS_ERROR(ec)) { |
| 156 | if (boot_cpu_data.x86 == 0xf && | 156 | if (boot_cpu_data.x86 == 0xf && |
| 157 | (mc1_status & (1ULL << 58))) | 157 | (m->status & BIT(58))) |
| 158 | pr_cont(" during system linefill.\n"); | 158 | pr_cont(" during system linefill.\n"); |
| 159 | else | 159 | else |
| 160 | pr_cont(" during attempted NB data read.\n"); | 160 | pr_cont(" during attempted NB data read.\n"); |
| @@ -197,10 +197,10 @@ wrong_ic_mce: | |||
| 197 | pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); | 197 | pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); |
| 198 | } | 198 | } |
| 199 | 199 | ||
| 200 | static void amd_decode_bu_mce(u64 mc2_status) | 200 | static void amd_decode_bu_mce(struct mce *m) |
| 201 | { | 201 | { |
| 202 | u32 ec = mc2_status & 0xffff; | 202 | u32 ec = m->status & 0xffff; |
| 203 | u32 xec = (mc2_status >> 16) & 0xf; | 203 | u32 xec = (m->status >> 16) & 0xf; |
| 204 | 204 | ||
| 205 | pr_emerg(HW_ERR "Bus Unit Error"); | 205 | pr_emerg(HW_ERR "Bus Unit Error"); |
| 206 | 206 | ||
| @@ -239,10 +239,10 @@ wrong_bu_mce: | |||
| 239 | pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); | 239 | pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); |
| 240 | } | 240 | } |
| 241 | 241 | ||
| 242 | static void amd_decode_ls_mce(u64 mc3_status) | 242 | static void amd_decode_ls_mce(struct mce *m) |
| 243 | { | 243 | { |
| 244 | u32 ec = mc3_status & 0xffff; | 244 | u32 ec = m->status & 0xffff; |
| 245 | u32 xec = (mc3_status >> 16) & 0xf; | 245 | u32 xec = (m->status >> 16) & 0xf; |
| 246 | 246 | ||
| 247 | pr_emerg(HW_ERR "Load Store Error"); | 247 | pr_emerg(HW_ERR "Load Store Error"); |
| 248 | 248 | ||
| @@ -260,9 +260,11 @@ wrong_ls_mce: | |||
| 260 | pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); | 260 | pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); |
| 261 | } | 261 | } |
| 262 | 262 | ||
| 263 | void amd_decode_nb_mce(int node_id, struct err_regs *regs) | 263 | void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) |
| 264 | { | 264 | { |
| 265 | u32 ec = ERROR_CODE(regs->nbsl); | 265 | u32 ec = m->status & 0xffff; |
| 266 | u32 nbsh = (u32)(m->status >> 32); | ||
| 267 | u32 nbsl = (u32)m->status; | ||
| 266 | 268 | ||
| 267 | /* | 269 | /* |
| 268 | * GART TLB error reporting is disabled by default. Bail out early. | 270 | * GART TLB error reporting is disabled by default. Bail out early. |
| @@ -278,10 +280,10 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs) | |||
| 278 | */ | 280 | */ |
| 279 | if ((boot_cpu_data.x86 == 0x10) && | 281 | if ((boot_cpu_data.x86 == 0x10) && |
| 280 | (boot_cpu_data.x86_model > 7)) { | 282 | (boot_cpu_data.x86_model > 7)) { |
| 281 | if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) | 283 | if (nbsh & K8_NBSH_ERR_CPU_VAL) |
| 282 | pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); | 284 | pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); |
| 283 | } else { | 285 | } else { |
| 284 | u8 assoc_cpus = regs->nbsh & 0xf; | 286 | u8 assoc_cpus = nbsh & 0xf; |
| 285 | 287 | ||
| 286 | if (assoc_cpus > 0) | 288 | if (assoc_cpus > 0) |
| 287 | pr_cont(", core: %d", fls(assoc_cpus) - 1); | 289 | pr_cont(", core: %d", fls(assoc_cpus) - 1); |
| @@ -289,17 +291,17 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs) | |||
| 289 | pr_cont("\n"); | 291 | pr_cont("\n"); |
| 290 | } | 292 | } |
| 291 | 293 | ||
| 292 | pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(regs->nbsl)); | 294 | pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); |
| 293 | 295 | ||
| 294 | if (BUS_ERROR(ec) && nb_bus_decoder) | 296 | if (BUS_ERROR(ec) && nb_bus_decoder) |
| 295 | nb_bus_decoder(node_id, regs); | 297 | nb_bus_decoder(node_id, m, nbcfg); |
| 296 | } | 298 | } |
| 297 | EXPORT_SYMBOL_GPL(amd_decode_nb_mce); | 299 | EXPORT_SYMBOL_GPL(amd_decode_nb_mce); |
| 298 | 300 | ||
| 299 | static void amd_decode_fr_mce(u64 mc5_status) | 301 | static void amd_decode_fr_mce(struct mce *m) |
| 300 | { | 302 | { |
| 301 | /* we have only one error signature so match all fields at once. */ | 303 | /* we have only one error signature so match all fields at once. */ |
| 302 | if ((mc5_status & 0xffff) == 0x0f0f) | 304 | if ((m->status & 0xffff) == 0x0f0f) |
| 303 | pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); | 305 | pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); |
| 304 | else | 306 | else |
| 305 | pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); | 307 | pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); |
| @@ -326,7 +328,6 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, | |||
| 326 | void *data) | 328 | void *data) |
| 327 | { | 329 | { |
| 328 | struct mce *m = (struct mce *)data; | 330 | struct mce *m = (struct mce *)data; |
| 329 | struct err_regs regs; | ||
| 330 | int node, ecc; | 331 | int node, ecc; |
| 331 | 332 | ||
| 332 | pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); | 333 | pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); |
| @@ -346,33 +347,28 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, | |||
| 346 | 347 | ||
| 347 | switch (m->bank) { | 348 | switch (m->bank) { |
| 348 | case 0: | 349 | case 0: |
| 349 | amd_decode_dc_mce(m->status); | 350 | amd_decode_dc_mce(m); |
| 350 | break; | 351 | break; |
| 351 | 352 | ||
| 352 | case 1: | 353 | case 1: |
| 353 | amd_decode_ic_mce(m->status); | 354 | amd_decode_ic_mce(m); |
| 354 | break; | 355 | break; |
| 355 | 356 | ||
| 356 | case 2: | 357 | case 2: |
| 357 | amd_decode_bu_mce(m->status); | 358 | amd_decode_bu_mce(m); |
| 358 | break; | 359 | break; |
| 359 | 360 | ||
| 360 | case 3: | 361 | case 3: |
| 361 | amd_decode_ls_mce(m->status); | 362 | amd_decode_ls_mce(m); |
| 362 | break; | 363 | break; |
| 363 | 364 | ||
| 364 | case 4: | 365 | case 4: |
| 365 | regs.nbsl = (u32) m->status; | 366 | node = amd_get_nb_id(m->extcpu); |
| 366 | regs.nbsh = (u32)(m->status >> 32); | 367 | amd_decode_nb_mce(node, m, 0); |
| 367 | regs.nbeal = (u32) m->addr; | ||
| 368 | regs.nbeah = (u32)(m->addr >> 32); | ||
| 369 | node = amd_get_nb_id(m->extcpu); | ||
| 370 | |||
| 371 | amd_decode_nb_mce(node, &regs); | ||
| 372 | break; | 368 | break; |
| 373 | 369 | ||
| 374 | case 5: | 370 | case 5: |
| 375 | amd_decode_fr_mce(m->status); | 371 | amd_decode_fr_mce(m); |
| 376 | break; | 372 | break; |
| 377 | 373 | ||
| 378 | default: | 374 | default: |
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 2ee499d7f898..0fba0e76c25f 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h | |||
| @@ -63,8 +63,8 @@ struct err_regs { | |||
| 63 | 63 | ||
| 64 | 64 | ||
| 65 | void amd_report_gart_errors(bool); | 65 | void amd_report_gart_errors(bool); |
| 66 | void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); | 66 | void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); |
| 67 | void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); | 67 | void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); |
| 68 | void amd_decode_nb_mce(int, struct err_regs *); | 68 | void amd_decode_nb_mce(int, struct mce *, u32); |
| 69 | 69 | ||
| 70 | #endif /* _EDAC_MCE_AMD_H */ | 70 | #endif /* _EDAC_MCE_AMD_H */ |
