diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-10-01 10:14:32 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-10-02 09:42:18 -0400 |
commit | f436f8bb73138bc74eb1c6527723e00988ad8a8a (patch) | |
tree | 834bd21c6d8daff79328582e429146417c262270 | |
parent | 24e35800cdc4350fc34e2bed37b608a9e13ab3b6 (diff) |
x86: EDAC: MCE: Fix MCE decoding callback logic
Make decoding of MCEs happen only on AMD hardware by registering a
non-default callback only on CPU families which support it.
While looking at the interaction of decode_mce() with the other MCE
code i also noticed a few other things and made the following
cleanups/fixes:
- Fixed the mce_decode() weak alias - a weak alias is really not
good here, it should be a proper callback. A weak alias will be
overriden if a piece of code is built into the kernel - not
good, obviously.
- The patch initializes the callback on AMD family 10h and 11h.
- Added the more correct fallback printk of:
No support for human readable MCE decoding on this CPU type.
Transcribe the message and run it through 'mcelog --ascii' to decode.
On CPUs that dont have a decoder.
- Made the surrounding code more readable.
Note that the callback allows us to have a default fallback -
without having to check the CPU versions during the printout
itself. When an EDAC module registers itself, it can install the
decode-print function.
(there's no unregister needed as this is core code.)
version -v2 by Borislav Petkov:
- add K8 to the set of supported CPUs
- always build in edac_mce_amd since we use an early_initcall now
- fix checkpatch warnings
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
LKML-Reference: <20091001141432.GA11410@aftab>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | arch/x86/include/asm/mce.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 58 | ||||
-rw-r--r-- | drivers/edac/Makefile | 2 | ||||
-rw-r--r-- | drivers/edac/edac_mce_amd.c | 15 |
4 files changed, 53 insertions, 24 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b608a64c581..f1363b72364 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -133,6 +133,8 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} | |||
133 | static inline void enable_p5_mce(void) {} | 133 | static inline void enable_p5_mce(void) {} |
134 | #endif | 134 | #endif |
135 | 135 | ||
136 | extern void (*x86_mce_decode_callback)(struct mce *m); | ||
137 | |||
136 | void mce_setup(struct mce *m); | 138 | void mce_setup(struct mce *m); |
137 | void mce_log(struct mce *m); | 139 | void mce_log(struct mce *m); |
138 | DECLARE_PER_CPU(struct sys_device, mce_dev); | 140 | DECLARE_PER_CPU(struct sys_device, mce_dev); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 183c3457d2f..b1598a9436d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -85,6 +85,18 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | |||
85 | static DEFINE_PER_CPU(struct mce, mces_seen); | 85 | static DEFINE_PER_CPU(struct mce, mces_seen); |
86 | static int cpu_missing; | 86 | static int cpu_missing; |
87 | 87 | ||
88 | static void default_decode_mce(struct mce *m) | ||
89 | { | ||
90 | pr_emerg("No human readable MCE decoding support on this CPU type.\n"); | ||
91 | pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * CPU/chipset specific EDAC code can register a callback here to print | ||
96 | * MCE errors in a human-readable form: | ||
97 | */ | ||
98 | void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; | ||
99 | EXPORT_SYMBOL(x86_mce_decode_callback); | ||
88 | 100 | ||
89 | /* MCA banks polled by the period polling timer for corrected events */ | 101 | /* MCA banks polled by the period polling timer for corrected events */ |
90 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | 102 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { |
@@ -165,46 +177,46 @@ void mce_log(struct mce *mce) | |||
165 | set_bit(0, &mce_need_notify); | 177 | set_bit(0, &mce_need_notify); |
166 | } | 178 | } |
167 | 179 | ||
168 | void __weak decode_mce(struct mce *m) | ||
169 | { | ||
170 | return; | ||
171 | } | ||
172 | |||
173 | static void print_mce(struct mce *m) | 180 | static void print_mce(struct mce *m) |
174 | { | 181 | { |
175 | printk(KERN_EMERG | 182 | pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", |
176 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
177 | m->extcpu, m->mcgstatus, m->bank, m->status); | 183 | m->extcpu, m->mcgstatus, m->bank, m->status); |
184 | |||
178 | if (m->ip) { | 185 | if (m->ip) { |
179 | printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", | 186 | pr_emerg("RIP%s %02x:<%016Lx> ", |
180 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | 187 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", |
181 | m->cs, m->ip); | 188 | m->cs, m->ip); |
189 | |||
182 | if (m->cs == __KERNEL_CS) | 190 | if (m->cs == __KERNEL_CS) |
183 | print_symbol("{%s}", m->ip); | 191 | print_symbol("{%s}", m->ip); |
184 | printk(KERN_CONT "\n"); | 192 | pr_cont("\n"); |
185 | } | 193 | } |
186 | printk(KERN_EMERG "TSC %llx ", m->tsc); | 194 | |
195 | pr_emerg("TSC %llx ", m->tsc); | ||
187 | if (m->addr) | 196 | if (m->addr) |
188 | printk(KERN_CONT "ADDR %llx ", m->addr); | 197 | pr_cont("ADDR %llx ", m->addr); |
189 | if (m->misc) | 198 | if (m->misc) |
190 | printk(KERN_CONT "MISC %llx ", m->misc); | 199 | pr_cont("MISC %llx ", m->misc); |
191 | printk(KERN_CONT "\n"); | 200 | |
192 | printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | 201 | pr_cont("\n"); |
193 | m->cpuvendor, m->cpuid, m->time, m->socketid, | 202 | pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", |
194 | m->apicid); | 203 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); |
195 | 204 | ||
196 | decode_mce(m); | 205 | /* |
206 | * Print out human-readable details about the MCE error, | ||
207 | * (if the CPU has an implementation for that): | ||
208 | */ | ||
209 | x86_mce_decode_callback(m); | ||
197 | } | 210 | } |
198 | 211 | ||
199 | static void print_mce_head(void) | 212 | static void print_mce_head(void) |
200 | { | 213 | { |
201 | printk(KERN_EMERG "\nHARDWARE ERROR\n"); | 214 | pr_emerg("\nHARDWARE ERROR\n"); |
202 | } | 215 | } |
203 | 216 | ||
204 | static void print_mce_tail(void) | 217 | static void print_mce_tail(void) |
205 | { | 218 | { |
206 | printk(KERN_EMERG "This is not a software problem!\n" | 219 | pr_emerg("This is not a software problem!\n"); |
207 | "Run through mcelog --ascii to decode and contact your hardware vendor\n"); | ||
208 | } | 220 | } |
209 | 221 | ||
210 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | 222 | #define PANIC_TIMEOUT 5 /* 5 seconds */ |
@@ -218,6 +230,7 @@ static atomic_t mce_fake_paniced; | |||
218 | static void wait_for_panic(void) | 230 | static void wait_for_panic(void) |
219 | { | 231 | { |
220 | long timeout = PANIC_TIMEOUT*USEC_PER_SEC; | 232 | long timeout = PANIC_TIMEOUT*USEC_PER_SEC; |
233 | |||
221 | preempt_disable(); | 234 | preempt_disable(); |
222 | local_irq_enable(); | 235 | local_irq_enable(); |
223 | while (timeout-- > 0) | 236 | while (timeout-- > 0) |
@@ -285,6 +298,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
285 | static int msr_to_offset(u32 msr) | 298 | static int msr_to_offset(u32 msr) |
286 | { | 299 | { |
287 | unsigned bank = __get_cpu_var(injectm.bank); | 300 | unsigned bank = __get_cpu_var(injectm.bank); |
301 | |||
288 | if (msr == rip_msr) | 302 | if (msr == rip_msr) |
289 | return offsetof(struct mce, ip); | 303 | return offsetof(struct mce, ip); |
290 | if (msr == MSR_IA32_MCx_STATUS(bank)) | 304 | if (msr == MSR_IA32_MCx_STATUS(bank)) |
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 7a473bbe8ab..8701cd7ce4e 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile | |||
@@ -18,7 +18,7 @@ edac_core-objs += edac_pci.o edac_pci_sysfs.o | |||
18 | endif | 18 | endif |
19 | 19 | ||
20 | ifdef CONFIG_CPU_SUP_AMD | 20 | ifdef CONFIG_CPU_SUP_AMD |
21 | edac_core-objs += edac_mce_amd.o | 21 | obj-$(CONFIG_X86_MCE) += edac_mce_amd.o |
22 | endif | 22 | endif |
23 | 23 | ||
24 | obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o | 24 | obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o |
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 0c21c370c9d..83a01a1187d 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c | |||
@@ -362,7 +362,7 @@ static inline void amd_decode_err_code(unsigned int ec) | |||
362 | pr_warning("Huh? Unknown MCE error 0x%x\n", ec); | 362 | pr_warning("Huh? Unknown MCE error 0x%x\n", ec); |
363 | } | 363 | } |
364 | 364 | ||
365 | void decode_mce(struct mce *m) | 365 | static void amd_decode_mce(struct mce *m) |
366 | { | 366 | { |
367 | struct err_regs regs; | 367 | struct err_regs regs; |
368 | int node, ecc; | 368 | int node, ecc; |
@@ -420,3 +420,16 @@ void decode_mce(struct mce *m) | |||
420 | 420 | ||
421 | amd_decode_err_code(m->status & 0xffff); | 421 | amd_decode_err_code(m->status & 0xffff); |
422 | } | 422 | } |
423 | |||
424 | static int __init mce_amd_init(void) | ||
425 | { | ||
426 | /* | ||
427 | * We can decode MCEs for Opteron and later CPUs: | ||
428 | */ | ||
429 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && | ||
430 | (boot_cpu_data.x86 >= 0xf)) | ||
431 | x86_mce_decode_callback = amd_decode_mce; | ||
432 | |||
433 | return 0; | ||
434 | } | ||
435 | early_initcall(mce_amd_init); | ||