aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu/mcheck/mce.c
diff options
context:
space:
mode:
author: Andi Kleen <andi@firstfloor.org> 2009-05-27 15:56:55 -0400
committer: H. Peter Anvin <hpa@zytor.com> 2009-06-03 17:40:39 -0400
commit: bd19a5e6b73df276e1ccedf9059e9ee70c372d7d (patch)
tree: f5e10b6340cfa416efa7b1d0c82712d8fbb2c94b /arch/x86/kernel/cpu/mcheck/mce.c
parent: 817f32d02a52dd7f5941534e0699883691e918df (diff)
x86, mce: check early in exception handler if panic is needed
The exception handler should behave differently depending on whether the exception is fatal or one that can be returned from. In the fatal case it should never clear any registers, because these need to be preserved for logging after the next boot. In the recoverable case it should clear them on each CPU step by step, so that other CPUs sharing the same bank don't see duplicate events. Otherwise we risk reporting events multiple times on any CPUs which have shared machine check banks, which is a common problem on Intel Nehalem, which has both SMT (two CPU threads sharing banks) and shared machine check banks in the uncore.

Determine early, in a special pass, whether any event requires a panic. This uses the mce_severity() function added earlier.

This is needed for the next patch.

Together with an earlier patch, this also fixes a problem where corrected events weren't logged on a fatal MCE.

[ Impact: Feature ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 38
1 files changed, 25 insertions, 13 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6773610061d..5031814ac94 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
36#include <asm/mce.h> 36#include <asm/mce.h>
37#include <asm/msr.h> 37#include <asm/msr.h>
38 38
39#include "mce-internal.h"
39#include "mce.h" 40#include "mce.h"
40 41
41/* Handle unconfigured int18 (should never happen) */ 42/* Handle unconfigured int18 (should never happen) */
@@ -191,7 +192,7 @@ static void print_mce(struct mce *m)
191 "and contact your hardware vendor\n"); 192 "and contact your hardware vendor\n");
192} 193}
193 194
194static void mce_panic(char *msg, struct mce *final) 195static void mce_panic(char *msg, struct mce *final, char *exp)
195{ 196{
196 int i; 197 int i;
197 198
@@ -214,6 +215,8 @@ static void mce_panic(char *msg, struct mce *final)
214 } 215 }
215 if (final) 216 if (final)
216 print_mce(final); 217 print_mce(final);
218 if (exp)
219 printk(KERN_EMERG "Machine check: %s\n", exp);
217 panic(msg); 220 panic(msg);
218} 221}
219 222
@@ -358,6 +361,22 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
358EXPORT_SYMBOL_GPL(machine_check_poll); 361EXPORT_SYMBOL_GPL(machine_check_poll);
359 362
360/* 363/*
364 * Do a quick check if any of the events requires a panic.
365 * This decides if we keep the events around or clear them.
366 */
367static int mce_no_way_out(struct mce *m, char **msg)
368{
369 int i;
370
371 for (i = 0; i < banks; i++) {
372 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
373 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
374 return 1;
375 }
376 return 0;
377}
378
379/*
361 * The actual machine check handler. This only handles real 380 * The actual machine check handler. This only handles real
362 * exceptions when something got corrupted coming in through int 18. 381 * exceptions when something got corrupted coming in through int 18.
363 * 382 *
@@ -381,6 +400,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
381 */ 400 */
382 int kill_it = 0; 401 int kill_it = 0;
383 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 402 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
403 char *msg = "Unknown";
384 404
385 atomic_inc(&mce_entry); 405 atomic_inc(&mce_entry);
386 406
@@ -395,10 +415,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
395 mce_setup(&m); 415 mce_setup(&m);
396 416
397 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 417 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
398 418 no_way_out = mce_no_way_out(&m, &msg);
399 /* if the restart IP is not valid, we're done for */
400 if (!(m.mcgstatus & MCG_STATUS_RIPV))
401 no_way_out = 1;
402 419
403 barrier(); 420 barrier();
404 421
@@ -430,18 +447,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
430 __set_bit(i, toclear); 447 __set_bit(i, toclear);
431 448
432 if (m.status & MCI_STATUS_EN) { 449 if (m.status & MCI_STATUS_EN) {
433 /* if PCC was set, there's no way out */
434 no_way_out |= !!(m.status & MCI_STATUS_PCC);
435 /* 450 /*
436 * If this error was uncorrectable and there was 451 * If this error was uncorrectable and there was
437 * an overflow, we're in trouble. If no overflow, 452 * an overflow, we're in trouble. If no overflow,
438 * we might get away with just killing a task. 453 * we might get away with just killing a task.
439 */ 454 */
440 if (m.status & MCI_STATUS_UC) { 455 if (m.status & MCI_STATUS_UC)
441 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
442 no_way_out = 1;
443 kill_it = 1; 456 kill_it = 1;
444 }
445 } else { 457 } else {
446 /* 458 /*
447 * Machine check event was not enabled. Clear, but 459 * Machine check event was not enabled. Clear, but
@@ -483,7 +495,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
483 * has not set tolerant to an insane level, give up and die. 495 * has not set tolerant to an insane level, give up and die.
484 */ 496 */
485 if (no_way_out && tolerant < 3) 497 if (no_way_out && tolerant < 3)
486 mce_panic("Machine check", &panicm); 498 mce_panic("Machine check", &panicm, msg);
487 499
488 /* 500 /*
489 * If the error seems to be unrecoverable, something should be 501 * If the error seems to be unrecoverable, something should be
@@ -511,7 +523,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
511 if (user_space) { 523 if (user_space) {
512 force_sig(SIGBUS, current); 524 force_sig(SIGBUS, current);
513 } else if (panic_on_oops || tolerant < 2) { 525 } else if (panic_on_oops || tolerant < 2) {
514 mce_panic("Uncorrected machine check", &panicm); 526 mce_panic("Uncorrected machine check", &panicm, msg);
515 } 527 }
516 } 528 }
517 529