aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tony Luck <tony.luck@intel.com> 2012-07-19 14:28:46 -0400
committer Ingo Molnar <mingo@kernel.org> 2012-07-26 09:05:47 -0400
commit 61b0fccd7f114573f973dfe25d864608822dc09e (patch)
tree 8ba7f8a8d984cba1ae1c1a528e96309e6a73cd2a
parent 736edce5f395b8309a61aa62c36c4356abc83219 (diff)
x86/mce: Add quirk for instruction recovery on Sandy Bridge processors
Sandy Bridge processors follow the SDM (Vol 3B, Table 15-20) and set both the RIPV and EIPV bits in the MCG_STATUS register to zero for machine checks during instruction fetch. This is more than a little counter-intuitive and means that Linux cannot recover from these errors. Rather than insert special case code at several places in mce.c and mce-severity.c, we pretend the EIPV bit was set for just this case early in processing the machine check.

Acked-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Link: http://lkml.kernel.org/r/180a06f3f357cf9f78259ae443a082b14a29535b.1343078495.git.tony.luck@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 43
1 files changed, 40 insertions, 3 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a5a5dc1ff15..8cf60e29790a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -105,6 +105,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
105 105
106static DEFINE_PER_CPU(struct work_struct, mce_work); 106static DEFINE_PER_CPU(struct work_struct, mce_work);
107 107
108static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
109
108/* 110/*
109 * CPU/chipset specific EDAC code can register a notifier call here to print 111 * CPU/chipset specific EDAC code can register a notifier call here to print
110 * MCE errors in a human-readable form. 112 * MCE errors in a human-readable form.
@@ -652,14 +654,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
652 * Do a quick check if any of the events requires a panic. 654 * Do a quick check if any of the events requires a panic.
653 * This decides if we keep the events around or clear them. 655 * This decides if we keep the events around or clear them.
654 */ 656 */
655static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp) 657static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
658 struct pt_regs *regs)
656{ 659{
657 int i, ret = 0; 660 int i, ret = 0;
658 661
659 for (i = 0; i < banks; i++) { 662 for (i = 0; i < banks; i++) {
660 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 663 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
661 if (m->status & MCI_STATUS_VAL) 664 if (m->status & MCI_STATUS_VAL) {
662 __set_bit(i, validp); 665 __set_bit(i, validp);
666 if (quirk_no_way_out)
667 quirk_no_way_out(i, m, regs);
668 }
663 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 669 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
664 ret = 1; 670 ret = 1;
665 } 671 }
@@ -1042,7 +1048,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1042 *final = m; 1048 *final = m;
1043 1049
1044 memset(valid_banks, 0, sizeof(valid_banks)); 1050 memset(valid_banks, 0, sizeof(valid_banks));
1045 no_way_out = mce_no_way_out(&m, &msg, valid_banks); 1051 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1046 1052
1047 barrier(); 1053 barrier();
1048 1054
@@ -1418,6 +1424,34 @@ static void __mcheck_cpu_init_generic(void)
1418 } 1424 }
1419} 1425}
1420 1426
1427/*
1428 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1429 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1430 * Vol 3B Table 15-20). But this confuses both the code that determines
1431 * whether the machine check occurred in kernel or user mode, and also
1432 * the severity assessment code. Pretend that EIPV was set, and take the
1433 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1434 */
1435static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1436{
1437 if (bank != 0)
1438 return; /* only bank 0 is eligible for this fixup */
1439 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1440 return; /* RIPV or EIPV is already set, so there is nothing to pretend */
1441 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1442 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1443 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1444 MCACOD)) !=
1445 (MCI_STATUS_UC|MCI_STATUS_EN|
1446 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1447 MCI_STATUS_AR|MCACOD_INSTR))
1448 return; /* need UC, EN, MISCV, ADDRV, S and AR set, OVER and PCC clear, and the bits selected by MCACOD equal to MCACOD_INSTR */
1449
1450 m->mcgstatus |= MCG_STATUS_EIPV; /* sets EIPV in the m->mcgstatus field only; no MSR is written here */
1451 m->ip = regs->ip; /* ip and cs are copied from the supplied pt_regs */
1452 m->cs = regs->cs;
1453}
1454
1421/* Add per CPU specific workarounds here */ 1455/* Add per CPU specific workarounds here */
1422static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1456static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1423{ 1457{
@@ -1515,6 +1549,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1515 */ 1549 */
1516 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1550 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1517 mce_bootlog = 0; 1551 mce_bootlog = 0;
1552
1553 if (c->x86 == 6 && c->x86_model == 45)
1554 quirk_no_way_out = quirk_sandybridge_ifu;
1518 } 1555 }
1519 if (monarch_timeout < 0) 1556 if (monarch_timeout < 0)
1520 monarch_timeout = 0; 1557 monarch_timeout = 0;