aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu/mcheck/mce.c
diff options
context:
space:
mode:
author Andi Kleen <andi@firstfloor.org> 2009-05-27 15:56:57 -0400
committer H. Peter Anvin <hpa@zytor.com> 2009-06-03 17:45:34 -0400
commit ed7290d0ee8f81aa78bfe816f01b012f208cafc5 (patch)
tree c73c44c14ff9f43147422df00dcef830cd952530 /arch/x86/kernel/cpu/mcheck/mce.c
parent 86503560e48153aba539ff117450d31ab2ef76d7 (diff)
x86, mce: implement new status bits
The x86 architecture recently added some new machine check status bits: S(ignalled) and AR (Action-Required). Signalled allows checking whether a specific event caused an exception or was just logged through CMCI. AR allows the kernel to decide whether an event needs immediate action or can be delayed or ignored. Implement support for these new status bits. mce_severity() uses the new bits to grade the machine check correctly and decide what to do. The exception handler uses AR to decide whether to kill or not. The S bit is used to separate events between the poll/CMCI handler and the exception handler. Classical UC always leads to panic. That was true before anyway, because the existing CPUs always passed a PCC with it. Also correct the rules for whether to kill in user or kernel context and how to handle a missing RIPV. The machine check handler now largely uses the mce-severity grading engine instead of making its own decisions. This means the logic is centralized in one place, which is useful because it has to be evaluated multiple times. v2: Some rule fixes; add AO events; fix RIPV, RIPV|EIPV order (Ying Huang); fix UCNA with AR=1 message (Ying Huang); add comment about panicking in m_c_p (machine_check_poll). Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 84
1 files changed, 44 insertions, 40 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff9c732989de..f051a7807ab4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -83,6 +83,7 @@ static int rip_msr;
83static int mce_bootlog = -1; 83static int mce_bootlog = -1;
84static int monarch_timeout = -1; 84static int monarch_timeout = -1;
85static int mce_panic_timeout; 85static int mce_panic_timeout;
86int mce_ser;
86 87
87static char trigger[128]; 88static char trigger[128];
88static char *trigger_argv[2] = { trigger, NULL }; 89static char *trigger_argv[2] = { trigger, NULL };
@@ -391,6 +392,15 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
391 * Those are just logged through /dev/mcelog. 392 * Those are just logged through /dev/mcelog.
392 * 393 *
393 * This is executed in standard interrupt context. 394 * This is executed in standard interrupt context.
395 *
396 * Note: the spec recommends panicking for fatal unsignalled
397 * errors here. However this would be quite problematic --
398 * we would need to reimplement the Monarch handling and
399 * it would mess up the exclusion between exception handler
400 * and poll handler -- so we skip this for now.
401 * These cases should not happen anyway, or only when the CPU
402 * is already totally confused. In this case it's likely it will
403 * not fully execute the machine check handler either.
394 */ 404 */
395void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 405void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
396{ 406{
@@ -417,13 +427,13 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
417 continue; 427 continue;
418 428
419 /* 429 /*
420 * Uncorrected events are handled by the exception handler 430 * Uncorrected or signalled events are handled by the exception
421 * when it is enabled. But when the exception is disabled log 431 * handler when it is enabled, so don't process those here.
422 * everything.
423 * 432 *
424 * TBD do the same check for MCI_STATUS_EN here? 433 * TBD do the same check for MCI_STATUS_EN here?
425 */ 434 */
426 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) 435 if (!(flags & MCP_UC) &&
436 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
427 continue; 437 continue;
428 438
429 if (m.status & MCI_STATUS_MISCV) 439 if (m.status & MCI_STATUS_MISCV)
@@ -790,6 +800,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
790 barrier(); 800 barrier();
791 801
792 /* 802 /*
803 * When there is no restart IP we must always kill or panic.
804 */
805 if (!(m.mcgstatus & MCG_STATUS_RIPV))
806 kill_it = 1;
807
808 /*
793 * Go through all the banks in exclusion of the other CPUs. 809 * Go through all the banks in exclusion of the other CPUs.
794 * This way we don't report duplicated events on shared banks 810 * This way we don't report duplicated events on shared banks
795 * because the first one to see it will clear it. 811 * because the first one to see it will clear it.
@@ -809,10 +825,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
809 continue; 825 continue;
810 826
811 /* 827 /*
812 * Non uncorrected errors are handled by machine_check_poll 828 * Non uncorrected or non signaled errors are handled by
813 * Leave them alone, unless this panics. 829 * machine_check_poll. Leave them alone, unless this panics.
814 */ 830 */
815 if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) 831 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
832 !no_way_out)
816 continue; 833 continue;
817 834
818 /* 835 /*
@@ -820,17 +837,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
820 */ 837 */
821 add_taint(TAINT_MACHINE_CHECK); 838 add_taint(TAINT_MACHINE_CHECK);
822 839
823 __set_bit(i, toclear); 840 severity = mce_severity(&m, tolerant, NULL);
824 841
825 if (m.status & MCI_STATUS_EN) { 842 /*
826 /* 843 * When machine check was for corrected handler don't touch,
827 * If this error was uncorrectable and there was 844 * unless we're panicking.
828 * an overflow, we're in trouble. If no overflow, 845 */
829 * we might get away with just killing a task. 846 if (severity == MCE_KEEP_SEVERITY && !no_way_out)
830 */ 847 continue;
831 if (m.status & MCI_STATUS_UC) 848 __set_bit(i, toclear);
832 kill_it = 1; 849 if (severity == MCE_NO_SEVERITY) {
833 } else {
834 /* 850 /*
835 * Machine check event was not enabled. Clear, but 851 * Machine check event was not enabled. Clear, but
836 * ignore. 852 * ignore.
@@ -838,6 +854,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
838 continue; 854 continue;
839 } 855 }
840 856
857 /*
858 * Kill on action required.
859 */
860 if (severity == MCE_AR_SEVERITY)
861 kill_it = 1;
862
841 if (m.status & MCI_STATUS_MISCV) 863 if (m.status & MCI_STATUS_MISCV)
842 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 864 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
843 if (m.status & MCI_STATUS_ADDRV) 865 if (m.status & MCI_STATUS_ADDRV)
@@ -846,7 +868,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
846 mce_get_rip(&m, regs); 868 mce_get_rip(&m, regs);
847 mce_log(&m); 869 mce_log(&m);
848 870
849 severity = mce_severity(&m, tolerant, NULL);
850 if (severity > worst) { 871 if (severity > worst) {
851 *final = m; 872 *final = m;
852 worst = severity; 873 worst = severity;
@@ -879,29 +900,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
879 * one task, do that. If the user has set the tolerance very 900 * one task, do that. If the user has set the tolerance very
880 * high, don't try to do anything at all. 901 * high, don't try to do anything at all.
881 */ 902 */
882 if (kill_it && tolerant < 3) {
883 int user_space = 0;
884
885 /*
886 * If the EIPV bit is set, it means the saved IP is the
887 * instruction which caused the MCE.
888 */
889 if (m.mcgstatus & MCG_STATUS_EIPV)
890 user_space = final->ip && (final->cs & 3);
891 903
892 /* 904 if (kill_it && tolerant < 3)
893 * If we know that the error was in user space, send a 905 force_sig(SIGBUS, current);
894 * SIGBUS. Otherwise, panic if tolerance is low.
895 *
896 * force_sig() takes an awful lot of locks and has a slight
897 * risk of deadlocking.
898 */
899 if (user_space) {
900 force_sig(SIGBUS, current);
901 } else if (panic_on_oops || tolerant < 2) {
902 mce_panic("Uncorrected machine check", final, msg);
903 }
904 }
905 906
906 /* notify userspace ASAP */ 907 /* notify userspace ASAP */
907 set_thread_flag(TIF_MCE_NOTIFY); 908 set_thread_flag(TIF_MCE_NOTIFY);
@@ -1049,6 +1050,9 @@ static int mce_cap_init(void)
1049 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1050 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1050 rip_msr = MSR_IA32_MCG_EIP; 1051 rip_msr = MSR_IA32_MCG_EIP;
1051 1052
1053 if (cap & MCG_SER_P)
1054 mce_ser = 1;
1055
1052 return 0; 1056 return 0;
1053} 1057}
1054 1058