x86/mce: Handle "action required" errors

All non-urgent actions (reporting low severity errors and handling "action-optional" errors) are now handled by a work queue. This means that TIF_MCE_NOTIFY can be used to block execution for a thread experiencing an "action-required" fault until we get all cpus out of the machine check handler (and the thread that hit the fault into mce_notify_process()). We use the new mce_{save,find,clear}_info() API to get information from do_machine_check() to mce_notify_process(), and then use the newly improved memory_failure(..., MF_ACTION_REQUIRED) to handle the error (possibly signalling the process). Update some comments to make the new code flows clearer. Signed-off-by: Tony Luck <tony.luck@intel.com>
author: Tony Luck <tony.luck@intel.com> 2012-01-03 14:45:45 -0500
committer: Tony Luck <tony.luck@intel.com> 2012-01-03 15:07:01 -0500
commit: a8c321fbf9aeced45519248e5901af8cbc240510 (patch)
tree: 74df0cc3cfc4d8f5e422384005d42e7330a08e9a /arch/x86/kernel
parent: af104e394e17e328df85c25a9e21448539725b67 (diff)
1 files changed, 53 insertions, 42 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e1579c5a71d..56e4e79387c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -982,7 +982,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
        barrier();
        /*
-         * When no restart IP must always kill or panic.
+         * When no restart IP might need to kill or panic.
+         * Assume the worst for now, but if we find the
+         * severity is MCE_AR_SEVERITY we have other options.
         */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                kill_it = 1;
@@ -1036,12 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                        continue;
                }
-                /*
-                 * Kill on action required.
-                 */
-                if (severity == MCE_AR_SEVERITY)
-                        kill_it = 1;
                mce_read_aux(&m, i);
                /*
@@ -1062,6 +1058,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                }
        }
+        /* mce_clear_state will clear *final, save locally for use later */
+        m = *final;
        if (!no_way_out)
                mce_clear_state(toclear);
@@ -1073,27 +1072,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                no_way_out = worst >= MCE_PANIC_SEVERITY;
        /*
-         * If we have decided that we just CAN'T continue, and the user
+         * At insane "tolerant" levels we take no action. Otherwise
-         * has not set tolerant to an insane level, give up and die.
+         * we only die if we have no other choice. For less serious
-         *
+         * issues we try to recover, or limit damage to the current
-         * This is mainly used in the case when the system doesn't
+         * process.
-         * support MCE broadcasting or it has been disabled.
         */
-        if (no_way_out && tolerant < 3)
+        if (tolerant < 3) {
-                mce_panic("Fatal machine check on current CPU", final, msg);
+                if (no_way_out)
+                        mce_panic("Fatal machine check on current CPU", &m, msg);
-        /*
+                if (worst == MCE_AR_SEVERITY) {
-         * If the error seems to be unrecoverable, something should be
+                        /* schedule action before return to userland */
-         * done.  Try to kill as little as possible.  If we can kill just
+                        mce_save_info(m.addr);
-         * one task, do that.  If the user has set the tolerance very
+                        set_thread_flag(TIF_MCE_NOTIFY);
-         * high, don't try to do anything at all.
+                } else if (kill_it) {
-         */
+                        force_sig(SIGBUS, current);
+                }
-        if (kill_it && tolerant < 3)
+        }
-                force_sig(SIGBUS, current);
-        /* notify userspace ASAP */
-        set_thread_flag(TIF_MCE_NOTIFY);
        if (worst > 0)
                mce_report_event(regs);
@@ -1107,6 +1101,8 @@ EXPORT_SYMBOL_GPL(do_machine_check);
 #ifndef CONFIG_MEMORY_FAILURE
 int memory_failure(unsigned long pfn, int vector, int flags)
 {
+        /* mce_severity() should not hand us an ACTION_REQUIRED error */
+        BUG_ON(flags & MF_ACTION_REQUIRED);
        printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
@@ -1115,27 +1111,44 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 #endif
 /*
- * Called after mce notification in process context. This code
+ * Called in process context that interrupted by MCE and marked with
- * is allowed to sleep. Call the high level VM handler to process
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * any corrupted pages.
+ * This code is allowed to sleep.
- * Assume that the work queue code only calls this one at a time
+ * Attempt possible recovery such as calling the high level VM handler to
- * per CPU.
+ * process any corrupted pages, and kill/signal current process if required.
- * Note we don't disable preemption, so this code might run on the wrong
+ * Action required errors are handled here.
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
 */
 void mce_notify_process(void)
 {
        unsigned long pfn;
-        mce_notify_irq();
+        struct mce_info *mi = mce_find_info();
-        while (mce_ring_get(&pfn))
-                memory_failure(pfn, MCE_VECTOR, 0);
+        if (!mi)
+                mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+        pfn = mi->paddr >> PAGE_SHIFT;
+        clear_thread_flag(TIF_MCE_NOTIFY);
+        pr_err("Uncorrected hardware memory error in user-access at %llx",
+                 mi->paddr);
+        if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+                pr_err("Memory error not recovered");
+                force_sig(SIGBUS, current);
+        }
+        mce_clear_info(mi);
 }
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-        mce_notify_process();
+        unsigned long pfn;
+        while (mce_ring_get(&pfn))
+                memory_failure(pfn, MCE_VECTOR, 0);
 }
 #ifdef CONFIG_X86_MCE_INTEL
@@ -1225,8 +1238,6 @@ int mce_notify_irq(void)
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-        clear_thread_flag(TIF_MCE_NOTIFY);
        if (test_and_clear_bit(0, &mce_need_notify)) {
                /* wake processes polling /dev/mcelog */
                wake_up_interruptible(&mce_chrdev_wait);
author	Tony Luck <tony.luck@intel.com>	2012-01-03 14:45:45 -0500
committer	Tony Luck <tony.luck@intel.com>	2012-01-03 15:07:01 -0500
commit	a8c321fbf9aeced45519248e5901af8cbc240510 (patch)
tree	74df0cc3cfc4d8f5e422384005d42e7330a08e9a /arch/x86/kernel
parent	af104e394e17e328df85c25a9e21448539725b67 (diff)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e1579c5a71d..56e4e79387c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -982,7 +982,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
982	barrier();	982	barrier();
983		983
984	/*	984	/*
985	* When no restart IP must always kill or panic.	985	* When no restart IP might need to kill or panic.
		986	* Assume the worst for now, but if we find the
		987	* severity is MCE_AR_SEVERITY we have other options.
986	*/	988	*/
987	if (!(m.mcgstatus & MCG_STATUS_RIPV))	989	if (!(m.mcgstatus & MCG_STATUS_RIPV))
988	kill_it = 1;	990	kill_it = 1;
@@ -1036,12 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1036	continue;	1038	continue;
1037	}	1039	}
1038		1040
1039	/*
1040	* Kill on action required.
1041	*/
1042	if (severity == MCE_AR_SEVERITY)
1043	kill_it = 1;
1044
1045	mce_read_aux(&m, i);	1041	mce_read_aux(&m, i);
1046		1042
1047	/*	1043	/*
@@ -1062,6 +1058,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1062	}	1058	}
1063	}	1059	}
1064		1060
		1061	/* mce_clear_state will clear *final, save locally for use later */
		1062	m = *final;
		1063
1065	if (!no_way_out)	1064	if (!no_way_out)
1066	mce_clear_state(toclear);	1065	mce_clear_state(toclear);
1067		1066
@@ -1073,27 +1072,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1073	no_way_out = worst >= MCE_PANIC_SEVERITY;	1072	no_way_out = worst >= MCE_PANIC_SEVERITY;
1074		1073
1075	/*	1074	/*
1076	* If we have decided that we just CAN'T continue, and the user	1075	* At insane "tolerant" levels we take no action. Otherwise
1077	* has not set tolerant to an insane level, give up and die.	1076	* we only die if we have no other choice. For less serious
1078	*	1077	* issues we try to recover, or limit damage to the current
1079	* This is mainly used in the case when the system doesn't	1078	* process.
1080	* support MCE broadcasting or it has been disabled.
1081	*/	1079	*/
1082	if (no_way_out && tolerant < 3)	1080	if (tolerant < 3) {
1083	mce_panic("Fatal machine check on current CPU", final, msg);	1081	if (no_way_out)
1084		1082	mce_panic("Fatal machine check on current CPU", &m, msg);
1085	/*	1083	if (worst == MCE_AR_SEVERITY) {
1086	* If the error seems to be unrecoverable, something should be	1084	/* schedule action before return to userland */
1087	* done. Try to kill as little as possible. If we can kill just	1085	mce_save_info(m.addr);
1088	* one task, do that. If the user has set the tolerance very	1086	set_thread_flag(TIF_MCE_NOTIFY);
1089	* high, don't try to do anything at all.	1087	} else if (kill_it) {
1090	*/	1088	force_sig(SIGBUS, current);
1091		1089	}
1092	if (kill_it && tolerant < 3)	1090	}
1093	force_sig(SIGBUS, current);
1094
1095	/* notify userspace ASAP */
1096	set_thread_flag(TIF_MCE_NOTIFY);
1097		1091
1098	if (worst > 0)	1092	if (worst > 0)
1099	mce_report_event(regs);	1093	mce_report_event(regs);
@@ -1107,6 +1101,8 @@ EXPORT_SYMBOL_GPL(do_machine_check);
1107	#ifndef CONFIG_MEMORY_FAILURE	1101	#ifndef CONFIG_MEMORY_FAILURE
1108	int memory_failure(unsigned long pfn, int vector, int flags)	1102	int memory_failure(unsigned long pfn, int vector, int flags)
1109	{	1103	{
		1104	/* mce_severity() should not hand us an ACTION_REQUIRED error */
		1105	BUG_ON(flags & MF_ACTION_REQUIRED);
1110	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"	1106	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
1111	"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);	1107	"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
1112		1108
@@ -1115,27 +1111,44 @@ int memory_failure(unsigned long pfn, int vector, int flags)
1115	#endif	1111	#endif
1116		1112
1117	/*	1113	/*
1118	* Called after mce notification in process context. This code	1114	* Called in process context that interrupted by MCE and marked with
1119	* is allowed to sleep. Call the high level VM handler to process	1115	* TIF_MCE_NOTIFY, just before returning to erroneous userland.
1120	* any corrupted pages.	1116	* This code is allowed to sleep.
1121	* Assume that the work queue code only calls this one at a time	1117	* Attempt possible recovery such as calling the high level VM handler to
1122	* per CPU.	1118	* process any corrupted pages, and kill/signal current process if required.
1123	* Note we don't disable preemption, so this code might run on the wrong	1119	* Action required errors are handled here.
1124	* CPU. In this case the event is picked up by the scheduled work queue.
1125	* This is merely a fast path to expedite processing in some common
1126	* cases.
1127	*/	1120	*/
1128	void mce_notify_process(void)	1121	void mce_notify_process(void)
1129	{	1122	{
1130	unsigned long pfn;	1123	unsigned long pfn;
1131	mce_notify_irq();	1124	struct mce_info *mi = mce_find_info();
1132	while (mce_ring_get(&pfn))	1125
1133	memory_failure(pfn, MCE_VECTOR, 0);	1126	if (!mi)
		1127	mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
		1128	pfn = mi->paddr >> PAGE_SHIFT;
		1129
		1130	clear_thread_flag(TIF_MCE_NOTIFY);
		1131
		1132	pr_err("Uncorrected hardware memory error in user-access at %llx",
		1133	mi->paddr);
		1134	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
		1135	pr_err("Memory error not recovered");
		1136	force_sig(SIGBUS, current);
		1137	}
		1138	mce_clear_info(mi);
1134	}	1139	}
1135		1140
		1141	/*
		1142	* Action optional processing happens here (picking up
		1143	* from the list of faulting pages that do_machine_check()
		1144	* placed into the "ring").
		1145	*/
1136	static void mce_process_work(struct work_struct *dummy)	1146	static void mce_process_work(struct work_struct *dummy)
1137	{	1147	{
1138	mce_notify_process();	1148	unsigned long pfn;
		1149
		1150	while (mce_ring_get(&pfn))
		1151	memory_failure(pfn, MCE_VECTOR, 0);
1139	}	1152	}
1140		1153
1141	#ifdef CONFIG_X86_MCE_INTEL	1154	#ifdef CONFIG_X86_MCE_INTEL
@@ -1225,8 +1238,6 @@ int mce_notify_irq(void)
1225	/* Not more than two messages every minute */	1238	/* Not more than two messages every minute */
1226	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);	1239	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1227		1240
1228	clear_thread_flag(TIF_MCE_NOTIFY);
1229
1230	if (test_and_clear_bit(0, &mce_need_notify)) {	1241	if (test_and_clear_bit(0, &mce_need_notify)) {
1231	/* wake processes polling /dev/mcelog */	1242	/* wake processes polling /dev/mcelog */
1232	wake_up_interruptible(&mce_chrdev_wait);	1243	wake_up_interruptible(&mce_chrdev_wait);