author     Andi Kleen <andi@firstfloor.org>    2009-05-27 15:56:59 -0400
committer  H. Peter Anvin <hpa@zytor.com>      2009-06-03 17:48:59 -0400
commit     9b1beaf2b551a8a1604f104025b24e9c535c8963 (patch)
tree       b335ca7e4744c6de875c6421a6131539094ae851 /arch/x86
parent     8fa8dd9e3aafb7b440b7d54219891615abc6390e (diff)
x86, mce: support action-optional machine checks
Newer Intel CPUs support a new class of machine checks called recoverable action optional. Action Optional means that the CPU detected some form of corruption in the background and tells the OS about it using a machine check exception. The OS can then take appropriate action, like killing the process with the corrupted data or logging the event properly to disk.

This is done by the new generic high level memory failure handler added in an earlier patch. The high level handler takes the address of the failed memory and does the appropriate action, like killing the process.

In this version of the patch the high level handler is stubbed out with a weak function to not create a direct dependency on the hwpoison branch.

The high level handler cannot be directly called from the machine check exception though, because it has to run in a defined process context to be able to sleep when taking VM locks (it is not expected to sleep for a long time, just do so in some exceptional cases like lock contention). Thus the MCE handler has to queue a work item for process context, trigger process context and then call the high level handler from there.

This patch adds two paths to process context: through a per thread kernel exit notify_user() callback or through a high priority work item. The first runs when the process exits back to user space, the other when it goes to sleep and there is no higher priority process. The machine check handler will schedule both, and whoever runs first will grab the event. This is done because quick reaction to this event is critical to avoid a potentially more fatal machine check when the corruption is consumed.

There is a simple lockless ring buffer to queue the corrupted addresses between the exception handler and the process context handler. In process context it then just calls the high level VM code with the corrupted PFNs.

The patch adds the required code to extract the failed address from the CPU's machine check registers. It doesn't try to handle all possible cases -- the specification has 6 different ways to specify memory address -- but only the linear address. Most of the required checking has already been done earlier in the mce_severity rule checking engine. Following the Intel recommendations, Action Optional errors are only enabled for known situations (encoded in MCACODs). The errors are ignored otherwise, because they are action optional.

v2: Improve comment, disable preemption while processing ring buffer (reported by Ying Huang)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
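[Editor's note] For readers unfamiliar with the single-reader/single-writer ring described above, here is a minimal, self-contained sketch of the technique in plain user-space C. The names (pfn_ring, ring_add, ring_get, RING_SIZE) are illustrative only; the actual per-CPU kernel implementation, with its wmb() and get_cpu()/put_cpu() handling, is in the mce.c hunk below.

#include <stdio.h>

#define RING_SIZE 16                    /* one slot is always left unused */

struct pfn_ring {
	unsigned short start;           /* consumer index */
	unsigned short end;             /* producer index */
	unsigned long ring[RING_SIZE];
};

/* Producer side (in the kernel: the MCE exception handler, preemption off). */
static int ring_add(struct pfn_ring *r, unsigned long pfn)
{
	unsigned next = (r->end + 1) % RING_SIZE;

	if (next == r->start)           /* full: the event is dropped */
		return -1;
	r->ring[r->end] = pfn;
	/* the kernel version issues wmb() here so the data store is
	   visible before the new end index publishes it */
	r->end = next;
	return 0;
}

/* Consumer side (in the kernel: the process-context work function). */
static int ring_get(struct pfn_ring *r, unsigned long *pfn)
{
	if (r->start == r->end)         /* empty */
		return 0;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % RING_SIZE;
	return 1;
}

int main(void)
{
	struct pfn_ring r = { 0, 0, { 0 } };
	unsigned long pfn;

	ring_add(&r, 0x1234);           /* exception context queues a PFN */
	while (ring_get(&r, &pfn))      /* process context drains the ring */
		printf("recovering pfn %#lx\n", pfn);
	return 0;
}

Because there is exactly one producer (the exception handler on a given CPU) and one consumer (the work function for that CPU), the ring needs no locks: each side only writes its own index.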
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/mce.h        |   1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c  | 133
-rw-r--r--  arch/x86/kernel/signal.c          |   2
3 files changed, 135 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 713926b62cbb..82978ad12072 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -160,6 +160,7 @@ enum mcp_flags {
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
+void mce_notify_process(void);
 
 DECLARE_PER_CPU(struct mce, injectm);
 extern struct file_operations mce_chrdev_ops;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 13e1b7ffe73a..d4e7b5947a0e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
 
 #include <asm/processor.h>
 #include <asm/hw_irq.h>
@@ -105,6 +106,8 @@ static inline int skip_bank_init(int i)
 	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
 }
 
+static DEFINE_PER_CPU(struct work_struct, mce_work);
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v)
 	wrmsrl(msr, v);
 }
 
+/*
+ * Simple lockless ring to communicate PFNs from the exception handler with the
+ * process context work function. This is vastly simplified because there's
+ * only a single reader and a single writer.
+ */
+#define MCE_RING_SIZE 16	/* we use one entry less */
+
+struct mce_ring {
+	unsigned short start;
+	unsigned short end;
+	unsigned long ring[MCE_RING_SIZE];
+};
+static DEFINE_PER_CPU(struct mce_ring, mce_ring);
+
+/* Runs with CPU affinity in workqueue */
+static int mce_ring_empty(void)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+
+	return r->start == r->end;
+}
+
+static int mce_ring_get(unsigned long *pfn)
+{
+	struct mce_ring *r;
+	int ret = 0;
+
+	*pfn = 0;
+	get_cpu();
+	r = &__get_cpu_var(mce_ring);
+	if (r->start == r->end)
+		goto out;
+	*pfn = r->ring[r->start];
+	r->start = (r->start + 1) % MCE_RING_SIZE;
+	ret = 1;
+out:
+	put_cpu();
+	return ret;
+}
+
+/* Always runs in MCE context with preempt off */
+static int mce_ring_add(unsigned long pfn)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+	unsigned next;
+
+	next = (r->end + 1) % MCE_RING_SIZE;
+	if (next == r->start)
+		return -1;
+	r->ring[r->end] = pfn;
+	wmb();
+	r->end = next;
+	return 0;
+}
+
 int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_disabled)
@@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c)
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+static void mce_schedule_work(void)
+{
+	if (!mce_ring_empty()) {
+		struct work_struct *work = &__get_cpu_var(mce_work);
+		if (!work_pending(work))
+			schedule_work(work);
+	}
+}
+
 /*
  * Get the address of the instruction at the time of the machine check
  * error.
@@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 	exit_idle();
 	irq_enter();
 	mce_notify_irq();
+	mce_schedule_work();
 	irq_exit();
 }
 #endif
@@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs)
 {
 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 		mce_notify_irq();
+		/*
+		 * Triggering the work queue here is just an insurance
+		 * policy in case the syscall exit notify handler
+		 * doesn't run soon enough or ends up running on the
+		 * wrong CPU (can happen when audit sleeps)
+		 */
+		mce_schedule_work();
 		return;
 	}
 
@@ -731,6 +806,23 @@ reset:
 	return ret;
 }
 
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+		return 0;
+	if ((m->misc & 0x3f) > PAGE_SHIFT)
+		return 0;
+	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+		return 0;
+	return 1;
+}
+
 static void mce_clear_state(unsigned long *toclear)
 {
 	int i;
@@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (m.status & MCI_STATUS_ADDRV)
 			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
 
+		/*
+		 * Action optional error. Queue address for later processing.
+		 * When the ring overflows we just ignore the AO error.
+		 * RED-PEN add some logging mechanism when
+		 * usable_address or mce_add_ring fails.
+		 * RED-PEN don't ignore overflow for tolerant == 0
+		 */
+		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
+			mce_ring_add(m.addr >> PAGE_SHIFT);
+
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
@@ -916,6 +1018,36 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
+/* dummy to break dependency. actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+	unsigned long pfn;
+	mce_notify_irq();
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+	mce_notify_process();
+}
+
 #ifdef CONFIG_X86_MCE_INTEL
 /***
  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
@@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 	mce_init();
 	mce_cpu_features(c);
 	mce_init_timer();
+	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
 }
 
 /*
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d5dc15bce005..4976888094f0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 #ifdef CONFIG_X86_NEW_MCE
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)
-		mce_notify_irq();
+		mce_notify_process();
 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
 	/* deal with pending signal delivery */