path: root/arch/x86/kernel/cpu/mcheck/mce.c
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce.c	133
1 file changed, 133 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 13e1b7ffe73a..d4e7b5947a0e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
 
 #include <asm/processor.h>
 #include <asm/hw_irq.h>
@@ -105,6 +106,8 @@ static inline int skip_bank_init(int i)
 	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
 }
 
+static DEFINE_PER_CPU(struct work_struct, mce_work);
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v)
 	wrmsrl(msr, v);
 }
 
+/*
+ * Simple lockless ring to communicate PFNs from the exception handler
+ * to the process context work function. This is vastly simplified
+ * because there's only a single reader and a single writer.
+ */
+#define MCE_RING_SIZE 16	/* we use one entry less */
+
+struct mce_ring {
+	unsigned short start;
+	unsigned short end;
+	unsigned long ring[MCE_RING_SIZE];
+};
+static DEFINE_PER_CPU(struct mce_ring, mce_ring);
+
+/* Runs with CPU affinity in workqueue */
+static int mce_ring_empty(void)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+
+	return r->start == r->end;
+}
+
+static int mce_ring_get(unsigned long *pfn)
+{
+	struct mce_ring *r;
+	int ret = 0;
+
+	*pfn = 0;
+	get_cpu();
+	r = &__get_cpu_var(mce_ring);
+	if (r->start == r->end)
+		goto out;
+	*pfn = r->ring[r->start];
+	r->start = (r->start + 1) % MCE_RING_SIZE;
+	ret = 1;
+out:
+	put_cpu();
+	return ret;
+}
+
+/* Always runs in MCE context with preempt off */
+static int mce_ring_add(unsigned long pfn)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+	unsigned next;
+
+	next = (r->end + 1) % MCE_RING_SIZE;
+	if (next == r->start)
+		return -1;
+	r->ring[r->end] = pfn;
+	wmb();
+	r->end = next;
+	return 0;
+}
+
 int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_disabled)
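
The ring above is a classic single-producer, single-consumer queue: one slot is intentionally left unused so that start == end unambiguously means empty, and the wmb() orders the PFN store before the index update so the reader never consumes an unwritten slot. A minimal userspace sketch of the same scheme, with all names illustrative and __sync_synchronize() standing in for the kernel's wmb():

#include <stdio.h>

#define RING_SIZE 16			/* one entry is always left unused */

struct pfn_ring {
	unsigned short start;		/* consumer index */
	unsigned short end;		/* producer index */
	unsigned long ring[RING_SIZE];
};

/* Producer: refuses to fill the last slot, so full != empty. */
static int ring_add(struct pfn_ring *r, unsigned long pfn)
{
	unsigned next = (r->end + 1) % RING_SIZE;

	if (next == r->start)		/* full */
		return -1;
	r->ring[r->end] = pfn;
	__sync_synchronize();		/* data visible before index moves */
	r->end = next;
	return 0;
}

/* Consumer: returns 1 and fills *pfn, or 0 when start meets end. */
static int ring_get(struct pfn_ring *r, unsigned long *pfn)
{
	if (r->start == r->end)
		return 0;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % RING_SIZE;
	return 1;
}

int main(void)
{
	struct pfn_ring r = { 0, 0, { 0 } };
	unsigned long pfn;

	ring_add(&r, 0x1234);
	ring_add(&r, 0x5678);
	while (ring_get(&r, &pfn))
		printf("pfn %#lx\n", pfn);
	return 0;
}
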
@@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c)
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+static void mce_schedule_work(void)
+{
+	if (!mce_ring_empty()) {
+		struct work_struct *work = &__get_cpu_var(mce_work);
+		if (!work_pending(work))
+			schedule_work(work);
+	}
+}
+
 /*
  * Get the address of the instruction at the time of the machine check
  * error.
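
mce_schedule_work() relies on work_pending() to avoid queuing the same per-CPU work item twice; the workqueue core clears the pending bit before invoking the callback, so a new machine check arriving during the drain can re-arm it. A toy single-threaded model of that at-most-once-queued behavior (all names hypothetical, not the kernel workqueue API):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of schedule_work()/work_pending(): a work item may be
 * queued at most once until it runs, mirroring mce_schedule_work(). */
struct work { bool pending; void (*fn)(void); };

static bool schedule_work_once(struct work *w)
{
	if (w->pending)			/* already queued: nothing to do */
		return false;
	w->pending = true;
	return true;
}

static void run_work(struct work *w)
{
	w->pending = false;		/* cleared before the callback runs */
	w->fn();
}

static void drain(void) { puts("draining mce ring"); }

int main(void)
{
	struct work w = { false, drain };

	printf("queued: %d\n", schedule_work_once(&w));	/* 1 */
	printf("queued: %d\n", schedule_work_once(&w));	/* 0: deduped */
	run_work(&w);
	printf("queued: %d\n", schedule_work_once(&w));	/* 1 again */
	return 0;
}
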
@@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 	exit_idle();
 	irq_enter();
 	mce_notify_irq();
+	mce_schedule_work();
 	irq_exit();
 }
 #endif
@@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs)
 {
 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 		mce_notify_irq();
+		/*
+		 * Triggering the work queue here is just an insurance
+		 * policy in case the syscall exit notify handler
+		 * doesn't run soon enough or ends up running on the
+		 * wrong CPU (can happen when audit sleeps)
+		 */
+		mce_schedule_work();
 		return;
 	}
 
@@ -731,6 +806,23 @@ reset:
 	return ret;
 }
 
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+		return 0;
+	if ((m->misc & 0x3f) > PAGE_SHIFT)
+		return 0;
+	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+		return 0;
+	return 1;
+}
+
 static void mce_clear_state(unsigned long *toclear)
 {
 	int i;
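
mce_usable_address() decodes the IA32_MCi_MISC layout: bits 5:0 give the position of the least significant valid address bit (a value above PAGE_SHIFT means the address is coarser than one page), and bits 8:6 give the address mode, which must be physical for page offlining to make sense. A hedged userspace decoder mirroring those checks, assuming MCM_ADDR_PHYS is 2 as in the kernel headers of this era:

#include <stdio.h>

#define PAGE_SHIFT	12	/* x86 base page size, 4 KiB */
#define MCM_ADDR_PHYS	2	/* "physical address" mode in MCi_MISC */

/* Mirrors the checks in mce_usable_address(): the low 6 bits of
 * MCi_MISC encode the least significant valid address bit, the next
 * 3 bits encode the address mode. */
static int misc_usable(unsigned long long misc)
{
	unsigned lsb  = misc & 0x3f;
	unsigned mode = (misc >> 6) & 7;

	return lsb <= PAGE_SHIFT && mode == MCM_ADDR_PHYS;
}

int main(void)
{
	/* Hypothetical MCi_MISC: mode = physical (2), lsb = 12 (page). */
	unsigned long long misc = (2ULL << 6) | 12;

	printf("usable: %d\n", misc_usable(misc));
	return 0;
}
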
@@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (m.status & MCI_STATUS_ADDRV)
 			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
 
+		/*
+		 * Action optional error. Queue address for later processing.
+		 * When the ring overflows we just ignore the AO error.
+		 * RED-PEN add some logging mechanism when
+		 * mce_usable_address() or mce_ring_add() fails.
+		 * RED-PEN don't ignore overflow for tolerant == 0
+		 */
+		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
+			mce_ring_add(m.addr >> PAGE_SHIFT);
+
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
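
The value queued is a page frame number rather than a raw address: shifting the physical address right by PAGE_SHIFT discards the offset within the 4 KiB page, which is all the granularity mce_usable_address() guarantees. A one-line illustration with a hypothetical address:

#include <stdio.h>

#define PAGE_SHIFT 12			/* x86 base pages are 4 KiB */

int main(void)
{
	unsigned long long addr = 0x12345678ULL;	/* hypothetical MCi_ADDR */
	unsigned long pfn = addr >> PAGE_SHIFT;		/* -> 0x12345 */

	printf("addr %#llx maps to pfn %#lx\n", addr, pfn);
	return 0;
}
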
@@ -916,6 +1018,36 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
+/* Dummy to break dependency. Actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+	unsigned long pfn;
+	mce_notify_irq();
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+	mce_notify_process();
+}
+
 #ifdef CONFIG_X86_MCE_INTEL
 /***
  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
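
The weak memory_failure() stub is what lets mce.c link even when mm/memory-failure.c is not built: a strong definition, when present, wins at link time. A small standalone illustration of that linker behavior (0x12 is MCE_VECTOR, the x86 machine check exception vector):

#include <stdio.h>

/* Weak default: chosen only if no other object defines the symbol.
 * Linking a file with a plain (strong) memory_failure() overrides it. */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printf("weak stub: pfn %#lx ignored (vector %d)\n", pfn, vector);
}

int main(void)
{
	memory_failure(0x42, 0x12);
	return 0;
}
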
@@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 	mce_init();
 	mce_cpu_features(c);
 	mce_init_timer();
+	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
 }
 
 /*