Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 179 ++++++++++++++++++--------
1 file changed, 126 insertions(+), 53 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..ad573d8baf10 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
 	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+	if (m->status & MCI_STATUS_MISCV)
+		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+	if (m->status & MCI_STATUS_ADDRV) {
+		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+		/*
+		 * Mask the reported address by the reported granularity.
+		 */
+		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+			m->addr >>= shift;
+			m->addr <<= shift;
+		}
+	}
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
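
The masking step in mce_read_aux() clips the recorded address to the granularity the bank reports: MCI_MISC_ADDR_LSB() extracts the position of the least significant valid address bit from MCi_MISC (bits 5:0 in the architectural layout), and the shift pair zeroes everything below it. A standalone sketch of that arithmetic, with made-up register values:

    #include <stdint.h>
    #include <stdio.h>

    /* Same extraction the kernel macro performs: bits 5:0 of MCi_MISC
     * give the least significant valid bit of the recorded address. */
    #define MISC_ADDR_LSB(misc)  ((uint8_t)((misc) & 0x3f))

    int main(void)
    {
        uint64_t addr = 0x123456789abcULL; /* hypothetical MCi_ADDR */
        uint64_t misc = 0x0c;              /* hypothetical MCi_MISC: lsb = 12 (4K) */
        uint8_t shift = MISC_ADDR_LSB(misc);

        addr >>= shift;                    /* drop the bits the bank marks invalid */
        addr <<= shift;                    /* -> 0x123456789000 */
        printf("masked address: %#llx\n", (unsigned long long)addr);
        return 0;
    }
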
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		if (!(flags & MCP_TIMESTAMP))
 			m.tsc = 0;
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)
 }
 
 /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX	16
+
+struct mce_info {
+	atomic_t		inuse;
+	struct task_struct	*t;
+	__u64			paddr;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+			mi->t = current;
+			mi->paddr = addr;
+			return;
+		}
+	}
+
+	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+		if (atomic_read(&mi->inuse) && mi->t == current)
+			return mi;
+	return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+	atomic_set(&mi->inuse, 0);
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
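
mce_save_info() claims one of the sixteen mce_info slots with atomic_cmpxchg(), so concurrent machine checks can each stash a (task, address) pair without locks: only the CPU whose compare-and-exchange flips inuse from 0 to 1 owns a slot. A minimal user-space sketch of the same claim/release pattern using C11 atomics (the names here are illustrative, not kernel API):

    #include <stdatomic.h>
    #include <stdio.h>

    #define SLOT_MAX 16

    struct slot {
        atomic_int inuse;
        unsigned long long paddr;
    };

    static struct slot slots[SLOT_MAX];

    /* Claim a free slot: compare-and-exchange guarantees at most one
     * winner per slot even when several threads race through the loop. */
    static struct slot *claim_slot(unsigned long long paddr)
    {
        for (struct slot *s = slots; s < &slots[SLOT_MAX]; s++) {
            int expected = 0;  /* reset each try: CAS rewrites it on failure */
            if (atomic_compare_exchange_strong(&s->inuse, &expected, 1)) {
                s->paddr = paddr;
                return s;
            }
        }
        return NULL;           /* the kernel panics here instead */
    }

    static void release_slot(struct slot *s)  /* mce_clear_info() analogue */
    {
        atomic_store(&s->inuse, 0);
    }

    int main(void)
    {
        struct slot *s = claim_slot(0x123000);
        if (s) {
            printf("claimed slot %td\n", s - slots);
            release_slot(s);
        }
        return 0;
    }
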
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	barrier();
 
 	/*
-	 * When no restart IP must always kill or panic.
+	 * When no restart IP might need to kill or panic.
+	 * Assume the worst for now, but if we find the
+	 * severity is MCE_AR_SEVERITY we have other options.
 	 */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		kill_it = 1;
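
MCG_STATUS.RIPV ("restart IP valid", bit 0) tells the handler whether the return address on the exception frame can safely be resumed; with it clear the patch still starts from the pessimistic kill_it = 1 and only relaxes that later if the worst severity turns out to be MCE_AR_SEVERITY. A tiny illustration of the bit test (bit positions per the architectural MCG_STATUS layout):

    #include <stdint.h>
    #include <stdio.h>

    #define MCG_STATUS_RIPV  (1ULL << 0)  /* restart IP valid */
    #define MCG_STATUS_EIPV  (1ULL << 1)  /* error IP valid */

    int main(void)
    {
        uint64_t mcgstatus = MCG_STATUS_EIPV;  /* example: RIPV clear */

        /* Mirrors the hunk above: no valid restart IP means the
         * interrupted context cannot simply be resumed. */
        int kill_it = !(mcgstatus & MCG_STATUS_RIPV);
        printf("kill_it = %d\n", kill_it);
        return 0;
    }
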
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 		}
 
-		/*
-		 * Kill on action required.
-		 */
-		if (severity == MCE_AR_SEVERITY)
-			kill_it = 1;
-
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		/*
 		 * Action optional error. Queue address for later processing.
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		}
 	}
 
+	/* mce_clear_state will clear *final, save locally for use later */
+	m = *final;
+
 	if (!no_way_out)
 		mce_clear_state(toclear);
 
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
-	 * If we have decided that we just CAN'T continue, and the user
-	 * has not set tolerant to an insane level, give up and die.
-	 *
-	 * This is mainly used in the case when the system doesn't
-	 * support MCE broadcasting or it has been disabled.
-	 */
-	if (no_way_out && tolerant < 3)
-		mce_panic("Fatal machine check on current CPU", final, msg);
-
-	/*
-	 * If the error seems to be unrecoverable, something should be
-	 * done. Try to kill as little as possible. If we can kill just
-	 * one task, do that. If the user has set the tolerance very
-	 * high, don't try to do anything at all.
+	 * At insane "tolerant" levels we take no action. Otherwise
+	 * we only die if we have no other choice. For less serious
+	 * issues we try to recover, or limit damage to the current
+	 * process.
 	 */
-
-	if (kill_it && tolerant < 3)
-		force_sig(SIGBUS, current);
-
-	/* notify userspace ASAP */
-	set_thread_flag(TIF_MCE_NOTIFY);
+	if (tolerant < 3) {
+		if (no_way_out)
+			mce_panic("Fatal machine check on current CPU", &m, msg);
+		if (worst == MCE_AR_SEVERITY) {
+			/* schedule action before return to userland */
+			mce_save_info(m.addr);
+			set_thread_flag(TIF_MCE_NOTIFY);
+		} else if (kill_it) {
+			force_sig(SIGBUS, current);
+		}
+	}
 
 	if (worst > 0)
 		mce_report_event(regs);
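
The rewritten block turns two independent checks into one ordered policy under tolerant < 3: panic when there is no way out, defer recovery to mce_notify_process() for action-required errors, and fall back to SIGBUS otherwise. A condensed restatement of that ordering (severity names other than MCE_AR_SEVERITY/MCE_PANIC_SEVERITY are placeholders, and the kernel's real severity ladder has more rungs):

    #include <stdbool.h>
    #include <stdio.h>

    enum severity { SEV_NONE, SEV_SOME, SEV_AR, SEV_PANIC };

    static const char *decide(int tolerant, enum severity worst, bool kill_it)
    {
        if (tolerant >= 3)
            return "log only";      /* "insane" tolerance: take no action */
        if (worst >= SEV_PANIC)
            return "mce_panic()";   /* no_way_out */
        if (worst == SEV_AR)
            return "mce_save_info() + TIF_MCE_NOTIFY";
        if (kill_it)
            return "force_sig(SIGBUS)";
        return "continue";
    }

    int main(void)
    {
        printf("%s\n", decide(1, SEV_AR, true));  /* -> deferred AR recovery */
        return 0;
    }
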
@@ -1094,34 +1146,57 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+	/* mce_severity() should not hand us an ACTION_REQUIRED error */
+	BUG_ON(flags & MF_ACTION_REQUIRED);
+	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+	return 0;
 }
+#endif
 
 /*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
 */
 void mce_notify_process(void)
 {
 	unsigned long pfn;
-	mce_notify_irq();
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR);
+	struct mce_info *mi = mce_find_info();
+
+	if (!mi)
+		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+	pfn = mi->paddr >> PAGE_SHIFT;
+
+	clear_thread_flag(TIF_MCE_NOTIFY);
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		 mi->paddr);
+	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
+	mce_clear_info(mi);
 }
 
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-	mce_notify_process();
+	unsigned long pfn;
+
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
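
After this patch memory_failure() is reached on two distinct paths: synchronously from mce_notify_process() with MF_ACTION_REQUIRED for the faulting task itself, and from the mce_process_work() work queue with flags 0 for action-optional errors pulled off the ring. A stub sketch contrasting the two call flavors (memory_failure_stub() and the flag value are invented stand-ins for the mm/memory-failure.c entry point; MCE_VECTOR is 18, the #MC exception vector):

    #include <stdio.h>

    #define MCE_VECTOR          18    /* #MC exception vector */
    #define MF_ACTION_REQUIRED  0x1   /* illustrative value; the real flag lives in <linux/mm.h> */

    static int memory_failure_stub(unsigned long pfn, int vector, int flags)
    {
        printf("pfn=%#lx vector=%d: %s\n", pfn, vector,
               flags & MF_ACTION_REQUIRED ?
               "AR: handle now, in the context of the faulting task" :
               "AO: handled later, from the work queue");
        return 0;
    }

    int main(void)
    {
        memory_failure_stub(0x123, MCE_VECTOR, MF_ACTION_REQUIRED); /* mce_notify_process() path */
        memory_failure_stub(0x456, MCE_VECTOR, 0);                  /* mce_process_work() path */
        return 0;
    }
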
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void)
 	/* Not more than two messages every minute */
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
-	clear_thread_flag(TIF_MCE_NOTIFY);
-
 	if (test_and_clear_bit(0, &mce_need_notify)) {
 		/* wake processes polling /dev/mcelog */
 		wake_up_interruptible(&mce_chrdev_wait);