aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
author: Tony Luck <tony.luck@intel.com> 2012-01-03 14:45:45 -0500
committer: Tony Luck <tony.luck@intel.com> 2012-01-03 15:07:01 -0500
commit: a8c321fbf9aeced45519248e5901af8cbc240510 (patch)
tree: 74df0cc3cfc4d8f5e422384005d42e7330a08e9a /arch/x86/kernel
parent: af104e394e17e328df85c25a9e21448539725b67 (diff)
x86/mce: Handle "action required" errors
All non-urgent actions (reporting low severity errors and handling "action-optional" errors) are now handled by a work queue. This means that TIF_MCE_NOTIFY can be used to block execution for a thread experiencing an "action-required" fault until we get all cpus out of the machine check handler (and the thread that hit the fault into mce_notify_process()). We use the new mce_{save,find,clear}_info() API to get information from do_machine_check() to mce_notify_process(), and then use the newly improved memory_failure(..., MF_ACTION_REQUIRED) to handle the error (possibly signalling the process). Update some comments to make the new code flows clearer. Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 95
1 files changed, 53 insertions, 42 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e1579c5a71d..56e4e79387c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -982,7 +982,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
982 barrier(); 982 barrier();
983 983
984 /* 984 /*
985 * When no restart IP must always kill or panic. 985 * When no restart IP might need to kill or panic.
986 * Assume the worst for now, but if we find the
987 * severity is MCE_AR_SEVERITY we have other options.
986 */ 988 */
987 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 989 if (!(m.mcgstatus & MCG_STATUS_RIPV))
988 kill_it = 1; 990 kill_it = 1;
@@ -1036,12 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1036 continue; 1038 continue;
1037 } 1039 }
1038 1040
1039 /*
1040 * Kill on action required.
1041 */
1042 if (severity == MCE_AR_SEVERITY)
1043 kill_it = 1;
1044
1045 mce_read_aux(&m, i); 1041 mce_read_aux(&m, i);
1046 1042
1047 /* 1043 /*
@@ -1062,6 +1058,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1062 } 1058 }
1063 } 1059 }
1064 1060
1061 /* mce_clear_state will clear *final, save locally for use later */
1062 m = *final;
1063
1065 if (!no_way_out) 1064 if (!no_way_out)
1066 mce_clear_state(toclear); 1065 mce_clear_state(toclear);
1067 1066
@@ -1073,27 +1072,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1073 no_way_out = worst >= MCE_PANIC_SEVERITY; 1072 no_way_out = worst >= MCE_PANIC_SEVERITY;
1074 1073
1075 /* 1074 /*
1076 * If we have decided that we just CAN'T continue, and the user 1075 * At insane "tolerant" levels we take no action. Otherwise
1077 * has not set tolerant to an insane level, give up and die. 1076 * we only die if we have no other choice. For less serious
1078 * 1077 * issues we try to recover, or limit damage to the current
1079 * This is mainly used in the case when the system doesn't 1078 * process.
1080 * support MCE broadcasting or it has been disabled.
1081 */ 1079 */
1082 if (no_way_out && tolerant < 3) 1080 if (tolerant < 3) {
1083 mce_panic("Fatal machine check on current CPU", final, msg); 1081 if (no_way_out)
1084 1082 mce_panic("Fatal machine check on current CPU", &m, msg);
1085 /* 1083 if (worst == MCE_AR_SEVERITY) {
1086 * If the error seems to be unrecoverable, something should be 1084 /* schedule action before return to userland */
1087 * done. Try to kill as little as possible. If we can kill just 1085 mce_save_info(m.addr);
1088 * one task, do that. If the user has set the tolerance very 1086 set_thread_flag(TIF_MCE_NOTIFY);
1089 * high, don't try to do anything at all. 1087 } else if (kill_it) {
1090 */ 1088 force_sig(SIGBUS, current);
1091 1089 }
1092 if (kill_it && tolerant < 3) 1090 }
1093 force_sig(SIGBUS, current);
1094
1095 /* notify userspace ASAP */
1096 set_thread_flag(TIF_MCE_NOTIFY);
1097 1091
1098 if (worst > 0) 1092 if (worst > 0)
1099 mce_report_event(regs); 1093 mce_report_event(regs);
@@ -1107,6 +1101,8 @@ EXPORT_SYMBOL_GPL(do_machine_check);
1107#ifndef CONFIG_MEMORY_FAILURE 1101#ifndef CONFIG_MEMORY_FAILURE
1108int memory_failure(unsigned long pfn, int vector, int flags) 1102int memory_failure(unsigned long pfn, int vector, int flags)
1109{ 1103{
1104 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1105 BUG_ON(flags & MF_ACTION_REQUIRED);
1110 printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" 1106 printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
1111 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); 1107 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
1112 1108
@@ -1115,27 +1111,44 @@ int memory_failure(unsigned long pfn, int vector, int flags)
1115#endif 1111#endif
1116 1112
1117/* 1113/*
1118 * Called after mce notification in process context. This code 1114 * Called in process context that interrupted by MCE and marked with
1119 * is allowed to sleep. Call the high level VM handler to process 1115 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
1120 * any corrupted pages. 1116 * This code is allowed to sleep.
1121 * Assume that the work queue code only calls this one at a time 1117 * Attempt possible recovery such as calling the high level VM handler to
1122 * per CPU. 1118 * process any corrupted pages, and kill/signal current process if required.
1123 * Note we don't disable preemption, so this code might run on the wrong 1119 * Action required errors are handled here.
1124 * CPU. In this case the event is picked up by the scheduled work queue.
1125 * This is merely a fast path to expedite processing in some common
1126 * cases.
1127 */ 1120 */
1128void mce_notify_process(void) 1121void mce_notify_process(void)
1129{ 1122{
1130 unsigned long pfn; 1123 unsigned long pfn;
1131 mce_notify_irq(); 1124 struct mce_info *mi = mce_find_info();
1132 while (mce_ring_get(&pfn)) 1125
1133 memory_failure(pfn, MCE_VECTOR, 0); 1126 if (!mi)
1127 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1128 pfn = mi->paddr >> PAGE_SHIFT;
1129
1130 clear_thread_flag(TIF_MCE_NOTIFY);
1131
1132 pr_err("Uncorrected hardware memory error in user-access at %llx",
1133 mi->paddr);
1134 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
1135 pr_err("Memory error not recovered");
1136 force_sig(SIGBUS, current);
1137 }
1138 mce_clear_info(mi);
1134} 1139}
1135 1140
1141/*
1142 * Action optional processing happens here (picking up
1143 * from the list of faulting pages that do_machine_check()
1144 * placed into the "ring").
1145 */
1136static void mce_process_work(struct work_struct *dummy) 1146static void mce_process_work(struct work_struct *dummy)
1137{ 1147{
1138 mce_notify_process(); 1148 unsigned long pfn;
1149
1150 while (mce_ring_get(&pfn))
1151 memory_failure(pfn, MCE_VECTOR, 0);
1139} 1152}
1140 1153
1141#ifdef CONFIG_X86_MCE_INTEL 1154#ifdef CONFIG_X86_MCE_INTEL
@@ -1225,8 +1238,6 @@ int mce_notify_irq(void)
1225 /* Not more than two messages every minute */ 1238 /* Not more than two messages every minute */
1226 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1239 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1227 1240
1228 clear_thread_flag(TIF_MCE_NOTIFY);
1229
1230 if (test_and_clear_bit(0, &mce_need_notify)) { 1241 if (test_and_clear_bit(0, &mce_need_notify)) {
1231 /* wake processes polling /dev/mcelog */ 1242 /* wake processes polling /dev/mcelog */
1232 wake_up_interruptible(&mce_chrdev_wait); 1243 wake_up_interruptible(&mce_chrdev_wait);