diff options
author | Tony Luck <tony.luck@intel.com> | 2012-01-03 14:45:45 -0500 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2012-01-03 15:07:01 -0500 |
commit | a8c321fbf9aeced45519248e5901af8cbc240510 (patch) | |
tree | 74df0cc3cfc4d8f5e422384005d42e7330a08e9a /arch/x86/kernel/cpu/mcheck/mce.c | |
parent | af104e394e17e328df85c25a9e21448539725b67 (diff) |
x86/mce: Handle "action required" errors
All non-urgent actions (reporting low severity errors and handling
"action-optional" errors) are now handled by a work queue. This
means that TIF_MCE_NOTIFY can be used to block execution for a
thread experiencing an "action-required" fault until we get all
cpus out of the machine check handler (and the thread that hit
the fault into mce_notify_process()).
We use the new mce_{save,find,clear}_info() API to get information
from do_machine_check() to mce_notify_process(), and then use the
newly improved memory_failure(..., MF_ACTION_REQUIRED) to handle
the error (possibly signalling the process).
Update some comments to make the new code flows clearer.
Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 95 |
1 files changed, 53 insertions, 42 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e1579c5a71da..56e4e79387c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -982,7 +982,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
982 | barrier(); | 982 | barrier(); |
983 | 983 | ||
984 | /* | 984 | /* |
985 | * When no restart IP must always kill or panic. | 985 | * When no restart IP might need to kill or panic. |
986 | * Assume the worst for now, but if we find the | ||
987 | * severity is MCE_AR_SEVERITY we have other options. | ||
986 | */ | 988 | */ |
987 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | 989 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) |
988 | kill_it = 1; | 990 | kill_it = 1; |
@@ -1036,12 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1036 | continue; | 1038 | continue; |
1037 | } | 1039 | } |
1038 | 1040 | ||
1039 | /* | ||
1040 | * Kill on action required. | ||
1041 | */ | ||
1042 | if (severity == MCE_AR_SEVERITY) | ||
1043 | kill_it = 1; | ||
1044 | |||
1045 | mce_read_aux(&m, i); | 1041 | mce_read_aux(&m, i); |
1046 | 1042 | ||
1047 | /* | 1043 | /* |
@@ -1062,6 +1058,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1062 | } | 1058 | } |
1063 | } | 1059 | } |
1064 | 1060 | ||
1061 | /* mce_clear_state will clear *final, save locally for use later */ | ||
1062 | m = *final; | ||
1063 | |||
1065 | if (!no_way_out) | 1064 | if (!no_way_out) |
1066 | mce_clear_state(toclear); | 1065 | mce_clear_state(toclear); |
1067 | 1066 | ||
@@ -1073,27 +1072,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1073 | no_way_out = worst >= MCE_PANIC_SEVERITY; | 1072 | no_way_out = worst >= MCE_PANIC_SEVERITY; |
1074 | 1073 | ||
1075 | /* | 1074 | /* |
1076 | * If we have decided that we just CAN'T continue, and the user | 1075 | * At insane "tolerant" levels we take no action. Otherwise |
1077 | * has not set tolerant to an insane level, give up and die. | 1076 | * we only die if we have no other choice. For less serious |
1078 | * | 1077 | * issues we try to recover, or limit damage to the current |
1079 | * This is mainly used in the case when the system doesn't | 1078 | * process. |
1080 | * support MCE broadcasting or it has been disabled. | ||
1081 | */ | 1079 | */ |
1082 | if (no_way_out && tolerant < 3) | 1080 | if (tolerant < 3) { |
1083 | mce_panic("Fatal machine check on current CPU", final, msg); | 1081 | if (no_way_out) |
1084 | 1082 | mce_panic("Fatal machine check on current CPU", &m, msg); | |
1085 | /* | 1083 | if (worst == MCE_AR_SEVERITY) { |
1086 | * If the error seems to be unrecoverable, something should be | 1084 | /* schedule action before return to userland */ |
1087 | * done. Try to kill as little as possible. If we can kill just | 1085 | mce_save_info(m.addr); |
1088 | * one task, do that. If the user has set the tolerance very | 1086 | set_thread_flag(TIF_MCE_NOTIFY); |
1089 | * high, don't try to do anything at all. | 1087 | } else if (kill_it) { |
1090 | */ | 1088 | force_sig(SIGBUS, current); |
1091 | 1089 | } | |
1092 | if (kill_it && tolerant < 3) | 1090 | } |
1093 | force_sig(SIGBUS, current); | ||
1094 | |||
1095 | /* notify userspace ASAP */ | ||
1096 | set_thread_flag(TIF_MCE_NOTIFY); | ||
1097 | 1091 | ||
1098 | if (worst > 0) | 1092 | if (worst > 0) |
1099 | mce_report_event(regs); | 1093 | mce_report_event(regs); |
@@ -1107,6 +1101,8 @@ EXPORT_SYMBOL_GPL(do_machine_check); | |||
1107 | #ifndef CONFIG_MEMORY_FAILURE | 1101 | #ifndef CONFIG_MEMORY_FAILURE |
1108 | int memory_failure(unsigned long pfn, int vector, int flags) | 1102 | int memory_failure(unsigned long pfn, int vector, int flags) |
1109 | { | 1103 | { |
1104 | /* mce_severity() should not hand us an ACTION_REQUIRED error */ | ||
1105 | BUG_ON(flags & MF_ACTION_REQUIRED); | ||
1110 | printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" | 1106 | printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" |
1111 | "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); | 1107 | "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); |
1112 | 1108 | ||
@@ -1115,27 +1111,44 @@ int memory_failure(unsigned long pfn, int vector, int flags) | |||
1115 | #endif | 1111 | #endif |
1116 | 1112 | ||
1117 | /* | 1113 | /* |
1118 | * Called after mce notification in process context. This code | 1114 | * Called in process context that interrupted by MCE and marked with |
1119 | * is allowed to sleep. Call the high level VM handler to process | 1115 | * TIF_MCE_NOTIFY, just before returning to erroneous userland. |
1120 | * any corrupted pages. | 1116 | * This code is allowed to sleep. |
1121 | * Assume that the work queue code only calls this one at a time | 1117 | * Attempt possible recovery such as calling the high level VM handler to |
1122 | * per CPU. | 1118 | * process any corrupted pages, and kill/signal current process if required. |
1123 | * Note we don't disable preemption, so this code might run on the wrong | 1119 | * Action required errors are handled here. |
1124 | * CPU. In this case the event is picked up by the scheduled work queue. | ||
1125 | * This is merely a fast path to expedite processing in some common | ||
1126 | * cases. | ||
1127 | */ | 1120 | */ |
1128 | void mce_notify_process(void) | 1121 | void mce_notify_process(void) |
1129 | { | 1122 | { |
1130 | unsigned long pfn; | 1123 | unsigned long pfn; |
1131 | mce_notify_irq(); | 1124 | struct mce_info *mi = mce_find_info(); |
1132 | while (mce_ring_get(&pfn)) | 1125 | |
1133 | memory_failure(pfn, MCE_VECTOR, 0); | 1126 | if (!mi) |
1127 | mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); | ||
1128 | pfn = mi->paddr >> PAGE_SHIFT; | ||
1129 | |||
1130 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
1131 | |||
1132 | pr_err("Uncorrected hardware memory error in user-access at %llx", | ||
1133 | mi->paddr); | ||
1134 | if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { | ||
1135 | pr_err("Memory error not recovered"); | ||
1136 | force_sig(SIGBUS, current); | ||
1137 | } | ||
1138 | mce_clear_info(mi); | ||
1134 | } | 1139 | } |
1135 | 1140 | ||
1141 | /* | ||
1142 | * Action optional processing happens here (picking up | ||
1143 | * from the list of faulting pages that do_machine_check() | ||
1144 | * placed into the "ring"). | ||
1145 | */ | ||
1136 | static void mce_process_work(struct work_struct *dummy) | 1146 | static void mce_process_work(struct work_struct *dummy) |
1137 | { | 1147 | { |
1138 | mce_notify_process(); | 1148 | unsigned long pfn; |
1149 | |||
1150 | while (mce_ring_get(&pfn)) | ||
1151 | memory_failure(pfn, MCE_VECTOR, 0); | ||
1139 | } | 1152 | } |
1140 | 1153 | ||
1141 | #ifdef CONFIG_X86_MCE_INTEL | 1154 | #ifdef CONFIG_X86_MCE_INTEL |
@@ -1225,8 +1238,6 @@ int mce_notify_irq(void) | |||
1225 | /* Not more than two messages every minute */ | 1238 | /* Not more than two messages every minute */ |
1226 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | 1239 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); |
1227 | 1240 | ||
1228 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
1229 | |||
1230 | if (test_and_clear_bit(0, &mce_need_notify)) { | 1241 | if (test_and_clear_bit(0, &mce_need_notify)) { |
1231 | /* wake processes polling /dev/mcelog */ | 1242 | /* wake processes polling /dev/mcelog */ |
1232 | wake_up_interruptible(&mce_chrdev_wait); | 1243 | wake_up_interruptible(&mce_chrdev_wait); |