aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorTony Luck <tony.luck@intel.com>2012-05-14 18:07:48 -0400
committerTony Luck <tony.luck@intel.com>2012-05-14 18:07:48 -0400
commitdad1743e5993f19b3d7e7bd0fb35dc45b5326626 (patch)
tree98ff01524c0e7393616a2ac9102d06ccc3ed6f95 /arch/x86
parentd48b97b403d23f6df0b990cee652bdf9a52337a3 (diff)
x86/mce: Only restart instruction after machine check recovery if it is safe
Section 15.3.1.2 of the software developer manual has this to say about the RIPV bit in the IA32_MCG_STATUS register: RIPV (restart IP valid) flag, bit 0 — Indicates (when set) that program execution can be restarted reliably at the instruction pointed to by the instruction pointer pushed on the stack when the machine-check exception is generated. When clear, the program cannot be reliably restarted at the pushed instruction pointer. We need to save the state of this bit in do_machine_check() and use it in mce_notify_process() to force a signal; even if memory_failure() says it made a complete recovery ... e.g. replaced a clean LRU page. Acked-by: Borislav Petkov <bp@amd64.org> Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c14
1 files changed, 11 insertions, 3 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09c087d..11c9166c3337 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -945,9 +945,10 @@ struct mce_info {
945 atomic_t inuse; 945 atomic_t inuse;
946 struct task_struct *t; 946 struct task_struct *t;
947 __u64 paddr; 947 __u64 paddr;
948 int restartable;
948} mce_info[MCE_INFO_MAX]; 949} mce_info[MCE_INFO_MAX];
949 950
950static void mce_save_info(__u64 addr) 951static void mce_save_info(__u64 addr, int c)
951{ 952{
952 struct mce_info *mi; 953 struct mce_info *mi;
953 954
@@ -955,6 +956,7 @@ static void mce_save_info(__u64 addr)
955 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { 956 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
956 mi->t = current; 957 mi->t = current;
957 mi->paddr = addr; 958 mi->paddr = addr;
959 mi->restartable = c;
958 return; 960 return;
959 } 961 }
960 } 962 }
@@ -1130,7 +1132,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1130 mce_panic("Fatal machine check on current CPU", &m, msg); 1132 mce_panic("Fatal machine check on current CPU", &m, msg);
1131 if (worst == MCE_AR_SEVERITY) { 1133 if (worst == MCE_AR_SEVERITY) {
1132 /* schedule action before return to userland */ 1134 /* schedule action before return to userland */
1133 mce_save_info(m.addr); 1135 mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1134 set_thread_flag(TIF_MCE_NOTIFY); 1136 set_thread_flag(TIF_MCE_NOTIFY);
1135 } else if (kill_it) { 1137 } else if (kill_it) {
1136 force_sig(SIGBUS, current); 1138 force_sig(SIGBUS, current);
@@ -1179,7 +1181,13 @@ void mce_notify_process(void)
1179 1181
1180 pr_err("Uncorrected hardware memory error in user-access at %llx", 1182 pr_err("Uncorrected hardware memory error in user-access at %llx",
1181 mi->paddr); 1183 mi->paddr);
1182 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { 1184 /*
1185 * We must call memory_failure() here even if the current process is
1186 * doomed. We still need to mark the page as poisoned and alert any
1187 * other users of the page.
1188 */
1189 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
1190 mi->restartable == 0) {
1183 pr_err("Memory error not recovered"); 1191 pr_err("Memory error not recovered");
1184 force_sig(SIGBUS, current); 1192 force_sig(SIGBUS, current);
1185 } 1193 }