diff options
author | Tony Luck <tony.luck@intel.com> | 2012-05-14 18:07:48 -0400 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2012-05-14 18:07:48 -0400 |
commit | dad1743e5993f19b3d7e7bd0fb35dc45b5326626 (patch) | |
tree | 98ff01524c0e7393616a2ac9102d06ccc3ed6f95 /arch/x86 | |
parent | d48b97b403d23f6df0b990cee652bdf9a52337a3 (diff) |
x86/mce: Only restart instruction after machine check recovery if it is safe
Section 15.3.1.2 of the software developer manual has this to say about the
RIPV bit in the IA32_MCG_STATUS register:
RIPV (restart IP valid) flag, bit 0 — Indicates (when set) that program
execution can be restarted reliably at the instruction pointed to by the
instruction pointer pushed on the stack when the machine-check exception
is generated. When clear, the program cannot be reliably restarted at
the pushed instruction pointer.
We need to save the state of this bit in do_machine_check() and use it
in mce_notify_process() to force a signal; even if memory_failure() says
it made a complete recovery ... e.g. replaced a clean LRU page.
Acked-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 14 |
1 files changed, 11 insertions, 3 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087d..11c9166c3337 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -945,9 +945,10 @@ struct mce_info { | |||
945 | atomic_t inuse; | 945 | atomic_t inuse; |
946 | struct task_struct *t; | 946 | struct task_struct *t; |
947 | __u64 paddr; | 947 | __u64 paddr; |
948 | int restartable; | ||
948 | } mce_info[MCE_INFO_MAX]; | 949 | } mce_info[MCE_INFO_MAX]; |
949 | 950 | ||
950 | static void mce_save_info(__u64 addr) | 951 | static void mce_save_info(__u64 addr, int c) |
951 | { | 952 | { |
952 | struct mce_info *mi; | 953 | struct mce_info *mi; |
953 | 954 | ||
@@ -955,6 +956,7 @@ static void mce_save_info(__u64 addr) | |||
955 | if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { | 956 | if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { |
956 | mi->t = current; | 957 | mi->t = current; |
957 | mi->paddr = addr; | 958 | mi->paddr = addr; |
959 | mi->restartable = c; | ||
958 | return; | 960 | return; |
959 | } | 961 | } |
960 | } | 962 | } |
@@ -1130,7 +1132,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1130 | mce_panic("Fatal machine check on current CPU", &m, msg); | 1132 | mce_panic("Fatal machine check on current CPU", &m, msg); |
1131 | if (worst == MCE_AR_SEVERITY) { | 1133 | if (worst == MCE_AR_SEVERITY) { |
1132 | /* schedule action before return to userland */ | 1134 | /* schedule action before return to userland */ |
1133 | mce_save_info(m.addr); | 1135 | mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); |
1134 | set_thread_flag(TIF_MCE_NOTIFY); | 1136 | set_thread_flag(TIF_MCE_NOTIFY); |
1135 | } else if (kill_it) { | 1137 | } else if (kill_it) { |
1136 | force_sig(SIGBUS, current); | 1138 | force_sig(SIGBUS, current); |
@@ -1179,7 +1181,13 @@ void mce_notify_process(void) | |||
1179 | 1181 | ||
1180 | pr_err("Uncorrected hardware memory error in user-access at %llx", | 1182 | pr_err("Uncorrected hardware memory error in user-access at %llx", |
1181 | mi->paddr); | 1183 | mi->paddr); |
1182 | if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { | 1184 | /* |
1185 | * We must call memory_failure() here even if the current process is | ||
1186 | * doomed. We still need to mark the page as poisoned and alert any | ||
1187 | * other users of the page. | ||
1188 | */ | ||
1189 | if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || | ||
1190 | mi->restartable == 0) { | ||
1183 | pr_err("Memory error not recovered"); | 1191 | pr_err("Memory error not recovered"); |
1184 | force_sig(SIGBUS, current); | 1192 | force_sig(SIGBUS, current); |
1185 | } | 1193 | } |