author     Andy Lutomirski <luto@kernel.org>      2018-08-29 11:47:18 -0400
committer  Thomas Gleixner <tglx@linutronix.de>   2018-08-31 11:08:22 -0400
commit     4012e77a903d114f915fc607d6d2ed54a3d6c9b1
tree       aa56d63db999604dd3004855996dca36c293e1cf
parent     829fe4aa9ac16417a904ad1de1307de906854bcf
x86/nmi: Fix NMI uaccess race against CR3 switching
An NMI can hit in the middle of context switching or in the middle of
switch_mm_irqs_off(). In either case, CR3 might not match current->mm,
which could cause copy_from_user_nmi() and friends to read the wrong
memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rik van Riel <riel@surriel.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jann Horn <jannh@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.luto@kernel.org
 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy.c         |  5 +++++
 arch/x86/mm/tlb.c               |  7 +++++++
 4 files changed, 53 insertions(+), 1 deletion(-)
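Before the hunks themselves, a minimal sketch of the access pattern the new helper enforces. This is editorial and not part of the patch; nmi_read_user_word() is a hypothetical caller, shown only to illustrate how NMI-context code is expected to consult nmi_uaccess_okay() before touching user memory, in the same way perf_callchain_user() does in the first hunk below.

/*
 * Illustrative sketch, not from this patch: a hypothetical NMI-context
 * helper that refuses to read user memory while CR3 and current->mm may
 * disagree, instead of silently reading through the wrong page tables.
 */
#include <linux/uaccess.h>
#include <asm/tlbflush.h>

static unsigned long nmi_read_user_word(unsigned long *dst,
					const void __user *uaddr)
{
	/* The CPU may be mid-way through switch_mm_irqs_off(). */
	if (!nmi_uaccess_okay())
		return sizeof(*dst);	/* convention: bytes NOT copied */

	/*
	 * With this patch applied, copy_from_user_nmi() repeats the
	 * nmi_uaccess_okay() check internally, so the explicit test above
	 * mirrors the belt-and-braces style of the perf hunk below.
	 */
	return copy_from_user_nmi(dst, uaddr, sizeof(*dst));
}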
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..58ce5288878e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
 	 * are on.  This means that it may not match current->active_mm,
 	 * which will contain the previous user mm when we're in lazy TLB
 	 * mode even if we've already switched back to swapper_pg_dir.
+	 *
+	 * During switch_mm_irqs_off(), loaded_mm will be set to
+	 * LOADED_MM_SWITCHING during the brief interrupts-off window
+	 * when CR3 and loaded_mm would otherwise be inconsistent.  This
+	 * is for nmi_uaccess_okay()'s benefit.
 	 */
 	struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
 	/* last user mm's ctx id */
@@ -246,6 +254,38 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
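An editorial gloss on the design choice in nmi_uaccess_okay() above: the condition it really wants is "the page-table root the CPU is using matches current->mm", roughly the sketch below, but reading CR3 can be slow in a VM with shadow paging (as the in-code comment notes). The fast path therefore compares loaded_mm against current->mm and relies on LOADED_MM_SWITCHING to cover the window where loaded_mm and CR3 briefly disagree. The helper below is hypothetical and assumes the same includes as asm/tlbflush.h; it only spells out the slow form for comparison.

/*
 * Editorial sketch of the "exact but potentially slow" check; the real
 * helper only uses read_cr3_pa() inside a VM_WARN_ON_ONCE() sanity check.
 */
static inline bool nmi_uaccess_okay_exact(void)
{
	return current->mm &&
	       current->mm->pgd == __va(read_cr3_pa());
}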
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
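A usage note on this hunk (editorial, not from the commit message): copy_from_user_nmi() reports the number of bytes it did not copy, so returning n when nmi_uaccess_okay() fails makes a CR3/mm mismatch look exactly like the existing __range_not_ok() rejection. Callers that already treat a nonzero return, e.g. copy_from_user_nmi(&frame, fp, sizeof(frame)) != 0, as "could not read user memory" need no changes.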
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..e96b99eb800c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 	choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+	/* Let nmi_uaccess_okay() know that we're changing CR3. */
+	this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+	barrier();
+
 	if (need_flush) {
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -335,6 +339,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	if (next != &init_mm)
 		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 }
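Taken together, the two hunks above and the new helper form a small per-CPU protocol. The condensed sketch below is editorial: the ASID/flush branches are omitted and load_new_mm_cr3() stands in for the actual CR3 write path, but the ordering is the one the patch establishes.

	/* Let nmi_uaccess_okay() know that we're changing CR3. */
	this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
	barrier();

	/*
	 * From here until the final loaded_mm store, an NMI on this CPU sees
	 * loaded_mm == LOADED_MM_SWITCHING, which never equals a real
	 * current->mm, so nmi_uaccess_okay() returns false and user accesses
	 * are refused.
	 */
	load_new_mm_cr3(next->pgd, new_asid, need_flush);

	/* Make sure we write CR3 before loaded_mm. */
	barrier();

	this_cpu_write(cpu_tlbstate.loaded_mm, next);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

barrier() is only a compiler barrier, which is sufficient here: nmi_uaccess_okay() reads loaded_mm on the same CPU that is doing the switch, and an NMI on that CPU observes the stores in program order.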