author	Andy Lutomirski <luto@kernel.org>	2018-08-29 11:47:18 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2018-08-31 11:08:22 -0400
commit	4012e77a903d114f915fc607d6d2ed54a3d6c9b1 (patch)
tree	aa56d63db999604dd3004855996dca36c293e1cf
parent	829fe4aa9ac16417a904ad1de1307de906854bcf (diff)
x86/nmi: Fix NMI uaccess race against CR3 switching
An NMI can hit in the middle of context switching or in the middle of
switch_mm_irqs_off().  In either case, CR3 might not match current->mm,
which could cause copy_from_user_nmi() and friends to read the wrong
memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rik van Riel <riel@surriel.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jann Horn <jannh@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.luto@kernel.org
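As a minimal sketch of the calling pattern this establishes (the sampler function below is hypothetical and not part of the patch; the real callers are copy_from_user_nmi() itself and perf_callchain_user(), shown in the hunks that follow):

/*
 * Minimal sketch, not part of this patch: a hypothetical NMI-time sampler
 * that reads one word of user memory.  The point is the call order: check
 * nmi_uaccess_okay() first, only then touch user memory.
 */
static void sample_user_stack_word(struct pt_regs *regs)
{
	unsigned long word;

	/* Bail out if CR3 may not match current->mm right now. */
	if (!nmi_uaccess_okay())
		return;

	/* copy_from_user_nmi() returns the number of bytes NOT copied. */
	if (copy_from_user_nmi(&word, (void __user *)regs->sp, sizeof(word)))
		return;

	/* ... record 'word' in the sample ... */
}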
-rw-r--r--	arch/x86/events/core.c	2
-rw-r--r--	arch/x86/include/asm/tlbflush.h	40
-rw-r--r--	arch/x86/lib/usercopy.c	5
-rw-r--r--	arch/x86/mm/tlb.c	7
4 files changed, 53 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..58ce5288878e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
 	 * are on. This means that it may not match current->active_mm,
 	 * which will contain the previous user mm when we're in lazy TLB
 	 * mode even if we've already switched back to swapper_pg_dir.
+	 *
+	 * During switch_mm_irqs_off(), loaded_mm will be set to
+	 * LOADED_MM_SWITCHING during the brief interrupts-off window
+	 * when CR3 and loaded_mm would otherwise be inconsistent. This
+	 * is for nmi_uaccess_okay()'s benefit.
 	 */
 	struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
 	/* last user mm's ctx id */
@@ -246,6 +254,38 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..e96b99eb800c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -335,6 +339,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	if (next != &init_mm)
 		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 }
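Taken together, the two tlb.c hunks and nmi_uaccess_okay() form a small single-CPU ordering protocol: loaded_mm is marked LOADED_MM_SWITCHING before CR3 is written and only set to the new mm afterwards, so an NMI never sees a loaded_mm that promises more than CR3 currently provides. A condensed editorial sketch (compiler barriers suffice because the NMI runs on the same CPU; in the real code the CR3 write happens via load_new_mm_cr3() in both branches of the need_flush check):

/* Writer side: switch_mm_irqs_off(), interrupts off (condensed sketch). */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();			/* flag the switch before touching CR3 */
load_new_mm_cr3(next->pgd, new_asid, need_flush);
barrier();			/* CR3 is settled before loaded_mm is published */
this_cpu_write(cpu_tlbstate.loaded_mm, next);

/* Reader side: nmi_uaccess_okay() in NMI context (condensed sketch). */
if (this_cpu_read(cpu_tlbstate.loaded_mm) != current->mm)
	return false;		/* also rejects the LOADED_MM_SWITCHING window */
return true;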