author		Andy Lutomirski <luto@kernel.org>	2016-08-11 05:35:23 -0400
committer	Ingo Molnar <mingo@kernel.org>		2016-08-24 06:11:42 -0400
commit		e37e43a497d5a8b7c0cc1736d56986f432c394c9 (patch)
tree		bd6a1682666510271a53ebd21dc6de5e13548aac
parent		b4a0f533e5976cb1a79f31d6152e1d322d79b7f1 (diff)
x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y)
This allows x86_64 kernels to enable vmapped stacks by setting
HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y
high level Kconfig option.

There are a couple of interesting bits:

First, x86 lazily faults in top-level paging entries for the vmalloc
area. This won't work if we get a page fault while trying to access
the stack: the CPU will promote it to a double-fault and we'll die.
To avoid this problem, probe the new stack when switching stacks and
forcibly populate the pgd entry for the stack when switching mms.

Second, once we have guard pages around the stack, we'll want to
detect and handle stack overflow.

I didn't enable it on x86_32. We'd need to rework the double-fault
code a bit and I'm concerned about running out of vmalloc virtual
addresses under some workloads.

This patch, by itself, will behave somewhat erratically when the
stack overflows while RSP is still more than a few tens of bytes
above the bottom of the stack. Specifically, we'll get #PF and make
it to no_context and then oops without reliably triggering a
double-fault, and no_context doesn't know about stack overflows.
The next patch will improve that case.

Thank you to Nadav and Brian for helping me pay enough attention to
the SDM to hopefully get this right.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c88f3e2920b18e6cc621d772a04a62c06869037e.1470907718.git.luto@kernel.org
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
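The stack-overflow heuristic added to do_double_fault() below hinges on one arithmetic trick: CR2 is treated as a stack overflow only if it lands in the single page immediately below the stack's lowest mapped byte, with unsigned wraparound taking care of addresses at or above the stack base. A minimal user-space sketch of just that comparison, assuming a 4096-byte page and a made-up stack base (the helper name and example addresses are illustrative, not part of the patch):

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL	/* illustrative; the kernel gets this from its headers */

	/*
	 * Return true if the faulting address 'cr2' lies in the guard page
	 * just below a stack whose lowest mapped byte is 'stack_base'.
	 * For cr2 >= stack_base the subtraction wraps to a huge unsigned
	 * value, so only the PAGE_SIZE bytes below the stack pass the test.
	 */
	static bool hits_guard_page(unsigned long stack_base, unsigned long cr2)
	{
		return stack_base - 1 - cr2 < PAGE_SIZE;
	}

	int main(void)
	{
		unsigned long base = 0xffffc90000004000UL;	/* hypothetical vmap'ed stack base */

		printf("%d\n", hits_guard_page(base, base - 8));		/* 1: in the guard page */
		printf("%d\n", hits_guard_page(base, base + 8));		/* 0: within the stack itself */
		printf("%d\n", hits_guard_page(base, base - PAGE_SIZE - 8));	/* 0: below the guard page */
		return 0;
	}

The check in the patch is this same comparison, with task_stack_page(tsk) as the stack base and the freshly read CR2 value.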
-rw-r--r--	arch/x86/Kconfig			1
-rw-r--r--	arch/x86/include/asm/switch_to.h	28
-rw-r--r--	arch/x86/kernel/traps.c			61
-rw-r--r--	arch/x86/mm/tlb.c			15
4 files changed, 104 insertions, 1 deletion
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c580d8c33562..21a6d0ec5983 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,6 +94,7 @@ config X86
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_WITHIN_STACK_FRAMES
 	select HAVE_EBPF_JIT			if X86_64
+	select HAVE_ARCH_VMAP_STACK		if X86_64
 	select HAVE_CC_STACKPROTECTOR
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8f321a1b03a1..14e4b20f0aaf 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -8,6 +8,28 @@ struct tss_struct;
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss);
 
+/* This runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+				     struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * If we switch to a stack that has a top-level paging entry
+	 * that is not present in the current mm, the resulting #PF
+	 * will be promoted to a double-fault and we'll panic.  Probe
+	 * the new stack now so that vmalloc_fault can fix up the page
+	 * tables if needed.  This can only happen if we use a stack
+	 * in vmap space.
+	 *
+	 * We assume that the stack is aligned so that it never spans
+	 * more than one top-level paging entry.
+	 *
+	 * To minimize cache pollution, just follow the stack pointer.
+	 */
+	READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
 #ifdef CONFIG_X86_32
 
 #ifdef CONFIG_CC_STACKPROTECTOR
@@ -39,6 +61,8 @@ do {									\
 	 */								\
 	unsigned long ebx, ecx, edx, esi, edi;				\
 									\
+	prepare_switch_to(prev, next);					\
+									\
 	asm volatile("pushl %%ebp\n\t"		/* save EBP */		\
 		     "movl %%esp,%[prev_sp]\n\t"	/* save ESP */	\
 		     "movl %[next_sp],%%esp\n\t"	/* restore ESP */ \
@@ -103,7 +127,9 @@ do {									\
  * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
  * has no effect.
  */
 #define switch_to(prev, next, last)					  \
+	prepare_switch_to(prev, next);					  \
+									  \
 	asm volatile(SAVE_CONTEXT					  \
 	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
 	     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	  \
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b70ca12dd389..907b4e4aeb5e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
 DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
 
+#ifdef CONFIG_VMAP_STACK
+static void __noreturn handle_stack_overflow(const char *message,
+					     struct pt_regs *regs,
+					     unsigned long fault_address)
+{
+	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+		 (void *)fault_address, current->stack,
+		 (char *)current->stack + THREAD_SIZE - 1);
+	die(message, regs, 0);
+
+	/* Be absolutely certain we don't return. */
+	panic(message);
+}
+#endif
+
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 {
 	static const char str[] = "double fault";
 	struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+	unsigned long cr2;
+#endif
 
 #ifdef CONFIG_X86_ESPFIX64
 	extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_nr = X86_TRAP_DF;
 
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * If we overflow the stack into a guard page, the CPU will fail
+	 * to deliver #PF and will send #DF instead.  Similarly, if we
+	 * take any non-IST exception while too close to the bottom of
+	 * the stack, the processor will get a page fault while
+	 * delivering the exception and will generate a double fault.
+	 *
+	 * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+	 * Page-Fault Exception (#PF)"):
+	 *
+	 *   Processors update CR2 whenever a page fault is detected. If a
+	 *   second page fault occurs while an earlier page fault is being
+	 *   delivered, the faulting linear address of the second fault will
+	 *   overwrite the contents of CR2 (replacing the previous
+	 *   address). These updates to CR2 occur even if the page fault
+	 *   results in a double fault or occurs during the delivery of a
+	 *   double fault.
+	 *
+	 * The logic below has a small possibility of incorrectly diagnosing
+	 * some errors as stack overflows.  For example, if the IDT or GDT
+	 * gets corrupted such that #GP delivery fails due to a bad descriptor
+	 * causing #GP and we hit this condition while CR2 coincidentally
+	 * points to the stack guard page, we'll think we overflowed the
+	 * stack.  Given that we're going to panic one way or another
+	 * if this happens, this isn't necessarily worth fixing.
+	 *
+	 * If necessary, we could improve the test by only diagnosing
+	 * a stack overflow if the saved RSP points within 47 bytes of
+	 * the bottom of the stack: if RSP == tsk_stack + 48 and we
+	 * take an exception, the stack is already aligned and there
+	 * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
+	 * possible error code, so a stack overflow would *not* double
+	 * fault.  With any less space left, exception delivery could
+	 * fail, and, as a practical matter, we've overflowed the
+	 * stack even if the actual trigger for the double fault was
+	 * something else.
+	 */
+	cr2 = read_cr2();
+	if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+		handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
 #ifdef CONFIG_DOUBLEFAULT
 	df_debug(regs, error_code);
 #endif
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4dbe65622810..a7655f6caf7d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	unsigned cpu = smp_processor_id();
 
 	if (likely(prev != next)) {
+		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+			/*
+			 * If our current stack is in vmalloc space and isn't
+			 * mapped in the new pgd, we'll double-fault.  Forcibly
+			 * map it.
+			 */
+			unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+			pgd_t *pgd = next->pgd + stack_pgd_index;
+
+			if (unlikely(pgd_none(*pgd)))
+				set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+		}
+
 #ifdef CONFIG_SMP
 		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 		this_cpu_write(cpu_tlbstate.active_mm, next);
 #endif
+
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
 		/*