diff options
author | Andy Lutomirski <luto@kernel.org> | 2017-12-04 09:07:23 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2017-12-17 08:27:38 -0500 |
commit | 7f2590a110b837af5679d08fc25c6227c5a8c497 (patch) | |
tree | f94f6d9d657feaf40bda4f7597807b8beb3f0081 | |
parent | 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb (diff) |
x86/entry/64: Use a per-CPU trampoline stack for IDT entries
Historically, IDT entries from usermode have always gone directly
to the running task's kernel stack. Rearrange it so that we enter on
a per-CPU trampoline stack and then manually switch to the task's stack.
This touches a couple of extra cachelines, but it gives us a chance
to run some code before we touch the kernel stack.
The asm isn't exactly beautiful, but I think that fully refactoring
it can wait.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | arch/x86/entry/entry_64.S | 67 | ||||
-rw-r--r-- | arch/x86/entry/entry_64_compat.S | 5 | ||||
-rw-r--r-- | arch/x86/include/asm/switch_to.h | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/traps.h | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 21 |
6 files changed, 72 insertions, 32 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 32306788821c..35b8e949ac2f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S | |||
@@ -560,6 +560,13 @@ END(irq_entries_start) | |||
560 | /* 0(%rsp): ~(interrupt number) */ | 560 | /* 0(%rsp): ~(interrupt number) */ |
561 | .macro interrupt func | 561 | .macro interrupt func |
562 | cld | 562 | cld |
563 | |||
564 | testb $3, CS-ORIG_RAX(%rsp) | ||
565 | jz 1f | ||
566 | SWAPGS | ||
567 | call switch_to_thread_stack | ||
568 | 1: | ||
569 | |||
563 | ALLOC_PT_GPREGS_ON_STACK | 570 | ALLOC_PT_GPREGS_ON_STACK |
564 | SAVE_C_REGS | 571 | SAVE_C_REGS |
565 | SAVE_EXTRA_REGS | 572 | SAVE_EXTRA_REGS |
@@ -569,12 +576,8 @@ END(irq_entries_start) | |||
569 | jz 1f | 576 | jz 1f |
570 | 577 | ||
571 | /* | 578 | /* |
572 | * IRQ from user mode. Switch to kernel gsbase and inform context | 579 | * IRQ from user mode. |
573 | * tracking that we're in kernel mode. | 580 | * |
574 | */ | ||
575 | SWAPGS | ||
576 | |||
577 | /* | ||
578 | * We need to tell lockdep that IRQs are off. We can't do this until | 581 | * We need to tell lockdep that IRQs are off. We can't do this until |
579 | * we fix gsbase, and we should do it before enter_from_user_mode | 582 | * we fix gsbase, and we should do it before enter_from_user_mode |
580 | * (which can take locks). Since TRACE_IRQS_OFF is idempotent, | 583 | * (which can take locks). Since TRACE_IRQS_OFF is idempotent, |
@@ -828,6 +831,32 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt | |||
828 | */ | 831 | */ |
829 | #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) | 832 | #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) |
830 | 833 | ||
834 | /* | ||
835 | * Switch to the thread stack. This is called with the IRET frame and | ||
836 | * orig_ax on the stack. (That is, RDI..R12 are not on the stack and | ||
837 | * space has not been allocated for them.) | ||
838 | */ | ||
839 | ENTRY(switch_to_thread_stack) | ||
840 | UNWIND_HINT_FUNC | ||
841 | |||
842 | pushq %rdi | ||
843 | movq %rsp, %rdi | ||
844 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp | ||
845 | UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI | ||
846 | |||
847 | pushq 7*8(%rdi) /* regs->ss */ | ||
848 | pushq 6*8(%rdi) /* regs->rsp */ | ||
849 | pushq 5*8(%rdi) /* regs->eflags */ | ||
850 | pushq 4*8(%rdi) /* regs->cs */ | ||
851 | pushq 3*8(%rdi) /* regs->ip */ | ||
852 | pushq 2*8(%rdi) /* regs->orig_ax */ | ||
853 | pushq 8(%rdi) /* return address */ | ||
854 | UNWIND_HINT_FUNC | ||
855 | |||
856 | movq (%rdi), %rdi | ||
857 | ret | ||
858 | END(switch_to_thread_stack) | ||
859 | |||
831 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 | 860 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 |
832 | ENTRY(\sym) | 861 | ENTRY(\sym) |
833 | UNWIND_HINT_IRET_REGS offset=\has_error_code*8 | 862 | UNWIND_HINT_IRET_REGS offset=\has_error_code*8 |
@@ -845,11 +874,12 @@ ENTRY(\sym) | |||
845 | 874 | ||
846 | ALLOC_PT_GPREGS_ON_STACK | 875 | ALLOC_PT_GPREGS_ON_STACK |
847 | 876 | ||
848 | .if \paranoid | 877 | .if \paranoid < 2 |
849 | .if \paranoid == 1 | ||
850 | testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ | 878 | testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ |
851 | jnz 1f | 879 | jnz .Lfrom_usermode_switch_stack_\@ |
852 | .endif | 880 | .endif |
881 | |||
882 | .if \paranoid | ||
853 | call paranoid_entry | 883 | call paranoid_entry |
854 | .else | 884 | .else |
855 | call error_entry | 885 | call error_entry |
@@ -891,20 +921,15 @@ ENTRY(\sym) | |||
891 | jmp error_exit | 921 | jmp error_exit |
892 | .endif | 922 | .endif |
893 | 923 | ||
894 | .if \paranoid == 1 | 924 | .if \paranoid < 2 |
895 | /* | 925 | /* |
896 | * Paranoid entry from userspace. Switch stacks and treat it | 926 | * Entry from userspace. Switch stacks and treat it |
897 | * as a normal entry. This means that paranoid handlers | 927 | * as a normal entry. This means that paranoid handlers |
898 | * run in real process context if user_mode(regs). | 928 | * run in real process context if user_mode(regs). |
899 | */ | 929 | */ |
900 | 1: | 930 | .Lfrom_usermode_switch_stack_\@: |
901 | call error_entry | 931 | call error_entry |
902 | 932 | ||
903 | |||
904 | movq %rsp, %rdi /* pt_regs pointer */ | ||
905 | call sync_regs | ||
906 | movq %rax, %rsp /* switch stack */ | ||
907 | |||
908 | movq %rsp, %rdi /* pt_regs pointer */ | 933 | movq %rsp, %rdi /* pt_regs pointer */ |
909 | 934 | ||
910 | .if \has_error_code | 935 | .if \has_error_code |
@@ -1165,6 +1190,14 @@ ENTRY(error_entry) | |||
1165 | SWAPGS | 1190 | SWAPGS |
1166 | 1191 | ||
1167 | .Lerror_entry_from_usermode_after_swapgs: | 1192 | .Lerror_entry_from_usermode_after_swapgs: |
1193 | /* Put us onto the real thread stack. */ | ||
1194 | popq %r12 /* save return addr in %r12 */ | ||
1195 | movq %rsp, %rdi /* arg0 = pt_regs pointer */ | ||
1196 | call sync_regs | ||
1197 | movq %rax, %rsp /* switch stack */ | ||
1198 | ENCODE_FRAME_POINTER | ||
1199 | pushq %r12 | ||
1200 | |||
1168 | /* | 1201 | /* |
1169 | * We need to tell lockdep that IRQs are off. We can't do this until | 1202 | * We need to tell lockdep that IRQs are off. We can't do this until |
1170 | * we fix gsbase, and we should do it before enter_from_user_mode | 1203 | * we fix gsbase, and we should do it before enter_from_user_mode |
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index dcc6987f9bae..95ad40eb7eff 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S | |||
@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat) | |||
306 | */ | 306 | */ |
307 | movl %eax, %eax | 307 | movl %eax, %eax |
308 | 308 | ||
309 | /* Construct struct pt_regs on stack (iret frame is already on stack) */ | ||
310 | pushq %rax /* pt_regs->orig_ax */ | 309 | pushq %rax /* pt_regs->orig_ax */ |
310 | |||
311 | /* switch to thread stack expects orig_ax to be pushed */ | ||
312 | call switch_to_thread_stack | ||
313 | |||
311 | pushq %rdi /* pt_regs->di */ | 314 | pushq %rdi /* pt_regs->di */ |
312 | pushq %rsi /* pt_regs->si */ | 315 | pushq %rsi /* pt_regs->si */ |
313 | pushq %rdx /* pt_regs->dx */ | 316 | pushq %rdx /* pt_regs->dx */ |
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8c6bd6863db9..cbc71e73bd32 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h | |||
@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) | |||
90 | /* This is used when switching tasks or entering/exiting vm86 mode. */ | 90 | /* This is used when switching tasks or entering/exiting vm86 mode. */ |
91 | static inline void update_sp0(struct task_struct *task) | 91 | static inline void update_sp0(struct task_struct *task) |
92 | { | 92 | { |
93 | /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ | ||
93 | #ifdef CONFIG_X86_32 | 94 | #ifdef CONFIG_X86_32 |
94 | load_sp0(task->thread.sp0); | 95 | load_sp0(task->thread.sp0); |
95 | #else | 96 | #else |
96 | load_sp0(task_top_of_stack(task)); | 97 | if (static_cpu_has(X86_FEATURE_XENPV)) |
98 | load_sp0(task_top_of_stack(task)); | ||
97 | #endif | 99 | #endif |
98 | } | 100 | } |
99 | 101 | ||
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1fadd310ff68..31051f35cbb7 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h | |||
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long); | |||
75 | dotraplinkage void do_stack_segment(struct pt_regs *, long); | 75 | dotraplinkage void do_stack_segment(struct pt_regs *, long); |
76 | #ifdef CONFIG_X86_64 | 76 | #ifdef CONFIG_X86_64 |
77 | dotraplinkage void do_double_fault(struct pt_regs *, long); | 77 | dotraplinkage void do_double_fault(struct pt_regs *, long); |
78 | asmlinkage struct pt_regs *sync_regs(struct pt_regs *); | ||
79 | #endif | 78 | #endif |
80 | dotraplinkage void do_general_protection(struct pt_regs *, long); | 79 | dotraplinkage void do_general_protection(struct pt_regs *, long); |
81 | dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); | 80 | dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e5837bd6c672..57968880e39b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -1623,11 +1623,13 @@ void cpu_init(void) | |||
1623 | setup_cpu_entry_area(cpu); | 1623 | setup_cpu_entry_area(cpu); |
1624 | 1624 | ||
1625 | /* | 1625 | /* |
1626 | * Initialize the TSS. Don't bother initializing sp0, as the initial | 1626 | * Initialize the TSS. sp0 points to the entry trampoline stack |
1627 | * task never enters user mode. | 1627 | * regardless of what task is running. |
1628 | */ | 1628 | */ |
1629 | set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); | 1629 | set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); |
1630 | load_TR_desc(); | 1630 | load_TR_desc(); |
1631 | load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + | ||
1632 | offsetofend(struct tss_struct, SYSENTER_stack)); | ||
1631 | 1633 | ||
1632 | load_mm_ldt(&init_mm); | 1634 | load_mm_ldt(&init_mm); |
1633 | 1635 | ||
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f0029d17b14b..ee9ca0ad4388 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -619,14 +619,15 @@ NOKPROBE_SYMBOL(do_int3); | |||
619 | 619 | ||
620 | #ifdef CONFIG_X86_64 | 620 | #ifdef CONFIG_X86_64 |
621 | /* | 621 | /* |
622 | * Help handler running on IST stack to switch off the IST stack if the | 622 | * Help handler running on a per-cpu (IST or entry trampoline) stack |
623 | * interrupted code was in user mode. The actual stack switch is done in | 623 | * to switch to the normal thread stack if the interrupted code was in |
624 | * entry_64.S | 624 | * user mode. The actual stack switch is done in entry_64.S |
625 | */ | 625 | */ |
626 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) | 626 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) |
627 | { | 627 | { |
628 | struct pt_regs *regs = task_pt_regs(current); | 628 | struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; |
629 | *regs = *eregs; | 629 | if (regs != eregs) |
630 | *regs = *eregs; | ||
630 | return regs; | 631 | return regs; |
631 | } | 632 | } |
632 | NOKPROBE_SYMBOL(sync_regs); | 633 | NOKPROBE_SYMBOL(sync_regs); |
@@ -642,13 +643,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) | |||
642 | /* | 643 | /* |
643 | * This is called from entry_64.S early in handling a fault | 644 | * This is called from entry_64.S early in handling a fault |
644 | * caused by a bad iret to user mode. To handle the fault | 645 | * caused by a bad iret to user mode. To handle the fault |
645 | * correctly, we want move our stack frame to task_pt_regs | 646 | * correctly, we want to move our stack frame to where it would |
646 | * and we want to pretend that the exception came from the | 647 | * be had we entered directly on the entry stack (rather than |
647 | * iret target. | 648 | * just below the IRET frame) and we want to pretend that the |
649 | * exception came from the IRET target. | ||
648 | */ | 650 | */ |
649 | struct bad_iret_stack *new_stack = | 651 | struct bad_iret_stack *new_stack = |
650 | container_of(task_pt_regs(current), | 652 | (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; |
651 | struct bad_iret_stack, regs); | ||
652 | 653 | ||
653 | /* Copy the IRET target to the new stack. */ | 654 | /* Copy the IRET target to the new stack. */ |
654 | memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); | 655 | memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); |