aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/entry
diff options
context:
space:
mode:
author Andy Lutomirski <luto@kernel.org> 2016-03-09 22:00:32 -0500
committer Ingo Molnar <mingo@kernel.org> 2016-03-10 03:48:14 -0500
commit 7536656f08d0c1a3b4c487d00785c5186ec6f533 (patch)
tree 7a629298ef8b0acee7e1aa1ff07b7c93bb65d136 /arch/x86/entry
parent 6dcc94149d605908a7c0c4cf2085340637aac86d (diff)
x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup
Right after SYSENTER, we can get a #DB or NMI. On x86_32, there's no IST, so the exception handler is invoked on the temporary SYSENTER stack. Because the SYSENTER stack is very small, we have a fixup to switch off the stack quickly when this happens. The old fixup had several issues:

1. It checked the interrupt frame's CS and EIP. This wasn't obviously correct on Xen or if vm86 mode was in use [1].

2. In the NMI handler, it did some frightening digging into the stack frame. I'm not convinced this digging was correct.

3. The fixup didn't switch stacks and then switch back. Instead, it synthesized a brand new stack frame that would redirect the IRET back to the SYSENTER code. That frame was highly questionable. For one thing, if NMI nested inside #DB, we would effectively abort the #DB prologue, which was probably safe but was frightening. For another, the code used PUSHFL to write the FLAGS portion of the frame, which was simply bogus -- by the time PUSHFL was called, at least TF, NT, VM, and all of the arithmetic flags were clobbered.

Simplify this considerably. Instead of looking at the saved frame to see where we came from, check the hardware ESP register against the SYSENTER stack directly. Malicious user code cannot spoof the kernel ESP register, and by moving the check after SAVE_ALL, we can use normal PER_CPU accesses to find all the relevant addresses.

With this patch applied, the improved syscall_nt_32 test finally passes on 32-bit kernels.

[1] It isn't obviously correct, but it is nonetheless safe from vm86 shenanigans as far as I can tell. A user can't point EIP at entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32, like all kernel addresses, is greater than 0xffff and would thus violate the CS segment limit.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b2cdbc037031c07ecf2c40a96069318aec0e7971.1457578375.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/entry')
-rw-r--r-- arch/x86/entry/entry_32.S 114
1 files changed, 51 insertions, 63 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b2e1d446bdf8..7b3ec24ede82 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -976,51 +976,48 @@ error_code:
976 jmp ret_from_exception 976 jmp ret_from_exception
977END(page_fault) 977END(page_fault)
978 978
979/*
980 * Debug traps and NMI can happen at the one SYSENTER instruction
981 * that sets up the real kernel stack. Check here, since we can't
982 * allow the wrong stack to be used.
983 *
984 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
985 * already pushed 3 words if it hits on the sysenter instruction:
986 * eflags, cs and eip.
987 *
988 * We just load the right stack, and push the three (known) values
989 * by hand onto the new stack - while updating the return eip past
990 * the instruction that would have done it for sysenter.
991 */
992.macro FIX_STACK offset ok label
993 cmpw $__KERNEL_CS, 4(%esp)
994 jne \ok
995\label:
996 movl TSS_sysenter_sp0 + \offset(%esp), %esp
997 pushfl
998 pushl $__KERNEL_CS
999 pushl $sysenter_past_esp
1000.endm
1001
1002ENTRY(debug) 979ENTRY(debug)
980 /*
981 * #DB can happen at the first instruction of
982 * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
983 * happens, then we will be running on a very small stack. We
984 * need to detect this condition and switch to the thread
985 * stack before calling any C code at all.
986 *
987 * If you edit this code, keep in mind that NMIs can happen in here.
988 */
1003 ASM_CLAC 989 ASM_CLAC
1004 cmpl $entry_SYSENTER_32, (%esp)
1005 jne debug_stack_correct
1006 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1007debug_stack_correct:
1008 pushl $-1 # mark this as an int 990 pushl $-1 # mark this as an int
1009 SAVE_ALL 991 SAVE_ALL
1010 TRACE_IRQS_OFF
1011 xorl %edx, %edx # error code 0 992 xorl %edx, %edx # error code 0
1012 movl %esp, %eax # pt_regs pointer 993 movl %esp, %eax # pt_regs pointer
994
995 /* Are we currently on the SYSENTER stack? */
996 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
997 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
998 cmpl $SIZEOF_SYSENTER_stack, %ecx
999 jb .Ldebug_from_sysenter_stack
1000
1001 TRACE_IRQS_OFF
1002 call do_debug
1003 jmp ret_from_exception
1004
1005.Ldebug_from_sysenter_stack:
1006 /* We're on the SYSENTER stack. Switch off. */
1007 movl %esp, %ebp
1008 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1009 TRACE_IRQS_OFF
1013 call do_debug 1010 call do_debug
1011 movl %ebp, %esp
1014 jmp ret_from_exception 1012 jmp ret_from_exception
1015END(debug) 1013END(debug)
1016 1014
1017/* 1015/*
1018 * NMI is doubly nasty. It can happen _while_ we're handling 1016 * NMI is doubly nasty. It can happen on the first instruction of
1019 * a debug fault, and the debug fault hasn't yet been able to 1017 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
1020 * clear up the stack. So we first check whether we got an 1018 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
1021 * NMI on the sysenter entry path, but after that we need to 1019 * switched stacks. We handle both conditions by simply checking whether we
1022 * check whether we got an NMI on the debug path where the debug 1020 * interrupted kernel code running on the SYSENTER stack.
1023 * fault happened on the sysenter path.
1024 */ 1021 */
1025ENTRY(nmi) 1022ENTRY(nmi)
1026 ASM_CLAC 1023 ASM_CLAC
@@ -1031,41 +1028,32 @@ ENTRY(nmi)
1031 popl %eax 1028 popl %eax
1032 je nmi_espfix_stack 1029 je nmi_espfix_stack
1033#endif 1030#endif
1034 cmpl $entry_SYSENTER_32, (%esp) 1031
1035 je nmi_stack_fixup 1032 pushl %eax # pt_regs->orig_ax
1036 pushl %eax
1037 movl %esp, %eax
1038 /*
1039 * Do not access memory above the end of our stack page,
1040 * it might not exist.
1041 */
1042 andl $(THREAD_SIZE-1), %eax
1043 cmpl $(THREAD_SIZE-20), %eax
1044 popl %eax
1045 jae nmi_stack_correct
1046 cmpl $entry_SYSENTER_32, 12(%esp)
1047 je nmi_debug_stack_check
1048nmi_stack_correct:
1049 pushl %eax
1050 SAVE_ALL 1033 SAVE_ALL
1051 xorl %edx, %edx # zero error code 1034 xorl %edx, %edx # zero error code
1052 movl %esp, %eax # pt_regs pointer 1035 movl %esp, %eax # pt_regs pointer
1036
1037 /* Are we currently on the SYSENTER stack? */
1038 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
1039 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
1040 cmpl $SIZEOF_SYSENTER_stack, %ecx
1041 jb .Lnmi_from_sysenter_stack
1042
1043 /* Not on SYSENTER stack. */
1053 call do_nmi 1044 call do_nmi
1054 jmp restore_all_notrace 1045 jmp restore_all_notrace
1055 1046
1056nmi_stack_fixup: 1047.Lnmi_from_sysenter_stack:
1057 FIX_STACK 12, nmi_stack_correct, 1 1048 /*
1058 jmp nmi_stack_correct 1049 * We're on the SYSENTER stack. Switch off. No one (not even debug)
1059 1050 * is using the thread stack right now, so it's safe for us to use it.
1060nmi_debug_stack_check: 1051 */
1061 cmpw $__KERNEL_CS, 16(%esp) 1052 movl %esp, %ebp
1062 jne nmi_stack_correct 1053 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1063 cmpl $debug, (%esp) 1054 call do_nmi
1064 jb nmi_stack_correct 1055 movl %ebp, %esp
1065 cmpl $debug_esp_fix_insn, (%esp) 1056 jmp restore_all_notrace
1066 ja nmi_stack_correct
1067 FIX_STACK 24, nmi_stack_correct, 1
1068 jmp nmi_stack_correct
1069 1057
1070#ifdef CONFIG_X86_ESPFIX32 1058#ifdef CONFIG_X86_ESPFIX32
1071nmi_espfix_stack: 1059nmi_espfix_stack: