author      Linus Torvalds <torvalds@linux-foundation.org>  2016-03-15 12:32:27 -0400
committer   Linus Torvalds <torvalds@linux-foundation.org>  2016-03-15 12:32:27 -0400
commit      ba33ea811e1ff6726abb7f8f96df38c2d7b50304 (patch)
tree        29134e5cc7c19c8e520cb9336b476144d3d1252f /arch/x86/entry
parent      e23604edac2a7be6a8808a5d13fac6b9df4eb9a8 (diff)
parent      d05004944206cbbf1c453e179768163731c7c6f1 (diff)
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm updates from Ingo Molnar:
 "This is another big update. Main changes are:

   - lots of x86 system call (and other traps/exceptions) entry code
     enhancements. In particular the complex parts of the 64-bit entry
     code have been migrated to C code as well, and a number of dusty
     corners have been refreshed. (Andy Lutomirski)

   - vDSO special mapping robustification and general cleanups (Andy
     Lutomirski)

   - cpufeature refactoring, cleanups and speedups (Borislav Petkov)

   - lots of other changes ..."

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  x86/cpufeature: Enable new AVX-512 features
  x86/entry/traps: Show unhandled signal for i386 in do_trap()
  x86/entry: Call enter_from_user_mode() with IRQs off
  x86/entry/32: Change INT80 to be an interrupt gate
  x86/entry: Improve system call entry comments
  x86/entry: Remove TIF_SINGLESTEP entry work
  x86/entry/32: Add and check a stack canary for the SYSENTER stack
  x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup
  x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed
  x86/entry: Vastly simplify SYSENTER TF (single-step) handling
  x86/entry/traps: Clear DR6 early in do_debug() and improve the comment
  x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions
  x86/entry/32: Restore FLAGS on SYSEXIT
  x86/entry/32: Filter NT and speed up AC filtering in SYSENTER
  x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test
  selftests/x86: In syscall_nt, test NT|TF as well
  x86/asm-offsets: Remove PARAVIRT_enabled
  x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled
  uprobes: __create_xol_area() must nullify xol_mapping.fault
  x86/cpufeature: Create a new synthetic cpu capability for machine check recovery
  ...
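The series touches both system call entry paths on x86: the native 64-bit SYSCALL path (entry_SYSCALL_64, now largely dispatched from C via do_syscall_64) and the legacy INT $0x80 path (entry_INT80_32 / entry_INT80_compat). As a stand-alone point of reference (not part of this series), the following user-space C snippet exercises both entries on an x86-64 kernel; it assumes the usual 32-bit getpid number 20, and the INT $0x80 half requires CONFIG_IA32_EMULATION:

#include <stdio.h>
#include <sys/syscall.h>

int main(void)
{
	long native, legacy;

	/* Native 64-bit path: entry_SYSCALL_64. SYSCALL clobbers rcx/r11. */
	asm volatile ("syscall"
		      : "=a" (native)
		      : "a" ((long)SYS_getpid)
		      : "rcx", "r11", "memory");

	/* Legacy path: entry_INT80_compat on 64-bit kernels (32-bit getpid = 20). */
	asm volatile ("int $0x80"
		      : "=a" (legacy)
		      : "a" (20L)
		      : "memory");

	printf("getpid via syscall: %ld, via int $0x80: %ld\n", native, legacy);
	return 0;
}

Both calls should return the same PID; the kernel-side handlers they land in are the ones reworked in the hunks below.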
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--  arch/x86/entry/calling.h                 |  31
-rw-r--r--  arch/x86/entry/common.c                  | 106
-rw-r--r--  arch/x86/entry/entry_32.S                | 268
-rw-r--r--  arch/x86/entry/entry_64.S                | 286
-rw-r--r--  arch/x86/entry/entry_64_compat.S         | 102
-rw-r--r--  arch/x86/entry/syscall_32.c              |  10
-rw-r--r--  arch/x86/entry/syscall_64.c              |  13
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl   |  20
-rw-r--r--  arch/x86/entry/syscalls/syscalltbl.sh    |  58
-rw-r--r--  arch/x86/entry/vdso/vdso2c.h             |   7
-rw-r--r--  arch/x86/entry/vdso/vdso32-setup.c       |   1
-rw-r--r--  arch/x86/entry/vdso/vdso32/system_call.S |   2
-rw-r--r--  arch/x86/entry/vdso/vma.c                | 127
-rw-r--r--  arch/x86/entry/vsyscall/vsyscall_gtod.c  |   9
14 files changed, 566 insertions, 474 deletions
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index e32206e09868..9a9e5884066c 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
201 .byte 0xf1 201 .byte 0xf1
202 .endm 202 .endm
203 203
204#else /* CONFIG_X86_64 */
205
206/*
207 * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
208 * are different from the entry_32.S versions in not changing the segment
209 * registers. So only suitable for in kernel use, not when transitioning
210 * from or to user space. The resulting stack frame is not a standard
211 * pt_regs frame. The main use case is calling C code from assembler
212 * when all the registers need to be preserved.
213 */
214
215 .macro SAVE_ALL
216 pushl %eax
217 pushl %ebp
218 pushl %edi
219 pushl %esi
220 pushl %edx
221 pushl %ecx
222 pushl %ebx
223 .endm
224
225 .macro RESTORE_ALL
226 popl %ebx
227 popl %ecx
228 popl %edx
229 popl %esi
230 popl %edi
231 popl %ebp
232 popl %eax
233 .endm
234
235#endif /* CONFIG_X86_64 */ 204#endif /* CONFIG_X86_64 */
236 205
237/* 206/*
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 03663740c866..e79d93d44ecd 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -26,6 +26,7 @@
26#include <asm/traps.h> 26#include <asm/traps.h>
27#include <asm/vdso.h> 27#include <asm/vdso.h>
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/cpufeature.h>
29 30
30#define CREATE_TRACE_POINTS 31#define CREATE_TRACE_POINTS
31#include <trace/events/syscalls.h> 32#include <trace/events/syscalls.h>
@@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void)
44 CT_WARN_ON(ct_state() != CONTEXT_USER); 45 CT_WARN_ON(ct_state() != CONTEXT_USER);
45 user_exit(); 46 user_exit();
46} 47}
48#else
49static inline void enter_from_user_mode(void) {}
47#endif 50#endif
48 51
49static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) 52static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
84 87
85 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; 88 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
86 89
87#ifdef CONFIG_CONTEXT_TRACKING
88 /*
89 * If TIF_NOHZ is set, we are required to call user_exit() before
90 * doing anything that could touch RCU.
91 */
92 if (work & _TIF_NOHZ) {
93 enter_from_user_mode();
94 work &= ~_TIF_NOHZ;
95 }
96#endif
97
98#ifdef CONFIG_SECCOMP 90#ifdef CONFIG_SECCOMP
99 /* 91 /*
100 * Do seccomp first -- it should minimize exposure of other 92 * Do seccomp first -- it should minimize exposure of other
@@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
171 if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) 163 if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
172 BUG_ON(regs != task_pt_regs(current)); 164 BUG_ON(regs != task_pt_regs(current));
173 165
174 /*
175 * If we stepped into a sysenter/syscall insn, it trapped in
176 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
177 * If user-mode had set TF itself, then it's still clear from
178 * do_debug() and we need to set it again to restore the user
179 * state. If we entered on the slow path, TF was already set.
180 */
181 if (work & _TIF_SINGLESTEP)
182 regs->flags |= X86_EFLAGS_TF;
183
184#ifdef CONFIG_SECCOMP 166#ifdef CONFIG_SECCOMP
185 /* 167 /*
186 * Call seccomp_phase2 before running the other hooks so that 168 * Call seccomp_phase2 before running the other hooks so that
@@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
268/* Called with IRQs disabled. */ 250/* Called with IRQs disabled. */
269__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) 251__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
270{ 252{
253 struct thread_info *ti = pt_regs_to_thread_info(regs);
271 u32 cached_flags; 254 u32 cached_flags;
272 255
273 if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) 256 if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
275 258
276 lockdep_sys_exit(); 259 lockdep_sys_exit();
277 260
278 cached_flags = 261 cached_flags = READ_ONCE(ti->flags);
279 READ_ONCE(pt_regs_to_thread_info(regs)->flags);
280 262
281 if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) 263 if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
282 exit_to_usermode_loop(regs, cached_flags); 264 exit_to_usermode_loop(regs, cached_flags);
283 265
266#ifdef CONFIG_COMPAT
267 /*
268 * Compat syscalls set TS_COMPAT. Make sure we clear it before
269 * returning to user mode. We need to clear it *after* signal
270 * handling, because syscall restart has a fixup for compat
271 * syscalls. The fixup is exercised by the ptrace_syscall_32
272 * selftest.
273 */
274 ti->status &= ~TS_COMPAT;
275#endif
276
284 user_enter(); 277 user_enter();
285} 278}
286 279
@@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
332 if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) 325 if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
333 syscall_slow_exit_work(regs, cached_flags); 326 syscall_slow_exit_work(regs, cached_flags);
334 327
335#ifdef CONFIG_COMPAT 328 local_irq_disable();
329 prepare_exit_to_usermode(regs);
330}
331
332#ifdef CONFIG_X86_64
333__visible void do_syscall_64(struct pt_regs *regs)
334{
335 struct thread_info *ti = pt_regs_to_thread_info(regs);
336 unsigned long nr = regs->orig_ax;
337
338 enter_from_user_mode();
339 local_irq_enable();
340
341 if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
342 nr = syscall_trace_enter(regs);
343
336 /* 344 /*
337 * Compat syscalls set TS_COMPAT. Make sure we clear it before 345 * NB: Native and x32 syscalls are dispatched from the same
338 * returning to user mode. 346 * table. The only functional difference is the x32 bit in
347 * regs->orig_ax, which changes the behavior of some syscalls.
339 */ 348 */
340 ti->status &= ~TS_COMPAT; 349 if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
341#endif 350 regs->ax = sys_call_table[nr & __SYSCALL_MASK](
351 regs->di, regs->si, regs->dx,
352 regs->r10, regs->r8, regs->r9);
353 }
342 354
343 local_irq_disable(); 355 syscall_return_slowpath(regs);
344 prepare_exit_to_usermode(regs);
345} 356}
357#endif
346 358
347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 359#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
348/* 360/*
349 * Does a 32-bit syscall. Called with IRQs on and does all entry and 361 * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
350 * exit work and returns with IRQs off. This function is extremely hot 362 * all entry and exit work and returns with IRQs off. This function is
351 * in workloads that use it, and it's usually called from 363 * extremely hot in workloads that use it, and it's usually called from
352 * do_fast_syscall_32, so forcibly inline it to improve performance. 364 * do_fast_syscall_32, so forcibly inline it to improve performance.
353 */ 365 */
354#ifdef CONFIG_X86_32 366static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
355/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
356__visible
357#else
358/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
359static
360#endif
361__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
362{ 367{
363 struct thread_info *ti = pt_regs_to_thread_info(regs); 368 struct thread_info *ti = pt_regs_to_thread_info(regs);
364 unsigned int nr = (unsigned int)regs->orig_ax; 369 unsigned int nr = (unsigned int)regs->orig_ax;
@@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
393 syscall_return_slowpath(regs); 398 syscall_return_slowpath(regs);
394} 399}
395 400
396#ifdef CONFIG_X86_64 401/* Handles int $0x80 */
397/* Handles INT80 on 64-bit kernels */ 402__visible void do_int80_syscall_32(struct pt_regs *regs)
398__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
399{ 403{
404 enter_from_user_mode();
400 local_irq_enable(); 405 local_irq_enable();
401 do_syscall_32_irqs_on(regs); 406 do_syscall_32_irqs_on(regs);
402} 407}
403#endif
404 408
405/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ 409/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
406__visible long do_fast_syscall_32(struct pt_regs *regs) 410__visible long do_fast_syscall_32(struct pt_regs *regs)
@@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
420 */ 424 */
421 regs->ip = landing_pad; 425 regs->ip = landing_pad;
422 426
423 /* 427 enter_from_user_mode();
424 * Fetch EBP from where the vDSO stashed it. 428
425 *
426 * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
427 */
428 local_irq_enable(); 429 local_irq_enable();
430
431 /* Fetch EBP from where the vDSO stashed it. */
429 if ( 432 if (
430#ifdef CONFIG_X86_64 433#ifdef CONFIG_X86_64
431 /* 434 /*
@@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
443 /* User code screwed up. */ 446 /* User code screwed up. */
444 local_irq_disable(); 447 local_irq_disable();
445 regs->ax = -EFAULT; 448 regs->ax = -EFAULT;
446#ifdef CONFIG_CONTEXT_TRACKING
447 enter_from_user_mode();
448#endif
449 prepare_exit_to_usermode(regs); 449 prepare_exit_to_usermode(regs);
450 return 0; /* Keep it simple: use IRET. */ 450 return 0; /* Keep it simple: use IRET. */
451 } 451 }
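The new do_syscall_64() above dispatches through sys_call_table after masking the number with __SYSCALL_MASK (so native and x32 calls share one table) and bounds-checking it against NR_syscalls, with -ENOSYS left in pt_regs->ax when the number is out of range. A minimal user-space mock of that dispatch pattern, compilable on its own (all mock_* names and values are invented for illustration and are not kernel symbols):

#include <stdio.h>

typedef long (*mock_syscall_fn)(long, long, long, long, long, long);

static long mock_sys_getpid(long a, long b, long c, long d, long e, long f)
{
	(void)a; (void)b; (void)c; (void)d; (void)e; (void)f;
	return 1234;
}

#define MOCK_SYSCALL_MASK	0x3ff	/* stands in for __SYSCALL_MASK */
#define MOCK_NR_syscalls	1	/* stands in for NR_syscalls */

static const mock_syscall_fn mock_sys_call_table[MOCK_NR_syscalls] = {
	[0] = mock_sys_getpid,
};

static long mock_do_syscall_64(unsigned long nr, const long args[6])
{
	long ax = -38;	/* -ENOSYS, mirroring the pre-loaded pt_regs->ax */

	if ((nr & MOCK_SYSCALL_MASK) < MOCK_NR_syscalls)
		ax = mock_sys_call_table[nr & MOCK_SYSCALL_MASK](
			args[0], args[1], args[2], args[3], args[4], args[5]);
	return ax;
}

int main(void)
{
	const long args[6] = { 0 };

	printf("nr 0 -> %ld\n", mock_do_syscall_64(0, args));	/* 1234 */
	printf("nr 7 -> %ld\n", mock_do_syscall_64(7, args));	/* -38: out of range */
	return 0;
}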
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index bb3e376d0f33..10868aa734dc 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -40,7 +40,7 @@
40#include <asm/processor-flags.h> 40#include <asm/processor-flags.h>
41#include <asm/ftrace.h> 41#include <asm/ftrace.h>
42#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
43#include <asm/cpufeature.h> 43#include <asm/cpufeatures.h>
44#include <asm/alternative-asm.h> 44#include <asm/alternative-asm.h>
45#include <asm/asm.h> 45#include <asm/asm.h>
46#include <asm/smap.h> 46#include <asm/smap.h>
@@ -287,14 +287,64 @@ need_resched:
287END(resume_kernel) 287END(resume_kernel)
288#endif 288#endif
289 289
290 # SYSENTER call handler stub 290GLOBAL(__begin_SYSENTER_singlestep_region)
291/*
292 * All code from here through __end_SYSENTER_singlestep_region is subject
293 * to being single-stepped if a user program sets TF and executes SYSENTER.
294 * There is absolutely nothing that we can do to prevent this from happening
295 * (thanks Intel!). To keep our handling of this situation as simple as
296 * possible, we handle TF just like AC and NT, except that our #DB handler
297 * will ignore all of the single-step traps generated in this range.
298 */
299
300#ifdef CONFIG_XEN
301/*
302 * Xen doesn't set %esp to be precisely what the normal SYSENTER
303 * entry point expects, so fix it up before using the normal path.
304 */
305ENTRY(xen_sysenter_target)
306 addl $5*4, %esp /* remove xen-provided frame */
307 jmp sysenter_past_esp
308#endif
309
310/*
311 * 32-bit SYSENTER entry.
312 *
313 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
314 * if X86_FEATURE_SEP is available. This is the preferred system call
315 * entry on 32-bit systems.
316 *
317 * The SYSENTER instruction, in principle, should *only* occur in the
318 * vDSO. In practice, a small number of Android devices were shipped
319 * with a copy of Bionic that inlined a SYSENTER instruction. This
320 * never happened in any of Google's Bionic versions -- it only happened
321 * in a narrow range of Intel-provided versions.
322 *
323 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
324 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
325 * SYSENTER does not save anything on the stack,
326 * and does not save old EIP (!!!), ESP, or EFLAGS.
327 *
328 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
329 * user and/or vm86 state), we explicitly disable the SYSENTER
330 * instruction in vm86 mode by reprogramming the MSRs.
331 *
332 * Arguments:
333 * eax system call number
334 * ebx arg1
335 * ecx arg2
336 * edx arg3
337 * esi arg4
338 * edi arg5
339 * ebp user stack
340 * 0(%ebp) arg6
341 */
291ENTRY(entry_SYSENTER_32) 342ENTRY(entry_SYSENTER_32)
292 movl TSS_sysenter_sp0(%esp), %esp 343 movl TSS_sysenter_sp0(%esp), %esp
293sysenter_past_esp: 344sysenter_past_esp:
294 pushl $__USER_DS /* pt_regs->ss */ 345 pushl $__USER_DS /* pt_regs->ss */
295 pushl %ebp /* pt_regs->sp (stashed in bp) */ 346 pushl %ebp /* pt_regs->sp (stashed in bp) */
296 pushfl /* pt_regs->flags (except IF = 0) */ 347 pushfl /* pt_regs->flags (except IF = 0) */
297 ASM_CLAC /* Clear AC after saving FLAGS */
298 orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ 348 orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
299 pushl $__USER_CS /* pt_regs->cs */ 349 pushl $__USER_CS /* pt_regs->cs */
300 pushl $0 /* pt_regs->ip = 0 (placeholder) */ 350 pushl $0 /* pt_regs->ip = 0 (placeholder) */
@@ -302,6 +352,29 @@ sysenter_past_esp:
302 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 352 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
303 353
304 /* 354 /*
355 * SYSENTER doesn't filter flags, so we need to clear NT, AC
356 * and TF ourselves. To save a few cycles, we can check whether
357 * either was set instead of doing an unconditional popfq.
358 * This needs to happen before enabling interrupts so that
359 * we don't get preempted with NT set.
360 *
361 * If TF is set, we will single-step all the way to here -- do_debug
362 * will ignore all the traps. (Yes, this is slow, but so is
363 * single-stepping in general. This allows us to avoid having
364 * a more complicated code to handle the case where a user program
365 * forces us to single-step through the SYSENTER entry code.)
366 *
367 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
368 * out-of-line as an optimization: NT is unlikely to be set in the
369 * majority of the cases and instead of polluting the I$ unnecessarily,
370 * we're keeping that code behind a branch which will predict as
371 * not-taken and therefore its instructions won't be fetched.
372 */
373 testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
374 jnz .Lsysenter_fix_flags
375.Lsysenter_flags_fixed:
376
377 /*
305 * User mode is traced as though IRQs are on, and SYSENTER 378 * User mode is traced as though IRQs are on, and SYSENTER
306 * turned them off. 379 * turned them off.
307 */ 380 */
@@ -327,6 +400,15 @@ sysenter_past_esp:
327 popl %eax /* pt_regs->ax */ 400 popl %eax /* pt_regs->ax */
328 401
329 /* 402 /*
403 * Restore all flags except IF. (We restore IF separately because
404 * STI gives a one-instruction window in which we won't be interrupted,
405 * whereas POPF does not.)
406 */
407 addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
408 btr $X86_EFLAGS_IF_BIT, (%esp)
409 popfl
410
411 /*
330 * Return back to the vDSO, which will pop ecx and edx. 412 * Return back to the vDSO, which will pop ecx and edx.
331 * Don't bother with DS and ES (they already contain __USER_DS). 413 * Don't bother with DS and ES (they already contain __USER_DS).
332 */ 414 */
@@ -339,28 +421,63 @@ sysenter_past_esp:
339.popsection 421.popsection
340 _ASM_EXTABLE(1b, 2b) 422 _ASM_EXTABLE(1b, 2b)
341 PTGS_TO_GS_EX 423 PTGS_TO_GS_EX
424
425.Lsysenter_fix_flags:
426 pushl $X86_EFLAGS_FIXED
427 popfl
428 jmp .Lsysenter_flags_fixed
429GLOBAL(__end_SYSENTER_singlestep_region)
342ENDPROC(entry_SYSENTER_32) 430ENDPROC(entry_SYSENTER_32)
343 431
344 # system call handler stub 432/*
433 * 32-bit legacy system call entry.
434 *
435 * 32-bit x86 Linux system calls traditionally used the INT $0x80
436 * instruction. INT $0x80 lands here.
437 *
438 * This entry point can be used by any 32-bit program to perform system calls.
439 * Instances of INT $0x80 can be found inline in various programs and
440 * libraries. It is also used by the vDSO's __kernel_vsyscall
441 * fallback for hardware that doesn't support a faster entry method.
442 * Restarted 32-bit system calls also fall back to INT $0x80
443 * regardless of what instruction was originally used to do the system
444 * call. (64-bit programs can use INT $0x80 as well, but they can
445 * only run on 64-bit kernels and therefore land in
446 * entry_INT80_compat.)
447 *
448 * This is considered a slow path. It is not used by most libc
449 * implementations on modern hardware except during process startup.
450 *
451 * Arguments:
452 * eax system call number
453 * ebx arg1
454 * ecx arg2
455 * edx arg3
456 * esi arg4
457 * edi arg5
458 * ebp arg6
459 */
345ENTRY(entry_INT80_32) 460ENTRY(entry_INT80_32)
346 ASM_CLAC 461 ASM_CLAC
347 pushl %eax /* pt_regs->orig_ax */ 462 pushl %eax /* pt_regs->orig_ax */
348 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 463 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
349 464
350 /* 465 /*
351 * User mode is traced as though IRQs are on. Unlike the 64-bit 466 * User mode is traced as though IRQs are on, and the interrupt gate
352 * case, INT80 is a trap gate on 32-bit kernels, so interrupts 467 * turned them off.
353 * are already on (unless user code is messing around with iopl).
354 */ 468 */
469 TRACE_IRQS_OFF
355 470
356 movl %esp, %eax 471 movl %esp, %eax
357 call do_syscall_32_irqs_on 472 call do_int80_syscall_32
358.Lsyscall_32_done: 473.Lsyscall_32_done:
359 474
360restore_all: 475restore_all:
361 TRACE_IRQS_IRET 476 TRACE_IRQS_IRET
362restore_all_notrace: 477restore_all_notrace:
363#ifdef CONFIG_X86_ESPFIX32 478#ifdef CONFIG_X86_ESPFIX32
479 ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX
480
364 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 481 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
365 /* 482 /*
366 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we 483 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
@@ -387,19 +504,6 @@ ENTRY(iret_exc )
387 504
388#ifdef CONFIG_X86_ESPFIX32 505#ifdef CONFIG_X86_ESPFIX32
389ldt_ss: 506ldt_ss:
390#ifdef CONFIG_PARAVIRT
391 /*
392 * The kernel can't run on a non-flat stack if paravirt mode
393 * is active. Rather than try to fixup the high bits of
394 * ESP, bypass this code entirely. This may break DOSemu
395 * and/or Wine support in a paravirt VM, although the option
396 * is still available to implement the setting of the high
397 * 16-bits in the INTERRUPT_RETURN paravirt-op.
398 */
399 cmpl $0, pv_info+PARAVIRT_enabled
400 jne restore_nocheck
401#endif
402
403/* 507/*
404 * Setup and switch to ESPFIX stack 508 * Setup and switch to ESPFIX stack
405 * 509 *
@@ -632,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
632END(spurious_interrupt_bug) 736END(spurious_interrupt_bug)
633 737
634#ifdef CONFIG_XEN 738#ifdef CONFIG_XEN
635/*
636 * Xen doesn't set %esp to be precisely what the normal SYSENTER
637 * entry point expects, so fix it up before using the normal path.
638 */
639ENTRY(xen_sysenter_target)
640 addl $5*4, %esp /* remove xen-provided frame */
641 jmp sysenter_past_esp
642
643ENTRY(xen_hypervisor_callback) 739ENTRY(xen_hypervisor_callback)
644 pushl $-1 /* orig_ax = -1 => not a system call */ 740 pushl $-1 /* orig_ax = -1 => not a system call */
645 SAVE_ALL 741 SAVE_ALL
@@ -939,51 +1035,48 @@ error_code:
939 jmp ret_from_exception 1035 jmp ret_from_exception
940END(page_fault) 1036END(page_fault)
941 1037
942/*
943 * Debug traps and NMI can happen at the one SYSENTER instruction
944 * that sets up the real kernel stack. Check here, since we can't
945 * allow the wrong stack to be used.
946 *
947 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
948 * already pushed 3 words if it hits on the sysenter instruction:
949 * eflags, cs and eip.
950 *
951 * We just load the right stack, and push the three (known) values
952 * by hand onto the new stack - while updating the return eip past
953 * the instruction that would have done it for sysenter.
954 */
955.macro FIX_STACK offset ok label
956 cmpw $__KERNEL_CS, 4(%esp)
957 jne \ok
958\label:
959 movl TSS_sysenter_sp0 + \offset(%esp), %esp
960 pushfl
961 pushl $__KERNEL_CS
962 pushl $sysenter_past_esp
963.endm
964
965ENTRY(debug) 1038ENTRY(debug)
1039 /*
1040 * #DB can happen at the first instruction of
1041 * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
1042 * happens, then we will be running on a very small stack. We
1043 * need to detect this condition and switch to the thread
1044 * stack before calling any C code at all.
1045 *
1046 * If you edit this code, keep in mind that NMIs can happen in here.
1047 */
966 ASM_CLAC 1048 ASM_CLAC
967 cmpl $entry_SYSENTER_32, (%esp)
968 jne debug_stack_correct
969 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
970debug_stack_correct:
971 pushl $-1 # mark this as an int 1049 pushl $-1 # mark this as an int
972 SAVE_ALL 1050 SAVE_ALL
973 TRACE_IRQS_OFF
974 xorl %edx, %edx # error code 0 1051 xorl %edx, %edx # error code 0
975 movl %esp, %eax # pt_regs pointer 1052 movl %esp, %eax # pt_regs pointer
1053
1054 /* Are we currently on the SYSENTER stack? */
1055 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
1056 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
1057 cmpl $SIZEOF_SYSENTER_stack, %ecx
1058 jb .Ldebug_from_sysenter_stack
1059
1060 TRACE_IRQS_OFF
1061 call do_debug
1062 jmp ret_from_exception
1063
1064.Ldebug_from_sysenter_stack:
1065 /* We're on the SYSENTER stack. Switch off. */
1066 movl %esp, %ebp
1067 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1068 TRACE_IRQS_OFF
976 call do_debug 1069 call do_debug
1070 movl %ebp, %esp
977 jmp ret_from_exception 1071 jmp ret_from_exception
978END(debug) 1072END(debug)
979 1073
980/* 1074/*
981 * NMI is doubly nasty. It can happen _while_ we're handling 1075 * NMI is doubly nasty. It can happen on the first instruction of
982 * a debug fault, and the debug fault hasn't yet been able to 1076 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
983 * clear up the stack. So we first check whether we got an 1077 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
984 * NMI on the sysenter entry path, but after that we need to 1078 * switched stacks. We handle both conditions by simply checking whether we
985 * check whether we got an NMI on the debug path where the debug 1079 * interrupted kernel code running on the SYSENTER stack.
986 * fault happened on the sysenter path.
987 */ 1080 */
988ENTRY(nmi) 1081ENTRY(nmi)
989 ASM_CLAC 1082 ASM_CLAC
@@ -994,41 +1087,32 @@ ENTRY(nmi)
994 popl %eax 1087 popl %eax
995 je nmi_espfix_stack 1088 je nmi_espfix_stack
996#endif 1089#endif
997 cmpl $entry_SYSENTER_32, (%esp) 1090
998 je nmi_stack_fixup 1091 pushl %eax # pt_regs->orig_ax
999 pushl %eax
1000 movl %esp, %eax
1001 /*
1002 * Do not access memory above the end of our stack page,
1003 * it might not exist.
1004 */
1005 andl $(THREAD_SIZE-1), %eax
1006 cmpl $(THREAD_SIZE-20), %eax
1007 popl %eax
1008 jae nmi_stack_correct
1009 cmpl $entry_SYSENTER_32, 12(%esp)
1010 je nmi_debug_stack_check
1011nmi_stack_correct:
1012 pushl %eax
1013 SAVE_ALL 1092 SAVE_ALL
1014 xorl %edx, %edx # zero error code 1093 xorl %edx, %edx # zero error code
1015 movl %esp, %eax # pt_regs pointer 1094 movl %esp, %eax # pt_regs pointer
1095
1096 /* Are we currently on the SYSENTER stack? */
1097 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
1098 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
1099 cmpl $SIZEOF_SYSENTER_stack, %ecx
1100 jb .Lnmi_from_sysenter_stack
1101
1102 /* Not on SYSENTER stack. */
1016 call do_nmi 1103 call do_nmi
1017 jmp restore_all_notrace 1104 jmp restore_all_notrace
1018 1105
1019nmi_stack_fixup: 1106.Lnmi_from_sysenter_stack:
1020 FIX_STACK 12, nmi_stack_correct, 1 1107 /*
1021 jmp nmi_stack_correct 1108 * We're on the SYSENTER stack. Switch off. No one (not even debug)
1022 1109 * is using the thread stack right now, so it's safe for us to use it.
1023nmi_debug_stack_check: 1110 */
1024 cmpw $__KERNEL_CS, 16(%esp) 1111 movl %esp, %ebp
1025 jne nmi_stack_correct 1112 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1026 cmpl $debug, (%esp) 1113 call do_nmi
1027 jb nmi_stack_correct 1114 movl %ebp, %esp
1028 cmpl $debug_esp_fix_insn, (%esp) 1115 jmp restore_all_notrace
1029 ja nmi_stack_correct
1030 FIX_STACK 24, nmi_stack_correct, 1
1031 jmp nmi_stack_correct
1032 1116
1033#ifdef CONFIG_X86_ESPFIX32 1117#ifdef CONFIG_X86_ESPFIX32
1034nmi_espfix_stack: 1118nmi_espfix_stack:
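Both the reworked #DB and NMI paths above answer "did we interrupt code running on the SYSENTER stack?" with a single unsigned comparison: they compute (end of SYSENTER_stack) - %esp and treat a result below SIZEOF_SYSENTER_stack as a hit, then switch to the thread stack before calling C code. A small stand-alone C illustration of that range-check idiom (the addresses and sizes are made up):

#include <stdio.h>
#include <stdint.h>

/* Nonzero when sp lies within the stack_size bytes ending at stack_end. */
static int on_sysenter_stack(uintptr_t sp, uintptr_t stack_end, uintptr_t stack_size)
{
	/* One unsigned compare, like the asm's subl/cmpl/jb sequence. */
	return (stack_end - sp) < stack_size;
}

int main(void)
{
	uintptr_t end = 0x1000, size = 0x200;	/* mock stack just below 0x1000 */

	printf("%d\n", on_sysenter_stack(0x0f80, end, size));	/* 1: inside          */
	printf("%d\n", on_sysenter_stack(0x0c00, end, size));	/* 0: below the stack */
	printf("%d\n", on_sysenter_stack(0x2000, end, size));	/* 0: above (wraps)   */
	return 0;
}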
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9d34d3cfceb6..858b555e274b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
103/* 103/*
104 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. 104 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
105 * 105 *
106 * This is the only entry point used for 64-bit system calls. The
107 * hardware interface is reasonably well designed and the register to
108 * argument mapping Linux uses fits well with the registers that are
109 * available when SYSCALL is used.
110 *
111 * SYSCALL instructions can be found inlined in libc implementations as
112 * well as some other programs and libraries. There are also a handful
113 * of SYSCALL instructions in the vDSO used, for example, as a
114 * clock_gettimeofday fallback.
115 *
106 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, 116 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
107 * then loads new ss, cs, and rip from previously programmed MSRs. 117 * then loads new ss, cs, and rip from previously programmed MSRs.
108 * rflags gets masked by a value from another MSR (so CLD and CLAC 118 * rflags gets masked by a value from another MSR (so CLD and CLAC
@@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
145 movq %rsp, PER_CPU_VAR(rsp_scratch) 155 movq %rsp, PER_CPU_VAR(rsp_scratch)
146 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 156 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
147 157
158 TRACE_IRQS_OFF
159
148 /* Construct struct pt_regs on stack */ 160 /* Construct struct pt_regs on stack */
149 pushq $__USER_DS /* pt_regs->ss */ 161 pushq $__USER_DS /* pt_regs->ss */
150 pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ 162 pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
151 /*
152 * Re-enable interrupts.
153 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
154 * must execute atomically in the face of possible interrupt-driven
155 * task preemption. We must enable interrupts only after we're done
156 * with using rsp_scratch:
157 */
158 ENABLE_INTERRUPTS(CLBR_NONE)
159 pushq %r11 /* pt_regs->flags */ 163 pushq %r11 /* pt_regs->flags */
160 pushq $__USER_CS /* pt_regs->cs */ 164 pushq $__USER_CS /* pt_regs->cs */
161 pushq %rcx /* pt_regs->ip */ 165 pushq %rcx /* pt_regs->ip */
@@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
171 pushq %r11 /* pt_regs->r11 */ 175 pushq %r11 /* pt_regs->r11 */
172 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ 176 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
173 177
174 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 178 /*
175 jnz tracesys 179 * If we need to do entry work or if we guess we'll need to do
180 * exit work, go straight to the slow path.
181 */
182 testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
183 jnz entry_SYSCALL64_slow_path
184
176entry_SYSCALL_64_fastpath: 185entry_SYSCALL_64_fastpath:
186 /*
187 * Easy case: enable interrupts and issue the syscall. If the syscall
188 * needs pt_regs, we'll call a stub that disables interrupts again
189 * and jumps to the slow path.
190 */
191 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS(CLBR_NONE)
177#if __SYSCALL_MASK == ~0 193#if __SYSCALL_MASK == ~0
178 cmpq $__NR_syscall_max, %rax 194 cmpq $__NR_syscall_max, %rax
179#else 195#else
@@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath:
182#endif 198#endif
183 ja 1f /* return -ENOSYS (already in pt_regs->ax) */ 199 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
184 movq %r10, %rcx 200 movq %r10, %rcx
201
202 /*
203 * This call instruction is handled specially in stub_ptregs_64.
204 * It might end up jumping to the slow path. If it jumps, RAX
205 * and all argument registers are clobbered.
206 */
185 call *sys_call_table(, %rax, 8) 207 call *sys_call_table(, %rax, 8)
208.Lentry_SYSCALL_64_after_fastpath_call:
209
186 movq %rax, RAX(%rsp) 210 movq %rax, RAX(%rsp)
1871: 2111:
188/*
189 * Syscall return path ending with SYSRET (fast path).
190 * Has incompletely filled pt_regs.
191 */
192 LOCKDEP_SYS_EXIT
193 /*
194 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
195 * it is too small to ever cause noticeable irq latency.
196 */
197 DISABLE_INTERRUPTS(CLBR_NONE)
198 212
199 /* 213 /*
200 * We must check ti flags with interrupts (or at least preemption) 214 * If we get here, then we know that pt_regs is clean for SYSRET64.
201 * off because we must *never* return to userspace without 215 * If we see that no exit work is required (which we are required
202 * processing exit work that is enqueued if we're preempted here. 216 * to check with IRQs off), then we can go straight to SYSRET64.
203 * In particular, returning to userspace with any of the one-shot
204 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
205 * very bad.
206 */ 217 */
218 DISABLE_INTERRUPTS(CLBR_NONE)
219 TRACE_IRQS_OFF
207 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 220 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
208 jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ 221 jnz 1f
209 222
210 RESTORE_C_REGS_EXCEPT_RCX_R11 223 LOCKDEP_SYS_EXIT
224 TRACE_IRQS_ON /* user mode is traced as IRQs on */
211 movq RIP(%rsp), %rcx 225 movq RIP(%rsp), %rcx
212 movq EFLAGS(%rsp), %r11 226 movq EFLAGS(%rsp), %r11
227 RESTORE_C_REGS_EXCEPT_RCX_R11
213 movq RSP(%rsp), %rsp 228 movq RSP(%rsp), %rsp
214 /*
215 * 64-bit SYSRET restores rip from rcx,
216 * rflags from r11 (but RF and VM bits are forced to 0),
217 * cs and ss are loaded from MSRs.
218 * Restoration of rflags re-enables interrupts.
219 *
220 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
221 * descriptor is not reinitialized. This means that we should
222 * avoid SYSRET with SS == NULL, which could happen if we schedule,
223 * exit the kernel, and re-enter using an interrupt vector. (All
224 * interrupt entries on x86_64 set SS to NULL.) We prevent that
225 * from happening by reloading SS in __switch_to. (Actually
226 * detecting the failure in 64-bit userspace is tricky but can be
227 * done.)
228 */
229 USERGS_SYSRET64 229 USERGS_SYSRET64
230 230
231GLOBAL(int_ret_from_sys_call_irqs_off) 2311:
232 /*
233 * The fast path looked good when we started, but something changed
234 * along the way and we need to switch to the slow path. Calling
235 * raise(3) will trigger this, for example. IRQs are off.
236 */
232 TRACE_IRQS_ON 237 TRACE_IRQS_ON
233 ENABLE_INTERRUPTS(CLBR_NONE) 238 ENABLE_INTERRUPTS(CLBR_NONE)
234 jmp int_ret_from_sys_call
235
236 /* Do syscall entry tracing */
237tracesys:
238 movq %rsp, %rdi
239 movl $AUDIT_ARCH_X86_64, %esi
240 call syscall_trace_enter_phase1
241 test %rax, %rax
242 jnz tracesys_phase2 /* if needed, run the slow path */
243 RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
244 movq ORIG_RAX(%rsp), %rax
245 jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
246
247tracesys_phase2:
248 SAVE_EXTRA_REGS 239 SAVE_EXTRA_REGS
249 movq %rsp, %rdi 240 movq %rsp, %rdi
250 movl $AUDIT_ARCH_X86_64, %esi 241 call syscall_return_slowpath /* returns with IRQs disabled */
251 movq %rax, %rdx 242 jmp return_from_SYSCALL_64
252 call syscall_trace_enter_phase2
253
254 /*
255 * Reload registers from stack in case ptrace changed them.
256 * We don't reload %rax because syscall_trace_entry_phase2() returned
257 * the value it wants us to use in the table lookup.
258 */
259 RESTORE_C_REGS_EXCEPT_RAX
260 RESTORE_EXTRA_REGS
261#if __SYSCALL_MASK == ~0
262 cmpq $__NR_syscall_max, %rax
263#else
264 andl $__SYSCALL_MASK, %eax
265 cmpl $__NR_syscall_max, %eax
266#endif
267 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
268 movq %r10, %rcx /* fixup for C */
269 call *sys_call_table(, %rax, 8)
270 movq %rax, RAX(%rsp)
2711:
272 /* Use IRET because user could have changed pt_regs->foo */
273 243
274/* 244entry_SYSCALL64_slow_path:
275 * Syscall return path ending with IRET. 245 /* IRQs are off. */
276 * Has correct iret frame.
277 */
278GLOBAL(int_ret_from_sys_call)
279 SAVE_EXTRA_REGS 246 SAVE_EXTRA_REGS
280 movq %rsp, %rdi 247 movq %rsp, %rdi
281 call syscall_return_slowpath /* returns with IRQs disabled */ 248 call do_syscall_64 /* returns with IRQs disabled */
249
250return_from_SYSCALL_64:
282 RESTORE_EXTRA_REGS 251 RESTORE_EXTRA_REGS
283 TRACE_IRQS_IRETQ /* we're about to change IF */ 252 TRACE_IRQS_IRETQ /* we're about to change IF */
284 253
@@ -355,83 +324,45 @@ opportunistic_sysret_failed:
355 jmp restore_c_regs_and_iret 324 jmp restore_c_regs_and_iret
356END(entry_SYSCALL_64) 325END(entry_SYSCALL_64)
357 326
327ENTRY(stub_ptregs_64)
328 /*
329 * Syscalls marked as needing ptregs land here.
330 * If we are on the fast path, we need to save the extra regs,
331 * which we achieve by trying again on the slow path. If we are on
332 * the slow path, the extra regs are already saved.
333 *
334 * RAX stores a pointer to the C function implementing the syscall.
335 * IRQs are on.
336 */
337 cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
338 jne 1f
358 339
359 .macro FORK_LIKE func
360ENTRY(stub_\func)
361 SAVE_EXTRA_REGS 8
362 jmp sys_\func
363END(stub_\func)
364 .endm
365
366 FORK_LIKE clone
367 FORK_LIKE fork
368 FORK_LIKE vfork
369
370ENTRY(stub_execve)
371 call sys_execve
372return_from_execve:
373 testl %eax, %eax
374 jz 1f
375 /* exec failed, can use fast SYSRET code path in this case */
376 ret
3771:
378 /* must use IRET code path (pt_regs->cs may have changed) */
379 addq $8, %rsp
380 ZERO_EXTRA_REGS
381 movq %rax, RAX(%rsp)
382 jmp int_ret_from_sys_call
383END(stub_execve)
384/*
385 * Remaining execve stubs are only 7 bytes long.
386 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
387 */
388 .align 8
389GLOBAL(stub_execveat)
390 call sys_execveat
391 jmp return_from_execve
392END(stub_execveat)
393
394#if defined(CONFIG_X86_X32_ABI)
395 .align 8
396GLOBAL(stub_x32_execve)
397 call compat_sys_execve
398 jmp return_from_execve
399END(stub_x32_execve)
400 .align 8
401GLOBAL(stub_x32_execveat)
402 call compat_sys_execveat
403 jmp return_from_execve
404END(stub_x32_execveat)
405#endif
406
407/*
408 * sigreturn is special because it needs to restore all registers on return.
409 * This cannot be done with SYSRET, so use the IRET return path instead.
410 */
411ENTRY(stub_rt_sigreturn)
412 /* 340 /*
413 * SAVE_EXTRA_REGS result is not normally needed: 341 * Called from fast path -- disable IRQs again, pop return address
414 * sigreturn overwrites all pt_regs->GPREGS. 342 * and jump to slow path
415 * But sigreturn can fail (!), and there is no easy way to detect that.
416 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
417 * we SAVE_EXTRA_REGS here.
418 */ 343 */
419 SAVE_EXTRA_REGS 8 344 DISABLE_INTERRUPTS(CLBR_NONE)
420 call sys_rt_sigreturn 345 TRACE_IRQS_OFF
421return_from_stub: 346 popq %rax
422 addq $8, %rsp 347 jmp entry_SYSCALL64_slow_path
423 RESTORE_EXTRA_REGS
424 movq %rax, RAX(%rsp)
425 jmp int_ret_from_sys_call
426END(stub_rt_sigreturn)
427 348
428#ifdef CONFIG_X86_X32_ABI 3491:
429ENTRY(stub_x32_rt_sigreturn) 350 /* Called from C */
430 SAVE_EXTRA_REGS 8 351 jmp *%rax /* called from C */
431 call sys32_x32_rt_sigreturn 352END(stub_ptregs_64)
432 jmp return_from_stub 353
433END(stub_x32_rt_sigreturn) 354.macro ptregs_stub func
434#endif 355ENTRY(ptregs_\func)
356 leaq \func(%rip), %rax
357 jmp stub_ptregs_64
358END(ptregs_\func)
359.endm
360
361/* Instantiate ptregs_stub for each ptregs-using syscall */
362#define __SYSCALL_64_QUAL_(sym)
363#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
364#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
365#include <asm/syscalls_64.h>
435 366
436/* 367/*
437 * A newly forked process directly context switches into this address. 368 * A newly forked process directly context switches into this address.
@@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn)
439 * rdi: prev task we switched from 370 * rdi: prev task we switched from
440 */ 371 */
441ENTRY(ret_from_fork) 372ENTRY(ret_from_fork)
442
443 LOCK ; btr $TIF_FORK, TI_flags(%r8) 373 LOCK ; btr $TIF_FORK, TI_flags(%r8)
444 374
445 pushq $0x0002 375 pushq $0x0002
@@ -447,28 +377,32 @@ ENTRY(ret_from_fork)
447 377
448 call schedule_tail /* rdi: 'prev' task parameter */ 378 call schedule_tail /* rdi: 'prev' task parameter */
449 379
450 RESTORE_EXTRA_REGS
451
452 testb $3, CS(%rsp) /* from kernel_thread? */ 380 testb $3, CS(%rsp) /* from kernel_thread? */
381 jnz 1f
453 382
454 /* 383 /*
455 * By the time we get here, we have no idea whether our pt_regs, 384 * We came from kernel_thread. This code path is quite twisted, and
456 * ti flags, and ti status came from the 64-bit SYSCALL fast path, 385 * someone should clean it up.
457 * the slow path, or one of the 32-bit compat paths. 386 *
458 * Use IRET code path to return, since it can safely handle 387 * copy_thread_tls stashes the function pointer in RBX and the
459 * all of the above. 388 * parameter to be passed in RBP. The called function is permitted
389 * to call do_execve and thereby jump to user mode.
460 */ 390 */
461 jnz int_ret_from_sys_call 391 movq RBP(%rsp), %rdi
392 call *RBX(%rsp)
393 movl $0, RAX(%rsp)
462 394
463 /* 395 /*
464 * We came from kernel_thread 396 * Fall through as though we're exiting a syscall. This makes a
465 * nb: we depend on RESTORE_EXTRA_REGS above 397 * twisted sort of sense if we just called do_execve.
466 */ 398 */
467 movq %rbp, %rdi 399
468 call *%rbx 4001:
469 movl $0, RAX(%rsp) 401 movq %rsp, %rdi
470 RESTORE_EXTRA_REGS 402 call syscall_return_slowpath /* returns with IRQs disabled */
471 jmp int_ret_from_sys_call 403 TRACE_IRQS_ON /* user mode is traced as IRQS on */
404 SWAPGS
405 jmp restore_regs_and_iret
472END(ret_from_fork) 406END(ret_from_fork)
473 407
474/* 408/*
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 3c990eeee40b..847f2f0c31e5 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -19,12 +19,21 @@
19 .section .entry.text, "ax" 19 .section .entry.text, "ax"
20 20
21/* 21/*
22 * 32-bit SYSENTER instruction entry. 22 * 32-bit SYSENTER entry.
23 * 23 *
24 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. 24 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
25 * IF and VM in rflags are cleared (IOW: interrupts are off). 25 * on 64-bit kernels running on Intel CPUs.
26 *
27 * The SYSENTER instruction, in principle, should *only* occur in the
28 * vDSO. In practice, a small number of Android devices were shipped
29 * with a copy of Bionic that inlined a SYSENTER instruction. This
30 * never happened in any of Google's Bionic versions -- it only happened
31 * in a narrow range of Intel-provided versions.
32 *
33 * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
34 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
26 * SYSENTER does not save anything on the stack, 35 * SYSENTER does not save anything on the stack,
27 * and does not save old rip (!!!) and rflags. 36 * and does not save old RIP (!!!), RSP, or RFLAGS.
28 * 37 *
29 * Arguments: 38 * Arguments:
30 * eax system call number 39 * eax system call number
@@ -35,10 +44,6 @@
35 * edi arg5 44 * edi arg5
36 * ebp user stack 45 * ebp user stack
37 * 0(%ebp) arg6 46 * 0(%ebp) arg6
38 *
39 * This is purely a fast path. For anything complicated we use the int 0x80
40 * path below. We set up a complete hardware stack frame to share code
41 * with the int 0x80 path.
42 */ 47 */
43ENTRY(entry_SYSENTER_compat) 48ENTRY(entry_SYSENTER_compat)
44 /* Interrupts are off on entry. */ 49 /* Interrupts are off on entry. */
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
66 */ 71 */
67 pushfq /* pt_regs->flags (except IF = 0) */ 72 pushfq /* pt_regs->flags (except IF = 0) */
68 orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ 73 orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
69 ASM_CLAC /* Clear AC after saving FLAGS */
70
71 pushq $__USER32_CS /* pt_regs->cs */ 74 pushq $__USER32_CS /* pt_regs->cs */
72 xorq %r8,%r8 75 xorq %r8,%r8
73 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ 76 pushq %r8 /* pt_regs->ip = 0 (placeholder) */
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
90 cld 93 cld
91 94
92 /* 95 /*
93 * Sysenter doesn't filter flags, so we need to clear NT 96 * SYSENTER doesn't filter flags, so we need to clear NT and AC
94 * ourselves. To save a few cycles, we can check whether 97 * ourselves. To save a few cycles, we can check whether
95 * NT was set instead of doing an unconditional popfq. 98 * either was set instead of doing an unconditional popfq.
96 * This needs to happen before enabling interrupts so that 99 * This needs to happen before enabling interrupts so that
97 * we don't get preempted with NT set. 100 * we don't get preempted with NT set.
98 * 101 *
102 * If TF is set, we will single-step all the way to here -- do_debug
103 * will ignore all the traps. (Yes, this is slow, but so is
104 * single-stepping in general. This allows us to avoid having
105 * a more complicated code to handle the case where a user program
106 * forces us to single-step through the SYSENTER entry code.)
107 *
99 * NB.: .Lsysenter_fix_flags is a label with the code under it moved 108 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
100 * out-of-line as an optimization: NT is unlikely to be set in the 109 * out-of-line as an optimization: NT is unlikely to be set in the
101 * majority of the cases and instead of polluting the I$ unnecessarily, 110 * majority of the cases and instead of polluting the I$ unnecessarily,
102 * we're keeping that code behind a branch which will predict as 111 * we're keeping that code behind a branch which will predict as
103 * not-taken and therefore its instructions won't be fetched. 112 * not-taken and therefore its instructions won't be fetched.
104 */ 113 */
105 testl $X86_EFLAGS_NT, EFLAGS(%rsp) 114 testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
106 jnz .Lsysenter_fix_flags 115 jnz .Lsysenter_fix_flags
107.Lsysenter_flags_fixed: 116.Lsysenter_flags_fixed:
108 117
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
123 pushq $X86_EFLAGS_FIXED 132 pushq $X86_EFLAGS_FIXED
124 popfq 133 popfq
125 jmp .Lsysenter_flags_fixed 134 jmp .Lsysenter_flags_fixed
135GLOBAL(__end_entry_SYSENTER_compat)
126ENDPROC(entry_SYSENTER_compat) 136ENDPROC(entry_SYSENTER_compat)
127 137
128/* 138/*
129 * 32-bit SYSCALL instruction entry. 139 * 32-bit SYSCALL entry.
140 *
141 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
142 * on 64-bit kernels running on AMD CPUs.
143 *
144 * The SYSCALL instruction, in principle, should *only* occur in the
145 * vDSO. In practice, it appears that this really is the case.
146 * As evidence:
147 *
148 * - The calling convention for SYSCALL has changed several times without
149 * anyone noticing.
130 * 150 *
131 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, 151 * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything
132 * then loads new ss, cs, and rip from previously programmed MSRs. 152 * user task that did SYSCALL without immediately reloading SS
133 * rflags gets masked by a value from another MSR (so CLD and CLAC 153 * would randomly crash.
134 * are not needed). SYSCALL does not save anything on the stack
135 * and does not change rsp.
136 * 154 *
137 * Note: rflags saving+masking-with-MSR happens only in Long mode 155 * - Most programmers do not directly target AMD CPUs, and the 32-bit
156 * SYSCALL instruction does not exist on Intel CPUs. Even on AMD
157 * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
158 * because the SYSCALL instruction in legacy/native 32-bit mode (as
159 * opposed to compat mode) is sufficiently poorly designed as to be
160 * essentially unusable.
161 *
162 * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
163 * RFLAGS to R11, then loads new SS, CS, and RIP from previously
164 * programmed MSRs. RFLAGS gets masked by a value from another MSR
165 * (so CLD and CLAC are not needed). SYSCALL does not save anything on
166 * the stack and does not change RSP.
167 *
168 * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
138 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). 169 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
139 * Don't get confused: rflags saving+masking depends on Long Mode Active bit 170 * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
140 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes 171 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
141 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). 172 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
142 * 173 *
@@ -236,7 +267,21 @@ sysret32_from_system_call:
236END(entry_SYSCALL_compat) 267END(entry_SYSCALL_compat)
237 268
238/* 269/*
239 * Emulated IA32 system calls via int 0x80. 270 * 32-bit legacy system call entry.
271 *
272 * 32-bit x86 Linux system calls traditionally used the INT $0x80
273 * instruction. INT $0x80 lands here.
274 *
275 * This entry point can be used by 32-bit and 64-bit programs to perform
276 * 32-bit system calls. Instances of INT $0x80 can be found inline in
277 * various programs and libraries. It is also used by the vDSO's
278 * __kernel_vsyscall fallback for hardware that doesn't support a faster
279 * entry method. Restarted 32-bit system calls also fall back to INT
280 * $0x80 regardless of what instruction was originally used to do the
281 * system call.
282 *
283 * This is considered a slow path. It is not used by most libc
284 * implementations on modern hardware except during process startup.
240 * 285 *
241 * Arguments: 286 * Arguments:
242 * eax system call number 287 * eax system call number
@@ -245,17 +290,8 @@ END(entry_SYSCALL_compat)
245 * edx arg3 290 * edx arg3
246 * esi arg4 291 * esi arg4
247 * edi arg5 292 * edi arg5
248 * ebp arg6 (note: not saved in the stack frame, should not be touched) 293 * ebp arg6
249 *
250 * Notes:
251 * Uses the same stack frame as the x86-64 version.
252 * All registers except eax must be saved (but ptrace may violate that).
253 * Arguments are zero extended. For system calls that want sign extension and
254 * take long arguments a wrapper is needed. Most calls can just be called
255 * directly.
256 * Assumes it is only called from user space and entered with interrupts off.
257 */ 294 */
258
259ENTRY(entry_INT80_compat) 295ENTRY(entry_INT80_compat)
260 /* 296 /*
261 * Interrupts are off on entry. 297 * Interrupts are off on entry.
@@ -300,7 +336,7 @@ ENTRY(entry_INT80_compat)
300 TRACE_IRQS_OFF 336 TRACE_IRQS_OFF
301 337
302 movq %rsp, %rdi 338 movq %rsp, %rdi
303 call do_syscall_32_irqs_off 339 call do_int80_syscall_32
304.Lsyscall_32_done: 340.Lsyscall_32_done:
305 341
306 /* Go back to user mode. */ 342 /* Go back to user mode. */
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 9a6649857106..8f895ee13a1c 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -6,17 +6,11 @@
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/syscall.h> 7#include <asm/syscall.h>
8 8
9#ifdef CONFIG_IA32_EMULATION 9#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
10#define SYM(sym, compat) compat
11#else
12#define SYM(sym, compat) sym
13#endif
14
15#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
16#include <asm/syscalls_32.h> 10#include <asm/syscalls_32.h>
17#undef __SYSCALL_I386 11#undef __SYSCALL_I386
18 12
19#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), 13#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
20 14
21extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 15extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
22 16
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 41283d22be7a..9dbc5abb6162 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,19 +6,14 @@
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/syscall.h> 7#include <asm/syscall.h>
8 8
9#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 9#define __SYSCALL_64_QUAL_(sym) sym
10#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
10 11
11#ifdef CONFIG_X86_X32_ABI 12#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
12# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
13#else
14# define __SYSCALL_X32(nr, sym, compat) /* nothing */
15#endif
16
17#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
18#include <asm/syscalls_64.h> 13#include <asm/syscalls_64.h>
19#undef __SYSCALL_64 14#undef __SYSCALL_64
20 15
21#define __SYSCALL_64(nr, sym, compat) [nr] = sym, 16#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
22 17
23extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 18extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
24 19
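The qualifier plumbing above works by token pasting: an empty qualifier picks the plain sym, while the ptregs qualifier picks the ptregs_-prefixed stub that entry_64.S instantiates. A compilable user-space sketch of the same trick (the demo_* names and simplified macros are invented for illustration; the real ones are the __SYSCALL_64_QUAL_* macros shown above):

#include <stdio.h>

#define QUAL_(sym)		sym
#define QUAL_ptregs(sym)	ptregs_##sym
#define SYSCALL_64(nr, sym, qual)	[nr] = QUAL_##qual(sym),

long demo_read(void)		{ return 0; }
long demo_execve(void)		{ return 1; }
long ptregs_demo_execve(void)	{ return 2; }	/* stands in for the asm stub */

static long (*const table[])(void) = {
	SYSCALL_64(0, demo_read, )		/* -> [0] = demo_read,          */
	SYSCALL_64(1, demo_execve, ptregs)	/* -> [1] = ptregs_demo_execve, */
};

int main(void)
{
	printf("%ld %ld\n", table[0](), table[1]());	/* prints "0 2" */
	return 0;
}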
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index dc1040a50bdc..2e5b565adacc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -21,7 +21,7 @@
2112 common brk sys_brk 2112 common brk sys_brk
2213 64 rt_sigaction sys_rt_sigaction 2213 64 rt_sigaction sys_rt_sigaction
2314 common rt_sigprocmask sys_rt_sigprocmask 2314 common rt_sigprocmask sys_rt_sigprocmask
2415 64 rt_sigreturn stub_rt_sigreturn 2415 64 rt_sigreturn sys_rt_sigreturn/ptregs
2516 64 ioctl sys_ioctl 2516 64 ioctl sys_ioctl
2617 common pread64 sys_pread64 2617 common pread64 sys_pread64
2718 common pwrite64 sys_pwrite64 2718 common pwrite64 sys_pwrite64
@@ -62,10 +62,10 @@
6253 common socketpair sys_socketpair 6253 common socketpair sys_socketpair
6354 64 setsockopt sys_setsockopt 6354 64 setsockopt sys_setsockopt
6455 64 getsockopt sys_getsockopt 6455 64 getsockopt sys_getsockopt
6556 common clone stub_clone 6556 common clone sys_clone/ptregs
6657 common fork stub_fork 6657 common fork sys_fork/ptregs
6758 common vfork stub_vfork 6758 common vfork sys_vfork/ptregs
6859 64 execve stub_execve 6859 64 execve sys_execve/ptregs
6960 common exit sys_exit 6960 common exit sys_exit
7061 common wait4 sys_wait4 7061 common wait4 sys_wait4
7162 common kill sys_kill 7162 common kill sys_kill
@@ -178,7 +178,7 @@
178169 common reboot sys_reboot 178169 common reboot sys_reboot
179170 common sethostname sys_sethostname 179170 common sethostname sys_sethostname
180171 common setdomainname sys_setdomainname 180171 common setdomainname sys_setdomainname
181172 common iopl sys_iopl 181172 common iopl sys_iopl/ptregs
182173 common ioperm sys_ioperm 182173 common ioperm sys_ioperm
183174 64 create_module 183174 64 create_module
184175 common init_module sys_init_module 184175 common init_module sys_init_module
@@ -328,7 +328,7 @@
328319 common memfd_create sys_memfd_create 328319 common memfd_create sys_memfd_create
329320 common kexec_file_load sys_kexec_file_load 329320 common kexec_file_load sys_kexec_file_load
330321 common bpf sys_bpf 330321 common bpf sys_bpf
331322 64 execveat stub_execveat 331322 64 execveat sys_execveat/ptregs
332323 common userfaultfd sys_userfaultfd 332323 common userfaultfd sys_userfaultfd
333324 common membarrier sys_membarrier 333324 common membarrier sys_membarrier
334325 common mlock2 sys_mlock2 334325 common mlock2 sys_mlock2
@@ -339,14 +339,14 @@
339# for native 64-bit operation. 339# for native 64-bit operation.
340# 340#
341512 x32 rt_sigaction compat_sys_rt_sigaction 341512 x32 rt_sigaction compat_sys_rt_sigaction
342513 x32 rt_sigreturn stub_x32_rt_sigreturn 342513 x32 rt_sigreturn sys32_x32_rt_sigreturn
343514 x32 ioctl compat_sys_ioctl 343514 x32 ioctl compat_sys_ioctl
344515 x32 readv compat_sys_readv 344515 x32 readv compat_sys_readv
345516 x32 writev compat_sys_writev 345516 x32 writev compat_sys_writev
346517 x32 recvfrom compat_sys_recvfrom 346517 x32 recvfrom compat_sys_recvfrom
347518 x32 sendmsg compat_sys_sendmsg 347518 x32 sendmsg compat_sys_sendmsg
348519 x32 recvmsg compat_sys_recvmsg 348519 x32 recvmsg compat_sys_recvmsg
349520 x32 execve stub_x32_execve 349520 x32 execve compat_sys_execve/ptregs
350521 x32 ptrace compat_sys_ptrace 350521 x32 ptrace compat_sys_ptrace
351522 x32 rt_sigpending compat_sys_rt_sigpending 351522 x32 rt_sigpending compat_sys_rt_sigpending
352523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait 352523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
@@ -371,4 +371,4 @@
371542 x32 getsockopt compat_sys_getsockopt 371542 x32 getsockopt compat_sys_getsockopt
372543 x32 io_setup compat_sys_io_setup 372543 x32 io_setup compat_sys_io_setup
373544 x32 io_submit compat_sys_io_submit 373544 x32 io_submit compat_sys_io_submit
374545 x32 execveat stub_x32_execveat 374545 x32 execveat compat_sys_execveat/ptregs
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 0e7f8ec071e7..cd3d3015d7df 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -3,13 +3,63 @@
3in="$1" 3in="$1"
4out="$2" 4out="$2"
5 5
6syscall_macro() {
7 abi="$1"
8 nr="$2"
9 entry="$3"
10
11 # Entry can be either just a function name or "function/qualifier"
12 real_entry="${entry%%/*}"
13 qualifier="${entry:${#real_entry}}" # Strip the function name
14 qualifier="${qualifier:1}" # Strip the slash, if any
15
16 echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
17}
18
19emit() {
20 abi="$1"
21 nr="$2"
22 entry="$3"
23 compat="$4"
24
25 if [ "$abi" == "64" -a -n "$compat" ]; then
26 echo "a compat entry for a 64-bit syscall makes no sense" >&2
27 exit 1
28 fi
29
30 if [ -z "$compat" ]; then
31 if [ -n "$entry" ]; then
32 syscall_macro "$abi" "$nr" "$entry"
33 fi
34 else
35 echo "#ifdef CONFIG_X86_32"
36 if [ -n "$entry" ]; then
37 syscall_macro "$abi" "$nr" "$entry"
38 fi
39 echo "#else"
40 syscall_macro "$abi" "$nr" "$compat"
41 echo "#endif"
42 fi
43}
44
6grep '^[0-9]' "$in" | sort -n | ( 45grep '^[0-9]' "$in" | sort -n | (
7 while read nr abi name entry compat; do 46 while read nr abi name entry compat; do
8 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` 47 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
9 if [ -n "$compat" ]; then 48 if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
10 echo "__SYSCALL_${abi}($nr, $entry, $compat)" 49 # COMMON is the same as 64, except that we don't expect X32
11 elif [ -n "$entry" ]; then 50 # programs to use it. Our expectation has nothing to do with
12 echo "__SYSCALL_${abi}($nr, $entry, $entry)" 51 # any generated code, so treat them the same.
52 emit 64 "$nr" "$entry" "$compat"
53 elif [ "$abi" == "X32" ]; then
54 # X32 is equivalent to 64 on an X32-compatible kernel.
55 echo "#ifdef CONFIG_X86_X32_ABI"
56 emit 64 "$nr" "$entry" "$compat"
57 echo "#endif"
58 elif [ "$abi" == "I386" ]; then
59 emit "$abi" "$nr" "$entry" "$compat"
60 else
61 echo "Unknown abi $abi" >&2
62 exit 1
13 fi 63 fi
14 done 64 done
15) > "$out" 65) > "$out"
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 3f69326ed545..63a03bb91497 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
150 } 150 }
151 fprintf(outfile, "\n};\n\n"); 151 fprintf(outfile, "\n};\n\n");
152 152
153 fprintf(outfile, "static struct page *pages[%lu];\n\n",
154 mapping_size / 4096);
155
156 fprintf(outfile, "const struct vdso_image %s = {\n", name); 153 fprintf(outfile, "const struct vdso_image %s = {\n", name);
157 fprintf(outfile, "\t.data = raw_data,\n"); 154 fprintf(outfile, "\t.data = raw_data,\n");
158 fprintf(outfile, "\t.size = %lu,\n", mapping_size); 155 fprintf(outfile, "\t.size = %lu,\n", mapping_size);
159 fprintf(outfile, "\t.text_mapping = {\n");
160 fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
161 fprintf(outfile, "\t\t.pages = pages,\n");
162 fprintf(outfile, "\t},\n");
163 if (alt_sec) { 156 if (alt_sec) {
164 fprintf(outfile, "\t.alt = %lu,\n", 157 fprintf(outfile, "\t.alt = %lu,\n",
165 (unsigned long)GET_LE(&alt_sec->sh_offset)); 158 (unsigned long)GET_LE(&alt_sec->sh_offset));
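
Since page insertion is now handled at fault time (see the vma.c changes below), vdso2c no longer needs to emit a static pages[] array or a per-image text_mapping initializer; the generated descriptor shrinks to the raw image plus its sizes and symbol offsets. The shape of the generated initializer after this change is roughly the following, with made-up placeholder values:

	/* Shape only; the real vdso-image-*.c files are generated by vdso2c. */
	const struct vdso_image vdso_image_64 = {
		.data = raw_data,		/* the embedded vDSO ELF image */
		.size = 8192,			/* page-aligned mapping size */
		.alt = 3112,			/* .altinstructions offset and length */
		.alt_len = 64,
		.sym_vvar_start = -12288,	/* vvar area sits below the text */
		.sym_vvar_page = -12288,
		.sym_hpet_page = -8192,
		.sym_pvclock_page = -4096,
	};
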
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 08a317a9ae4b..7853b53959cd 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -11,7 +11,6 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/mm_types.h> 12#include <linux/mm_types.h>
13 13
14#include <asm/cpufeature.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/vdso.h> 15#include <asm/vdso.h>
17 16
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 3a1d9297074b..0109ac6cb79c 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -3,7 +3,7 @@
3*/ 3*/
4 4
5#include <asm/dwarf2.h> 5#include <asm/dwarf2.h>
6#include <asm/cpufeature.h> 6#include <asm/cpufeatures.h>
7#include <asm/alternative-asm.h> 7#include <asm/alternative-asm.h>
8 8
9/* 9/*
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index b8f69e264ac4..10f704584922 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -20,6 +20,7 @@
20#include <asm/page.h> 20#include <asm/page.h>
21#include <asm/hpet.h> 21#include <asm/hpet.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/cpufeature.h>
23 24
24#if defined(CONFIG_X86_64) 25#if defined(CONFIG_X86_64)
25unsigned int __read_mostly vdso64_enabled = 1; 26unsigned int __read_mostly vdso64_enabled = 1;
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1;
27 28
28void __init init_vdso_image(const struct vdso_image *image) 29void __init init_vdso_image(const struct vdso_image *image)
29{ 30{
30 int i;
31 int npages = (image->size) / PAGE_SIZE;
32
33 BUG_ON(image->size % PAGE_SIZE != 0); 31 BUG_ON(image->size % PAGE_SIZE != 0);
34 for (i = 0; i < npages; i++)
35 image->text_mapping.pages[i] =
36 virt_to_page(image->data + i*PAGE_SIZE);
37 32
38 apply_alternatives((struct alt_instr *)(image->data + image->alt), 33 apply_alternatives((struct alt_instr *)(image->data + image->alt),
39 (struct alt_instr *)(image->data + image->alt + 34 (struct alt_instr *)(image->data + image->alt +
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
90#endif 85#endif
91} 86}
92 87
88static int vdso_fault(const struct vm_special_mapping *sm,
89 struct vm_area_struct *vma, struct vm_fault *vmf)
90{
91 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
92
93 if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
94 return VM_FAULT_SIGBUS;
95
96 vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
97 get_page(vmf->page);
98 return 0;
99}
100
101static const struct vm_special_mapping text_mapping = {
102 .name = "[vdso]",
103 .fault = vdso_fault,
104};
105
106static int vvar_fault(const struct vm_special_mapping *sm,
107 struct vm_area_struct *vma, struct vm_fault *vmf)
108{
109 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
110 long sym_offset;
111 int ret = -EFAULT;
112
113 if (!image)
114 return VM_FAULT_SIGBUS;
115
116 sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
117 image->sym_vvar_start;
118
119 /*
120 * Sanity check: a symbol offset of zero means that the page
121 * does not exist for this vdso image, not that the page is at
122 * offset zero relative to the text mapping. This should be
123 * impossible here, because sym_offset should only be zero for
124 * the page past the end of the vvar mapping.
125 */
126 if (sym_offset == 0)
127 return VM_FAULT_SIGBUS;
128
129 if (sym_offset == image->sym_vvar_page) {
130 ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
131 __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
132 } else if (sym_offset == image->sym_hpet_page) {
133#ifdef CONFIG_HPET_TIMER
134 if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
135 ret = vm_insert_pfn_prot(
136 vma,
137 (unsigned long)vmf->virtual_address,
138 hpet_address >> PAGE_SHIFT,
139 pgprot_noncached(PAGE_READONLY));
140 }
141#endif
142 } else if (sym_offset == image->sym_pvclock_page) {
143 struct pvclock_vsyscall_time_info *pvti =
144 pvclock_pvti_cpu0_va();
145 if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
146 ret = vm_insert_pfn(
147 vma,
148 (unsigned long)vmf->virtual_address,
149 __pa(pvti) >> PAGE_SHIFT);
150 }
151 }
152
153 if (ret == 0 || ret == -EBUSY)
154 return VM_FAULT_NOPAGE;
155
156 return VM_FAULT_SIGBUS;
157}
158
93static int map_vdso(const struct vdso_image *image, bool calculate_addr) 159static int map_vdso(const struct vdso_image *image, bool calculate_addr)
94{ 160{
95 struct mm_struct *mm = current->mm; 161 struct mm_struct *mm = current->mm;
96 struct vm_area_struct *vma; 162 struct vm_area_struct *vma;
97 unsigned long addr, text_start; 163 unsigned long addr, text_start;
98 int ret = 0; 164 int ret = 0;
99 static struct page *no_pages[] = {NULL}; 165 static const struct vm_special_mapping vvar_mapping = {
100 static struct vm_special_mapping vvar_mapping = {
101 .name = "[vvar]", 166 .name = "[vvar]",
102 .pages = no_pages, 167 .fault = vvar_fault,
103 }; 168 };
104 struct pvclock_vsyscall_time_info *pvti;
105 169
106 if (calculate_addr) { 170 if (calculate_addr) {
107 addr = vdso_addr(current->mm->start_stack, 171 addr = vdso_addr(current->mm->start_stack,
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
121 185
122 text_start = addr - image->sym_vvar_start; 186 text_start = addr - image->sym_vvar_start;
123 current->mm->context.vdso = (void __user *)text_start; 187 current->mm->context.vdso = (void __user *)text_start;
188 current->mm->context.vdso_image = image;
124 189
125 /* 190 /*
126 * MAYWRITE to allow gdb to COW and set breakpoints 191 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
130 image->size, 195 image->size,
131 VM_READ|VM_EXEC| 196 VM_READ|VM_EXEC|
132 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 197 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
133 &image->text_mapping); 198 &text_mapping);
134 199
135 if (IS_ERR(vma)) { 200 if (IS_ERR(vma)) {
136 ret = PTR_ERR(vma); 201 ret = PTR_ERR(vma);
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
140 vma = _install_special_mapping(mm, 205 vma = _install_special_mapping(mm,
141 addr, 206 addr,
142 -image->sym_vvar_start, 207 -image->sym_vvar_start,
143 VM_READ|VM_MAYREAD, 208 VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
209 VM_PFNMAP,
144 &vvar_mapping); 210 &vvar_mapping);
145 211
146 if (IS_ERR(vma)) { 212 if (IS_ERR(vma)) {
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
148 goto up_fail; 214 goto up_fail;
149 } 215 }
150 216
151 if (image->sym_vvar_page)
152 ret = remap_pfn_range(vma,
153 text_start + image->sym_vvar_page,
154 __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
155 PAGE_SIZE,
156 PAGE_READONLY);
157
158 if (ret)
159 goto up_fail;
160
161#ifdef CONFIG_HPET_TIMER
162 if (hpet_address && image->sym_hpet_page) {
163 ret = io_remap_pfn_range(vma,
164 text_start + image->sym_hpet_page,
165 hpet_address >> PAGE_SHIFT,
166 PAGE_SIZE,
167 pgprot_noncached(PAGE_READONLY));
168
169 if (ret)
170 goto up_fail;
171 }
172#endif
173
174 pvti = pvclock_pvti_cpu0_va();
175 if (pvti && image->sym_pvclock_page) {
176 ret = remap_pfn_range(vma,
177 text_start + image->sym_pvclock_page,
178 __pa(pvti) >> PAGE_SHIFT,
179 PAGE_SIZE,
180 PAGE_READONLY);
181
182 if (ret)
183 goto up_fail;
184 }
185
186up_fail: 217up_fail:
187 if (ret) 218 if (ret)
188 current->mm->context.vdso = NULL; 219 current->mm->context.vdso = NULL;
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg)
254#ifdef CONFIG_NUMA 285#ifdef CONFIG_NUMA
255 node = cpu_to_node(cpu); 286 node = cpu_to_node(cpu);
256#endif 287#endif
257 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 288 if (static_cpu_has(X86_FEATURE_RDTSCP))
258 write_rdtscp_aux((node << 12) | cpu); 289 write_rdtscp_aux((node << 12) | cpu);
259 290
260 /* 291 /*
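
The vma.c changes replace the eager remap_pfn_range()/io_remap_pfn_range() calls done at mmap time with .fault callbacks on the two special mappings: vDSO text pages are looked up from mm->context.vdso_image on first touch, and the vvar/HPET/pvclock pages are PFN-inserted only if their clock source is actually in use. The same pattern works for any special mapping, not just the vDSO; a minimal, self-contained sketch of it (all names here are illustrative, nothing below is part of the patch):

	#include <linux/err.h>
	#include <linux/init.h>
	#include <linux/mm.h>
	#include <linux/mm_types.h>

	/* One backing page; to be allocated from the caller's init path. */
	static struct page *demo_page;

	static int __init demo_page_init(void)
	{
		demo_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		return demo_page ? 0 : -ENOMEM;
	}

	/* Hand out the backing page lazily, on first access to the VMA. */
	static int demo_fault(const struct vm_special_mapping *sm,
			      struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		if (vmf->pgoff != 0)		/* only a single page is mapped */
			return VM_FAULT_SIGBUS;

		get_page(demo_page);
		vmf->page = demo_page;
		return 0;
	}

	static const struct vm_special_mapping demo_mapping = {
		.name	= "[demo]",
		.fault	= demo_fault,
	};

	/* Map one read-only page at addr in mm, the way map_vdso() does. */
	static int demo_map(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma;

		vma = _install_special_mapping(mm, addr, PAGE_SIZE,
					       VM_READ | VM_MAYREAD,
					       &demo_mapping);
		return IS_ERR(vma) ? PTR_ERR(vma) : 0;
	}

The point of the switch is that nothing gets pinned or PFN-mapped until a task actually touches the pages, which is also why the vvar VMA gains VM_IO|VM_DONTDUMP|VM_PFNMAP above and the old up-front remapping block could be deleted outright.
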
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index 51e330416995..0fb3a104ac62 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -16,6 +16,8 @@
16#include <asm/vgtod.h> 16#include <asm/vgtod.h>
17#include <asm/vvar.h> 17#include <asm/vvar.h>
18 18
19int vclocks_used __read_mostly;
20
19DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); 21DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
20 22
21void update_vsyscall_tz(void) 23void update_vsyscall_tz(void)
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void)
26 28
27void update_vsyscall(struct timekeeper *tk) 29void update_vsyscall(struct timekeeper *tk)
28{ 30{
31 int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
29 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; 32 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
30 33
34 /* Mark the new vclock used. */
35 BUILD_BUG_ON(VCLOCK_MAX >= 32);
36 WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
37
31 gtod_write_begin(vdata); 38 gtod_write_begin(vdata);
32 39
33 /* copy vsyscall data */ 40 /* copy vsyscall data */
34 vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; 41 vdata->vclock_mode = vclock_mode;
35 vdata->cycle_last = tk->tkr_mono.cycle_last; 42 vdata->cycle_last = tk->tkr_mono.cycle_last;
36 vdata->mask = tk->tkr_mono.mask; 43 vdata->mask = tk->tkr_mono.mask;
37 vdata->mult = tk->tkr_mono.mult; 44 vdata->mult = tk->tkr_mono.mult;
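
update_vsyscall() now records every vclock mode the timekeeper has ever selected in the vclocks_used bitmask, which is what the vvar_fault() handler above consults via vclock_was_used() before PFN-mapping the HPET or pvclock page. That helper is not part of this diff; a minimal sketch consistent with the bitmask written here, assuming it lives next to the vclocks_used declaration:

	#include <linux/compiler.h>
	#include <linux/types.h>

	extern int vclocks_used;

	/* Sketch only: test the bit update_vsyscall() sets for each mode. */
	static inline bool vclock_was_used(int vclock)
	{
		return READ_ONCE(vclocks_used) & (1 << vclock);
	}

The BUILD_BUG_ON(VCLOCK_MAX >= 32) above is what keeps this one-bit-per-mode-in-an-int scheme from silently overflowing.
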