author    Andy Lutomirski <luto@amacapital.net>    2014-07-22 15:46:50 -0400
committer Andy Lutomirski <luto@amacapital.net>    2015-02-01 07:03:01 -0500
commit    2a23c6b8a9c42620182a2d2cfc7c16f6ff8c42b4 (patch)
tree      ad051e5d83b4027ca6fa72fbb13724364f0480cf
parent    b926e6f61a26036ee9eabe6761483954d481ad25 (diff)
x86_64, entry: Use sysret to return to userspace when possible
The x86_64 entry code currently jumps through complex and inconsistent hoops to try to minimize the impact of syscall exit work. For a true fast-path syscall, almost nothing needs to be done, so returning is just a check for exit work and sysret. For a full slow-path return from a syscall, the C exit hook is invoked if needed and we join the iret path.

Using iret to return to userspace is very slow, so the entry code has accumulated various special cases to try to do certain forms of exit work without invoking iret. This is error-prone, since it duplicates assembly code paths, and it's dangerous, since sysret can malfunction in interesting ways if used carelessly. It's also inefficient, since a lot of useful cases aren't optimized and therefore force an iret out of a combination of paranoia and the fact that no one has bothered to write even more asm code to avoid it.

I would argue that this approach is backwards. Rather than trying to avoid the iret path, we should instead try to make the iret path fast. Under a specific set of conditions, iret is unnecessary. In particular, if RIP==RCX, RFLAGS==R11, RIP is canonical, RF is not set, and both SS and CS are as expected, then movq 32(%rsp),%rsp;sysret does the same thing as iret. This set of conditions is nearly always satisfied on return from syscalls, and it can even occasionally be satisfied on return from an irq.

Even with the careful checks for sysret applicability, this cuts nearly 80ns off of the overhead from syscalls with unoptimized exit work. This includes tracing and context tracking, and any return that invokes KVM's user return notifier. For example, the cost of getpid with CONFIG_CONTEXT_TRACKING_FORCE=y drops from ~360ns to ~280ns on my computer.

This may allow the removal and even eventual conversion to C of a respectable amount of exit asm.

This may require further tweaking to give the full benefit on Xen.

It may be worthwhile to adjust signal delivery and exec to try to hit the sysret path.

This does not optimize returns to 32-bit userspace. Making the same optimization for CS == __USER32_CS is conceptually straightforward, but it will require some tedious code to handle the differences between sysretl and sysexitl.

Link: http://lkml.kernel.org/r/71428f63e681e1b4aa1a781e3ef7c27f027d1103.1421453410.git.luto@amacapital.net
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
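[Editor's note: to make the eligibility conditions above concrete, here is a minimal C restatement of the checks the patch performs before choosing sysret over iret. This is an illustrative sketch only; the struct, field names, selector values, and function name are assumptions made for the example, not the kernel's actual pt_regs layout or entry-code interface.]

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Hypothetical snapshot of the registers saved at kernel entry.
     * Field names are illustrative, not the real pt_regs layout.
     */
    struct saved_regs {
            uint64_t rip, rcx, r11, rflags;
            uint64_t cs, ss;
    };

    #define HYP_USER_CS    0x33UL        /* assumed 64-bit user code selector */
    #define HYP_USER_DS    0x2bUL        /* assumed user data selector */
    #define HYP_EFLAGS_RF  (1UL << 16)   /* the RF flag, which sysret can't restore */
    #define HYP_VA_SHIFT   47            /* 48-bit virtual addresses */

    static bool sysret_is_safe(const struct saved_regs *r)
    {
            if (r->rip != r->rcx)            /* SYSRET loads RIP from RCX */
                    return false;
            if (r->rip >> HYP_VA_SHIFT)      /* non-canonical or kernel RIP */
                    return false;
            if (r->rflags != r->r11)         /* SYSRET loads RFLAGS from R11 */
                    return false;
            if (r->rflags & HYP_EFLAGS_RF)   /* RF would be lost across SYSRET */
                    return false;
            if (r->cs != HYP_USER_CS || r->ss != HYP_USER_DS)
                    return false;            /* SYSRET forces these selectors */
            return true;                     /* safe to take the SYSRET path */
    }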
-rw-r--r--    arch/x86/kernel/entry_64.S | 54
1 file changed, 54 insertions, 0 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 501212f14c87..eeab4cf8b2c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -794,6 +794,60 @@ retint_swapgs: /* return to user-space */
 	 */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq (RCX-R11)(%rsp), %rcx
+	cmpq %rcx,(RIP-R11)(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.  It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses.  (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- sysret checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,(CS-R11)(%rsp)		/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq (R11-ARGOFFSET)(%rsp), %r11
+	cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)	/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	testq $X86_EFLAGS_RF,%r11		/* sysret can't restore RF */
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win!  This label is here just for ease of understanding
+	 * perf profiles.  Nothing jumps here.
+	 */
+irq_return_via_sysret:
+	CFI_REMEMBER_STATE
+	RESTORE_ARGS 1,8,1
+	movq (RSP-RIP)(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
 	SWAPGS
 	jmp restore_args
 
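[Editor's note: as an aside on the canonicalness test in the hunk above, shifting the return RIP right by __VIRTUAL_MASK_SHIFT (47 with 48-bit virtual addresses) leaves a nonzero value exactly when any of the high 17 bits are set, which covers both non-canonical and kernel addresses. A small standalone C demonstration of that test; the constant value, function name, and sample addresses are assumptions for illustration.]

    #include <stdint.h>
    #include <stdio.h>

    #define VIRTUAL_MASK_SHIFT 47  /* assumed 48-bit virtual address width */

    /*
     * Mirrors the `shr $__VIRTUAL_MASK_SHIFT, %rcx; jnz ...` test in the
     * patch: nonzero after the shift means some of the high 17 bits are
     * set, i.e. the address is non-canonical or a kernel address.
     */
    static int fails_sysret_rip_check(uint64_t rip)
    {
            return (rip >> VIRTUAL_MASK_SHIFT) != 0;
    }

    int main(void)
    {
            /* typical userspace text address: passes (prints 0) */
            printf("%d\n", fails_sysret_rip_check(0x0000000000401000ULL));
            /* non-canonical address: fails (prints 1) */
            printf("%d\n", fails_sysret_rip_check(0x0000800000000000ULL));
            /* kernel address with all high bits set: fails (prints 1) */
            printf("%d\n", fails_sysret_rip_check(0xffffffff81000000ULL));
            return 0;
    }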