author	Denys Vlasenko <dvlasenk@redhat.com>	2015-04-02 12:46:59 -0400
committer	Ingo Molnar <mingo@kernel.org>	2015-04-08 03:02:12 -0400
commit	fffbb5dcfd29f8831e41b4dd2ab938bd36d35283 (patch)
tree	2125f2510b43c95b1d7c86e41b20bec790c1cec2
parent	4bcc7827b02feea2c762fa6d46a1bffb300d7403 (diff)
x86/asm/entry/64: Move opportunistic sysret code to syscall code path
This change does two things:

 1. Copy-pastes the "retint_swapgs:" code into the syscall handling
    code, under a new "syscall_return:" label. The code is unchanged
    apart from some label renames.

 2. Removes the "opportunistic sysret" code from the "retint_swapgs:"
    code block, since it is no longer reached by syscall return. This
    in fact removes most of the code in question.
   text    data     bss     dec     hex filename
  12530       0       0   12530    30f2 entry_64.o.before
  12562       0       0   12562    3112 entry_64.o
Run-tested.
Acked-and-Tested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427993219-7291-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 arch/x86/kernel/entry_64.S | 158 +-
 1 file changed, 86 insertions(+), 72 deletions(-)
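
For orientation while reading the new "syscall_return:" path in the diff
below, the eligibility tests it performs before taking the SYSRET fast
path can be summarized in C. This is an illustrative sketch, not code
from the patch: the struct and the can_use_sysret() helper are
hypothetical, and a 48-bit virtual address width
(__VIRTUAL_MASK_SHIFT == 47) is assumed, matching the .ifne guard in
the patch.

#include <stdbool.h>
#include <stdint.h>

#define X86_EFLAGS_TF	0x00000100UL	/* trap flag */
#define X86_EFLAGS_RF	0x00010000UL	/* resume flag */

/* Hypothetical mirror of the saved user context (pt_regs) fields used. */
struct saved_regs {
	uint64_t rip, rcx, r11, rflags, cs, ss;
};

/*
 * Sketch of the checks the new syscall_return: code performs before
 * using SYSRET instead of IRET (user_cs/user_ds stand in for
 * __USER_CS/__USER_DS).
 */
static bool can_use_sysret(const struct saved_regs *r,
			   uint64_t user_cs, uint64_t user_ds)
{
	if (r->rcx != r->rip)		/* SYSRET loads RIP from RCX */
		return false;
	if (r->rcx >> 47)		/* any of the 17 high bits set:
					   kernel or non-canonical address */
		return false;
	if (r->cs != user_cs)		/* CS must match what SYSRET sets */
		return false;
	if (r->r11 != r->rflags)	/* SYSRET loads RFLAGS from R11 */
		return false;
	if (r->rflags & (X86_EFLAGS_RF | X86_EFLAGS_TF))
		return false;		/* RF/TF can't be restored safely */
	if (r->ss != user_ds)		/* SS must match what SYSRET sets */
		return false;
	return true;			/* clean 64-bit user context */
}
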
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 65485b3baa59..e4c810395bae 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -354,8 +354,8 @@ GLOBAL(int_with_check)
 	movl TI_flags(%rcx),%edx
 	andl %edi,%edx
 	jnz int_careful
 	andl $~TS_COMPAT,TI_status(%rcx)
-	jmp retint_swapgs
+	jmp syscall_return
 
 	/* Either reschedule or signal or syscall exit tracking needed. */
 	/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
+
+syscall_return:
+	/* The IRETQ could re-enable interrupts: */
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq RCX(%rsp),%rcx
+	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.  It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses.  (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- SYSRET checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq R11(%rsp),%r11
+	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET.  This would cause an infinite loop whenever #DB happens
+	 * with register state that satisfies the opportunistic SYSRET
+	 * conditions.  For example, single-stepping this user code:
+	 *
+	 *           movq $stuck_here,%rcx
+	 *           pushfq
+	 *           popq %r11
+	 * stuck_here:
+	 *
+	 * would never get past 'stuck_here'.
+	 */
+	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win!  This label is here just for ease of understanding
+	 * perf profiles.  Nothing jumps here.
+	 */
+syscall_return_via_sysret:
+	CFI_REMEMBER_STATE
+	/* r11 is already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_R11
+	movq RSP(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+	SWAPGS
+	jmp restore_c_regs_and_iret
 	CFI_ENDPROC
 END(system_call)
 
+
 .macro FORK_LIKE func
 ENTRY(stub_\func)
 	CFI_STARTPROC
@@ -673,76 +750,8 @@ retint_swapgs:	/* return to user-space */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
 
-	/*
-	 * Try to use SYSRET instead of IRET if we're returning to
-	 * a completely clean 64-bit userspace context.
-	 */
-	movq RCX(%rsp),%rcx
-	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
-	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.  It's not worth
-	 * testing for canonicalness exactly -- this check detects any
-	 * of the 17 high bits set, which is true for non-canonical
-	 * or kernel addresses.  (This will pessimize vsyscall=native.
-	 * Big deal.)
-	 *
-	 * If virtual addresses ever become wider, this will need
-	 * to be updated to remain correct on both old and new CPUs.
-	 */
-	.ifne __VIRTUAL_MASK_SHIFT - 47
-	.error "virtual address width changed -- sysret checks need update"
-	.endif
-	shr $__VIRTUAL_MASK_SHIFT, %rcx
-	jnz opportunistic_sysret_failed
-
-	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	movq R11(%rsp),%r11
-	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET.  This would cause an infinite loop whenever #DB happens
-	 * with register state that satisfies the opportunistic SYSRET
-	 * conditions.  For example, single-stepping this user code:
-	 *
-	 *           movq $stuck_here,%rcx
-	 *           pushfq
-	 *           popq %r11
-	 * stuck_here:
-	 *
-	 * would never get past 'stuck_here'.
-	 */
-	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz opportunistic_sysret_failed
-
-	/* nothing to check for RSP */
-
-	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * We win!  This label is here just for ease of understanding
-	 * perf profiles.  Nothing jumps here.
-	 */
-irq_return_via_sysret:
-	CFI_REMEMBER_STATE
-	/* r11 is already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_R11
-	movq RSP(%rsp),%rsp
-	USERGS_SYSRET64
-	CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
 	SWAPGS
-	jmp restore_args
+	jmp restore_c_regs_and_iret
 
 /* Returning to kernel space */
 retint_kernel:
@@ -761,7 +770,12 @@ retint_kernel:
 	 * The iretq could re-enable interrupts:
 	 */
 	TRACE_IRQS_IRETQ
-restore_args:
+
+	/*
+	 * At this label, code paths which return to kernel and to user,
+	 * which come from interrupts/exception and from syscalls, merge.
+	 */
+restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 
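
As a worked example of the canonicalness filter above
("shr $__VIRTUAL_MASK_SHIFT, %rcx" with a shift count of 47): shifting
the return RIP right by 47 leaves exactly the 17 high bits, so the
result is nonzero for kernel and non-canonical addresses and zero for
canonical user addresses. A small hedged C demonstration, assuming
48-bit virtual addresses (the sample values are illustrative):

#include <stdio.h>
#include <stdint.h>

/* Three sample return addresses run through the 'shr $47' filter. */
int main(void)
{
	uint64_t rip[] = {
		0x00007fffffffe000ULL,	/* canonical user address -> SYSRET ok */
		0x0000800000000000ULL,	/* non-canonical (bit 47) -> IRET      */
		0xffffffff81000000ULL,	/* kernel text address    -> IRET      */
	};
	for (int i = 0; i < 3; i++)
		printf("%#018llx -> %s\n", (unsigned long long)rip[i],
		       (rip[i] >> 47) ? "IRET slow path" : "SYSRET fast path");
	return 0;
}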