-rw-r--r--   arch/x86/kernel/entry_64.S | 106
-rw-r--r--   arch/x86/kernel/traps.c    |  25
2 files changed, 77 insertions, 54 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 501212f14c87..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -361,15 +361,12 @@ system_call_fastpath:
  * Has incomplete stack frame and undefined top of stack.
  */
 ret_from_sys_call:
-        movl $_TIF_ALLWORK_MASK,%edi
-        /* edi: flagmask */
-sysret_check:
+        testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+        jnz int_ret_from_sys_call_fixup        /* Go to the slow path */
+
         LOCKDEP_SYS_EXIT
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
-        movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
-        andl %edi,%edx
-        jnz sysret_careful
         CFI_REMEMBER_STATE
         /*
          * sysretq will re-enable interrupts:
@@ -383,49 +380,10 @@ sysret_check:
         USERGS_SYSRET64
 
         CFI_RESTORE_STATE
-        /* Handle reschedules */
-        /* edx: work, edi: workmask */
-sysret_careful:
-        bt $TIF_NEED_RESCHED,%edx
-        jnc sysret_signal
-        TRACE_IRQS_ON
-        ENABLE_INTERRUPTS(CLBR_NONE)
-        pushq_cfi %rdi
-        SCHEDULE_USER
-        popq_cfi %rdi
-        jmp sysret_check
 
-        /* Handle a signal */
-sysret_signal:
-        TRACE_IRQS_ON
-        ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
-        bt $TIF_SYSCALL_AUDIT,%edx
-        jc sysret_audit
-#endif
-        /*
-         * We have a signal, or exit tracing or single-step.
-         * These all wind up with the iret return path anyway,
-         * so just join that path right now.
-         */
+int_ret_from_sys_call_fixup:
         FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-        jmp int_check_syscall_exit_work
-
-#ifdef CONFIG_AUDITSYSCALL
-        /*
-         * Return fast path for syscall audit.  Call __audit_syscall_exit()
-         * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
-         * masked off.
-         */
-sysret_audit:
-        movq RAX-ARGOFFSET(%rsp),%rsi   /* second arg, syscall return value */
-        cmpq $-MAX_ERRNO,%rsi           /* is it < -MAX_ERRNO? */
-        setbe %al                       /* 1 if so, 0 if not */
-        movzbl %al,%edi                 /* zero-extend that into %edi */
-        call __audit_syscall_exit
-        movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-        jmp sysret_check
-#endif /* CONFIG_AUDITSYSCALL */
+        jmp int_ret_from_sys_call
 
         /* Do syscall tracing */
 tracesys:
@@ -794,6 +752,60 @@ retint_swapgs: /* return to user-space */
          */
         DISABLE_INTERRUPTS(CLBR_ANY)
         TRACE_IRQS_IRETQ
+
+        /*
+         * Try to use SYSRET instead of IRET if we're returning to
+         * a completely clean 64-bit userspace context.
+         */
+        movq (RCX-R11)(%rsp), %rcx
+        cmpq %rcx,(RIP-R11)(%rsp)               /* RCX == RIP */
+        jne opportunistic_sysret_failed
+
+        /*
+         * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+         * in kernel space.  This essentially lets the user take over
+         * the kernel, since userspace controls RSP.  It's not worth
+         * testing for canonicalness exactly -- this check detects any
+         * of the 17 high bits set, which is true for non-canonical
+         * or kernel addresses.  (This will pessimize vsyscall=native.
+         * Big deal.)
+         *
+         * If virtual addresses ever become wider, this will need
+         * to be updated to remain correct on both old and new CPUs.
+         */
+        .ifne __VIRTUAL_MASK_SHIFT - 47
+        .error "virtual address width changed -- sysret checks need update"
+        .endif
+        shr $__VIRTUAL_MASK_SHIFT, %rcx
+        jnz opportunistic_sysret_failed
+
+        cmpq $__USER_CS,(CS-R11)(%rsp)          /* CS must match SYSRET */
+        jne opportunistic_sysret_failed
+
+        movq (R11-ARGOFFSET)(%rsp), %r11
+        cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)      /* R11 == RFLAGS */
+        jne opportunistic_sysret_failed
+
+        testq $X86_EFLAGS_RF,%r11               /* sysret can't restore RF */
+        jnz opportunistic_sysret_failed
+
+        /* nothing to check for RSP */
+
+        cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)    /* SS must match SYSRET */
+        jne opportunistic_sysret_failed
+
+        /*
+         * We win!  This label is here just for ease of understanding
+         * perf profiles.  Nothing jumps here.
+         */
+irq_return_via_sysret:
+        CFI_REMEMBER_STATE
+        RESTORE_ARGS 1,8,1
+        movq (RSP-RIP)(%rsp),%rsp
+        USERGS_SYSRET64
+        CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
         SWAPGS
         jmp restore_args
 
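The opportunistic-SYSRET hunk above leans on a single shift to reject unsafe return addresses. As a reading aid, here is a hypothetical standalone C sketch of that predicate (not kernel code; VIRTUAL_MASK_SHIFT and sysret_rip_is_safe are invented stand-ins for __VIRTUAL_MASK_SHIFT and the shr/jnz pair):

/*
 * Hypothetical sketch: mirrors "shr $__VIRTUAL_MASK_SHIFT, %rcx; jnz
 * opportunistic_sysret_failed".  A return RIP is acceptable for SYSRET only
 * if none of the high 17 bits are set, which rules out both non-canonical
 * and kernel addresses.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define VIRTUAL_MASK_SHIFT 47  /* assumed 47, as the .ifne guard in the patch requires */

static bool sysret_rip_is_safe(uint64_t rip)
{
        /* True only when bits 63..47 are all clear. */
        return (rip >> VIRTUAL_MASK_SHIFT) == 0;
}

int main(void)
{
        assert(sysret_rip_is_safe(0x00007fffffffe000ULL));   /* user-space RIP: take SYSRET */
        assert(!sysret_rip_is_safe(0xffff880000000000ULL));  /* kernel address: fall back to IRET */
        assert(!sysret_rip_is_safe(0x0000800000000000ULL));  /* non-canonical: fall back to IRET */
        return 0;
}

Anything that fails this predicate jumps to opportunistic_sysret_failed and takes the existing IRET path, just as in the assembly above.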
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7176f84f95a4..c74f2f5652da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -110,15 +110,11 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 
 enum ctx_state ist_enter(struct pt_regs *regs)
 {
-        /*
-         * We are atomic because we're on the IST stack (or we're on x86_32,
-         * in which case we still shouldn't schedule.
-         */
-        preempt_count_add(HARDIRQ_OFFSET);
+        enum ctx_state prev_state;
 
         if (user_mode_vm(regs)) {
                 /* Other than that, we're just an exception. */
-                return exception_enter();
+                prev_state = exception_enter();
         } else {
                 /*
                  * We might have interrupted pretty much anything.  In
@@ -127,12 +123,27 @@ enum ctx_state ist_enter(struct pt_regs *regs)
                  * but we need to notify RCU.
                  */
                 rcu_nmi_enter();
-                return IN_KERNEL;  /* the value is irrelevant. */
+                prev_state = IN_KERNEL;  /* the value is irrelevant. */
         }
+
+        /*
+         * We are atomic because we're on the IST stack (or we're on x86_32,
+         * in which case we still shouldn't schedule).
+         *
+         * This must be after exception_enter(), because exception_enter()
+         * won't do anything if in_interrupt() returns true.
+         */
+        preempt_count_add(HARDIRQ_OFFSET);
+
+        /* This code is a bit fragile.  Test it. */
+        rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+        return prev_state;
 }
 
 void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
 {
+        /* Must be before exception_exit. */
         preempt_count_sub(HARDIRQ_OFFSET);
 
         if (user_mode_vm(regs))
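For context on how the reordered ist_enter()/ist_exit() pair is consumed, a hypothetical sketch of an IST trap handler follows (do_example_trap is an invented name; the fragment assumes the usual kernel headers rather than being buildable on its own):

/*
 * Hypothetical handler sketch.  ist_enter() notifies RCU (or context
 * tracking) first and only then adds HARDIRQ_OFFSET, per the hunk above;
 * ist_exit() undoes both in the opposite order.
 */
dotraplinkage void do_example_trap(struct pt_regs *regs, long error_code)
{
        enum ctx_state prev_state = ist_enter(regs);

        /*
         * Handler body runs here: preemption is effectively disabled and
         * RCU is watching even if the trap arrived from kernel mode.
         */

        ist_exit(regs, prev_state);
}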
