-rw-r--r-- | arch/x86/kernel/entry_64.S | 106
-rw-r--r-- | arch/x86/kernel/traps.c    |  25
2 files changed, 77 insertions, 54 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 501212f14c87..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -361,15 +361,12 @@ system_call_fastpath:
  * Has incomplete stack frame and undefined top of stack.
  */
 ret_from_sys_call:
-	movl $_TIF_ALLWORK_MASK,%edi
-	/* edi: flagmask */
-sysret_check:
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
+
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
-	andl %edi,%edx
-	jnz  sysret_careful
 	CFI_REMEMBER_STATE
 	/*
 	 * sysretq will re-enable interrupts:
@@ -383,49 +380,10 @@ sysret_check:
 	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
-	/* Handle reschedules */
-	/* edx: work, edi: workmask */
-sysret_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
-	SCHEDULE_USER
-	popq_cfi %rdi
-	jmp sysret_check
 
-	/* Handle a signal */
-sysret_signal:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
-	bt $TIF_SYSCALL_AUDIT,%edx
-	jc sysret_audit
-#endif
-	/*
-	 * We have a signal, or exit tracing or single-step.
-	 * These all wind up with the iret return path anyway,
-	 * so just join that path right now.
-	 */
+int_ret_from_sys_call_fixup:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-	jmp int_check_syscall_exit_work
-
-#ifdef CONFIG_AUDITSYSCALL
-	/*
-	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
-	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
-	 * masked off.
-	 */
-sysret_audit:
-	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $-MAX_ERRNO,%rsi		/* is it < -MAX_ERRNO? */
-	setbe %al			/* 1 if so, 0 if not */
-	movzbl %al,%edi			/* zero-extend that into %edi */
-	call __audit_syscall_exit
-	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-	jmp sysret_check
-#endif	/* CONFIG_AUDITSYSCALL */
+	jmp int_ret_from_sys_call
 
 	/* Do syscall tracing */
 tracesys:
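The two hunks above collapse the syscall-exit fast-path bookkeeping into a single test: ret_from_sys_call now checks TI_flags against _TIF_ALLWORK_MASK once and, if any work bit is set, jumps through int_ret_from_sys_call_fixup to the existing generic slow path instead of handling reschedule, signals and audit in the hand-rolled sysret_careful/sysret_signal/sysret_audit branches. A rough standalone C sketch of the new control flow follows; the flag bit positions and the ALLWORK_MASK name are made up for illustration, the real definitions live in the kernel's thread_info headers.

#include <stdio.h>

/* Hypothetical flag layout, for illustration only. */
#define TIF_SIGPENDING     (1u << 2)
#define TIF_NEED_RESCHED   (1u << 3)
#define TIF_SYSCALL_AUDIT  (1u << 7)
#define ALLWORK_MASK       (TIF_SIGPENDING | TIF_NEED_RESCHED | TIF_SYSCALL_AUDIT)

int main(void)
{
	unsigned int ti_flags = TIF_SYSCALL_AUDIT;	/* pretend audit is enabled */

	/* Equivalent of the new single testl in ret_from_sys_call. */
	if (ti_flags & ALLWORK_MASK)
		printf("work pending -> take the int_ret_from_sys_call slow path\n");
	else
		printf("no work -> SYSRET straight back to userspace\n");
	return 0;
}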
@@ -794,6 +752,60 @@ retint_swapgs:		/* return to user-space */
 	 */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq (RCX-R11)(%rsp), %rcx
+	cmpq %rcx,(RIP-R11)(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.  It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses.  (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- sysret checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,(CS-R11)(%rsp)		/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq (R11-ARGOFFSET)(%rsp), %r11
+	cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)	/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	testq $X86_EFLAGS_RF,%r11		/* sysret can't restore RF */
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win!  This label is here just for ease of understanding
+	 * perf profiles.  Nothing jumps here.
+	 */
+irq_return_via_sysret:
+	CFI_REMEMBER_STATE
+	RESTORE_ARGS 1,8,1
+	movq (RSP-RIP)(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
 	SWAPGS
 	jmp restore_args
 
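The retint_swapgs hunk above adds the "opportunistic SYSRET" path: on the way back to userspace through the interrupt-return code, the kernel uses SYSRET instead of IRET whenever the saved frame already satisfies everything SYSRET needs. A standalone C sketch of that eligibility test follows; struct saved_regs, can_sysret() and the numeric selector values are stand-ins chosen for illustration, not the kernel's pt_regs layout or segment constants.

#include <stdbool.h>
#include <stdint.h>

#define VIRTUAL_MASK_SHIFT	47		/* 48-bit virtual addresses on current CPUs */
#define X86_EFLAGS_RF		0x00010000UL
#define USER_CS			0x33		/* assumed 64-bit user code selector */
#define USER_DS			0x2b		/* assumed user data/stack selector */

struct saved_regs {				/* simplified stand-in for the saved frame */
	uint64_t rip, rcx, r11, rflags, cs, ss;
};

static bool can_sysret(const struct saved_regs *r)
{
	if (r->rcx != r->rip)			/* SYSRET reloads RIP from RCX */
		return false;
	/*
	 * Shifting out the low 47 bits leaves the 17 high bits; any of them
	 * set means a non-canonical or kernel address, which must not be
	 * handed to SYSRET (it would #GP in kernel space on Intel CPUs).
	 */
	if (r->rcx >> VIRTUAL_MASK_SHIFT)
		return false;
	if (r->cs != USER_CS)			/* CS must match what SYSRET loads */
		return false;
	if (r->r11 != r->rflags)		/* SYSRET reloads RFLAGS from R11 */
		return false;
	if (r->rflags & X86_EFLAGS_RF)		/* SYSRET can't restore RF */
		return false;
	if (r->ss != USER_DS)			/* SS must match what SYSRET loads */
		return false;
	return true;				/* clean context: SYSRET is safe */
}

int main(void)
{
	struct saved_regs r = {
		.rip = 0x400080, .rcx = 0x400080,	/* RCX == RIP, canonical, user */
		.r11 = 0x246, .rflags = 0x246,		/* R11 == RFLAGS, RF clear */
		.cs = USER_CS, .ss = USER_DS,
	};
	return can_sysret(&r) ? 0 : 1;			/* exits 0: SYSRET eligible */
}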
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7176f84f95a4..c74f2f5652da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -110,15 +110,11 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 
 enum ctx_state ist_enter(struct pt_regs *regs)
 {
-	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule.
-	 */
-	preempt_count_add(HARDIRQ_OFFSET);
+	enum ctx_state prev_state;
 
 	if (user_mode_vm(regs)) {
 		/* Other than that, we're just an exception. */
-		return exception_enter();
+		prev_state = exception_enter();
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -127,12 +123,27 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		return IN_KERNEL;  /* the value is irrelevant. */
+		prev_state = IN_KERNEL;  /* the value is irrelevant. */
 	}
+
+	/*
+	 * We are atomic because we're on the IST stack (or we're on x86_32,
+	 * in which case we still shouldn't schedule).
+	 *
+	 * This must be after exception_enter(), because exception_enter()
+	 * won't do anything if in_interrupt() returns true.
+	 */
+	preempt_count_add(HARDIRQ_OFFSET);
+
+	/* This code is a bit fragile.  Test it. */
+	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+	return prev_state;
 }
 
 void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
 {
+	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
 	if (user_mode_vm(regs))
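The traps.c change reorders ist_enter(): the HARDIRQ preempt-count bump now comes after exception_enter()/rcu_nmi_enter(), because (as the new comment says) exception_enter() does nothing once in_interrupt() returns true, so bumping the count first could leave RCU unaware of an IST exception taken from userspace. A toy standalone C sketch of that ordering hazard is below; preempt_count, in_interrupt() and exception_enter() here are trivial stand-ins for the real primitives, not the kernel implementations.

#include <stdbool.h>
#include <stdio.h>

#define HARDIRQ_OFFSET	(1 << 16)

static int preempt_count;			/* stand-in for the real per-CPU count */

static bool in_interrupt(void)
{
	return preempt_count != 0;
}

/* Stand-in: the real exception_enter() skips context tracking when in_interrupt() is true. */
static void exception_enter(const char *when)
{
	if (in_interrupt())
		printf("%s: exception_enter() bailed -- user entry not tracked\n", when);
	else
		printf("%s: context tracked, RCU is watching\n", when);
}

int main(void)
{
	/* Old order: bump the HARDIRQ count first, then try to track context. */
	preempt_count += HARDIRQ_OFFSET;
	exception_enter("old order");
	preempt_count -= HARDIRQ_OFFSET;

	/* New order from the patch: track context first, then bump the count. */
	exception_enter("new order");
	preempt_count += HARDIRQ_OFFSET;
	preempt_count -= HARDIRQ_OFFSET;
	return 0;
}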