-rw-r--r--  arch/x86/kernel/entry_64.S | 106
-rw-r--r--  arch/x86/kernel/traps.c    |  25
2 files changed, 77 insertions, 54 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 501212f14c87..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -361,15 +361,12 @@ system_call_fastpath:
  * Has incomplete stack frame and undefined top of stack.
  */
 ret_from_sys_call:
-	movl $_TIF_ALLWORK_MASK,%edi
-	/* edi:	flagmask */
-sysret_check:
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
+
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
-	andl %edi,%edx
-	jnz sysret_careful
 	CFI_REMEMBER_STATE
 	/*
 	 * sysretq will re-enable interrupts:
@@ -383,49 +380,10 @@ sysret_check:
 	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
-	/* Handle reschedules */
-	/* edx:	work, edi: workmask */
-sysret_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
-	SCHEDULE_USER
-	popq_cfi %rdi
-	jmp sysret_check
 
-	/* Handle a signal */
-sysret_signal:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
-	bt $TIF_SYSCALL_AUDIT,%edx
-	jc sysret_audit
-#endif
-	/*
-	 * We have a signal, or exit tracing or single-step.
-	 * These all wind up with the iret return path anyway,
-	 * so just join that path right now.
-	 */
+int_ret_from_sys_call_fixup:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-	jmp int_check_syscall_exit_work
-
-#ifdef CONFIG_AUDITSYSCALL
-	/*
-	 * Return fast path for syscall audit. Call __audit_syscall_exit()
-	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
-	 * masked off.
-	 */
-sysret_audit:
-	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $-MAX_ERRNO,%rsi		/* is it < -MAX_ERRNO? */
-	setbe %al			/* 1 if so, 0 if not */
-	movzbl %al,%edi			/* zero-extend that into %edi */
-	call __audit_syscall_exit
-	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-	jmp sysret_check
-#endif	/* CONFIG_AUDITSYSCALL */
+	jmp int_ret_from_sys_call
 
 	/* Do syscall tracing */
 tracesys:
@@ -794,6 +752,60 @@ retint_swapgs: /* return to user-space */
 	 */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq (RCX-R11)(%rsp), %rcx
+	cmpq %rcx,(RIP-R11)(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.  It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses.  (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- sysret checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,(CS-R11)(%rsp)		/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq (R11-ARGOFFSET)(%rsp), %r11
+	cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)	/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	testq $X86_EFLAGS_RF,%r11		/* sysret can't restore RF */
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win!  This label is here just for ease of understanding
+	 * perf profiles.  Nothing jumps here.
+	 */
+irq_return_via_sysret:
+	CFI_REMEMBER_STATE
+	RESTORE_ARGS 1,8,1
+	movq (RSP-RIP)(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
 	SWAPGS
 	jmp restore_args
 
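The new code at retint_swapgs only takes the SYSRET exit when the saved user context is one that SYSRET can reproduce exactly: RCX must equal RIP, that address must have none of its 17 high bits set, CS and SS must be the standard user selectors, R11 must equal the saved RFLAGS, and RF must be clear. As a rough illustration (not kernel code; the struct, constant values, and function name below are hypothetical stand-ins for the saved pt_regs fields), the eligibility test amounts to:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical mirror of the user registers saved on the kernel stack. */
struct saved_regs {
	uint64_t rip, rcx, r11, rflags, cs, ss;
};

#define VIRT_SHIFT	47		/* __VIRTUAL_MASK_SHIFT on current CPUs */
#define USER_CS		0x33ULL		/* illustrative x86-64 user selectors */
#define USER_DS		0x2bULL
#define FLAG_RF		(1ULL << 16)	/* X86_EFLAGS_RF */

/* Sketch of the checks performed before irq_return_via_sysret. */
static bool can_return_via_sysret(const struct saved_regs *r)
{
	if (r->rcx != r->rip)		/* SYSRET reloads RIP from RCX */
		return false;
	if (r->rip >> VIRT_SHIFT)	/* any of the high 17 bits set:
					   non-canonical or kernel address */
		return false;
	if (r->cs != USER_CS)		/* SYSRET forces these selectors */
		return false;
	if (r->ss != USER_DS)
		return false;
	if (r->r11 != r->rflags)	/* SYSRET reloads RFLAGS from R11 */
		return false;
	if (r->rflags & FLAG_RF)	/* SYSRET can't restore RF */
		return false;
	return true;			/* otherwise fall back to SWAPGS + IRET */
}

If any check fails, control falls through to opportunistic_sysret_failed and the existing SWAPGS + IRET path is used; RSP itself needs no check because it is reloaded directly from the saved frame (movq (RSP-RIP)(%rsp),%rsp) just before USERGS_SYSRET64.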
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7176f84f95a4..c74f2f5652da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -110,15 +110,11 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 
 enum ctx_state ist_enter(struct pt_regs *regs)
 {
-	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule.
-	 */
-	preempt_count_add(HARDIRQ_OFFSET);
+	enum ctx_state prev_state;
 
 	if (user_mode_vm(regs)) {
 		/* Other than that, we're just an exception. */
-		return exception_enter();
+		prev_state = exception_enter();
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -127,12 +123,27 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		return IN_KERNEL;  /* the value is irrelevant. */
+		prev_state = IN_KERNEL;  /* the value is irrelevant. */
 	}
+
+	/*
+	 * We are atomic because we're on the IST stack (or we're on x86_32,
+	 * in which case we still shouldn't schedule).
+	 *
+	 * This must be after exception_enter(), because exception_enter()
+	 * won't do anything if in_interrupt() returns true.
+	 */
+	preempt_count_add(HARDIRQ_OFFSET);
+
+	/* This code is a bit fragile.  Test it. */
+	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+	return prev_state;
 }
 
 void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
 {
+	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
 	if (user_mode_vm(regs))
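With this reordering, ist_enter() establishes the context-tracking (or RCU NMI) state before raising the HARDIRQ preempt count, since exception_enter() does nothing once in_interrupt() is true, and ist_exit() drops the count before leaving the exception context. A minimal sketch of how a handler is expected to use the pair, assuming the surrounding kernel declarations; the handler name and body are hypothetical and only the prev_state plumbing mirrors the code above:

dotraplinkage void do_example_ist_trap(struct pt_regs *regs, long error_code)
{
	/* Enters exception/RCU context, then raises the HARDIRQ count. */
	enum ctx_state prev_state = ist_enter(regs);

	/*
	 * From here on we are effectively atomic: scheduling or sleeping
	 * would be a bug, even if the trap arrived from userspace.
	 */

	/* ... handle the trap ... */

	/* Drops the HARDIRQ count first, then leaves the exception context. */
	ist_exit(regs, prev_state);
}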