aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/ia32
diff options
context:
space:
mode:
authorAndy Lutomirski <luto@kernel.org>2015-04-02 20:12:12 -0400
committerIngo Molnar <mingo@kernel.org>2015-04-03 03:14:00 -0400
commit4214a16b02971c60960afd675d03544e109e0d75 (patch)
tree55e0e1894558278786769103bd877711a7a2d083 /arch/x86/ia32
parentcf9328cc9989e028fdc64d8c0a7b1b043dc96735 (diff)
x86/asm/entry/64/compat: Use SYSRETL to return from compat mode SYSENTER
SYSEXIT is scary on 64-bit kernels -- SYSEXIT must be invoked with usergs and IRQs on. That means that we rely on STI to correctly mask interrupts for one instruction. This is okay by itself, but the semantics with respect to NMIs are unclear. Avoid the whole issue by using SYSRETL instead. For background, Intel CPUs don't allow SYSCALL from compat mode, but they do allow SYSRETL back to compat mode. Go figure. To avoid doing too much at once, this doesn't revamp the calling convention. We still return with EBP, EDX, and ECX on the user stack. Oddly this seems to be 30 cycles or so faster. Avoiding POPFQ and STI will account for under half of that, I think, so my best guess is that Intel just optimizes SYSRET much better than SYSEXIT. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/57a0bf1b5230b2716a64ebe48e9bc1110f7ab433.1428019097.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/ia32')
-rw-r--r--arch/x86/ia32/ia32entry.S53
1 file changed, 40 insertions, 13 deletions
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 8d01cce7b6b8..5d8f987a340d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -186,28 +186,55 @@ sysenter_dispatch:
186 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 186 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
187 jnz sysexit_audit 187 jnz sysexit_audit
188sysexit_from_sys_call: 188sysexit_from_sys_call:
189 /*
190 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
191 * NMI between STI and SYSEXIT has poorly specified behavior,
192 * and an NMI followed by an IRQ with usergs is fatal. So
193 * we just pretend we're using SYSEXIT but we really use
194 * SYSRETL instead.
195 *
196 * This code path is still called 'sysexit' because it pairs
197 * with 'sysenter' and it uses the SYSENTER calling convention.
198 */
189 andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) 199 andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
190 /* clear IF, that popfq doesn't enable interrupts early */ 200 movl RIP(%rsp),%ecx /* User %eip */
191 andl $~0x200,EFLAGS(%rsp) 201 CFI_REGISTER rip,rcx
192 movl RIP(%rsp),%edx /* User %eip */
193 CFI_REGISTER rip,rdx
194 RESTORE_RSI_RDI 202 RESTORE_RSI_RDI
195 /* pop everything except ss,rsp,rflags slots */ 203 xorl %edx,%edx /* avoid info leaks */
196 REMOVE_PT_GPREGS_FROM_STACK 3*8
197 xorq %r8,%r8 204 xorq %r8,%r8
198 xorq %r9,%r9 205 xorq %r9,%r9
199 xorq %r10,%r10 206 xorq %r10,%r10
200 xorq %r11,%r11 207 movl EFLAGS(%rsp),%r11d /* User eflags */
201 popfq_cfi
202 /*CFI_RESTORE rflags*/ 208 /*CFI_RESTORE rflags*/
203 popq_cfi %rcx /* User %esp */
204 CFI_REGISTER rsp,rcx
205 TRACE_IRQS_ON 209 TRACE_IRQS_ON
210
206 /* 211 /*
207 * 32bit SYSEXIT restores eip from edx, esp from ecx. 212 * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
208 * cs and ss are loaded from MSRs. 213 * since it avoids a dicey window with interrupts enabled.
209 */ 214 */
210 ENABLE_INTERRUPTS_SYSEXIT32 215 movl RSP(%rsp),%esp
216
217 /*
218 * USERGS_SYSRET32 does:
219 * gsbase = user's gs base
220 * eip = ecx
221 * rflags = r11
222 * cs = __USER32_CS
223 * ss = __USER_DS
224 *
225 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
226 *
227 * pop %ebp
228 * pop %edx
229 * pop %ecx
230 *
231 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
232 * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
233 * address (already known to user code), and R12-R15 are
234 * callee-saved and therefore don't contain any interesting
235 * kernel data.
236 */
237 USERGS_SYSRET32
211 238
212 CFI_RESTORE_STATE 239 CFI_RESTORE_STATE
213 240