author      Linus Torvalds <torvalds@linux-foundation.org>  2016-03-15 12:32:27 -0400
committer   Linus Torvalds <torvalds@linux-foundation.org>  2016-03-15 12:32:27 -0400
commit      ba33ea811e1ff6726abb7f8f96df38c2d7b50304 (patch)
tree        29134e5cc7c19c8e520cb9336b476144d3d1252f /arch/x86/entry
parent      e23604edac2a7be6a8808a5d13fac6b9df4eb9a8 (diff)
parent      d05004944206cbbf1c453e179768163731c7c6f1 (diff)
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm updates from Ingo Molnar:
 "This is another big update. Main changes are:

   - lots of x86 system call (and other traps/exceptions) entry code
     enhancements. In particular the complex parts of the 64-bit entry
     code have been migrated to C code as well, and a number of dusty
     corners have been refreshed. (Andy Lutomirski)

   - vDSO special mapping robustification and general cleanups (Andy
     Lutomirski)

   - cpufeature refactoring, cleanups and speedups (Borislav Petkov)

   - lots of other changes ..."

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  x86/cpufeature: Enable new AVX-512 features
  x86/entry/traps: Show unhandled signal for i386 in do_trap()
  x86/entry: Call enter_from_user_mode() with IRQs off
  x86/entry/32: Change INT80 to be an interrupt gate
  x86/entry: Improve system call entry comments
  x86/entry: Remove TIF_SINGLESTEP entry work
  x86/entry/32: Add and check a stack canary for the SYSENTER stack
  x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup
  x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed
  x86/entry: Vastly simplify SYSENTER TF (single-step) handling
  x86/entry/traps: Clear DR6 early in do_debug() and improve the comment
  x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions
  x86/entry/32: Restore FLAGS on SYSEXIT
  x86/entry/32: Filter NT and speed up AC filtering in SYSENTER
  x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test
  selftests/x86: In syscall_nt, test NT|TF as well
  x86/asm-offsets: Remove PARAVIRT_enabled
  x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled
  uprobes: __create_xol_area() must nullify xol_mapping.fault
  x86/cpufeature: Create a new synthetic cpu capability for machine check recovery
  ...
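The series touches both system call entry paths on x86: the native 64-bit SYSCALL path (entry_SYSCALL_64, now largely dispatched from C via do_syscall_64) and the legacy INT $0x80 path (entry_INT80_32 / entry_INT80_compat). As a stand-alone point of reference (not part of this series), the following user-space C snippet exercises both entries on an x86-64 kernel; it assumes the usual 32-bit getpid number 20, and the INT $0x80 half requires CONFIG_IA32_EMULATION:

#include <stdio.h>
#include <sys/syscall.h>

int main(void)
{
	long native, legacy;

	/* Native 64-bit path: entry_SYSCALL_64. SYSCALL clobbers rcx/r11. */
	asm volatile ("syscall"
		      : "=a" (native)
		      : "a" ((long)SYS_getpid)
		      : "rcx", "r11", "memory");

	/* Legacy path: entry_INT80_compat on 64-bit kernels (32-bit getpid = 20). */
	asm volatile ("int $0x80"
		      : "=a" (legacy)
		      : "a" (20L)
		      : "memory");

	printf("getpid via syscall: %ld, via int $0x80: %ld\n", native, legacy);
	return 0;
}

Both calls should return the same PID; the kernel-side handlers they land in are the ones reworked in the hunks below.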
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--  arch/x86/entry/calling.h                 |  31
-rw-r--r--  arch/x86/entry/common.c                  | 106
-rw-r--r--  arch/x86/entry/entry_32.S                | 268
-rw-r--r--  arch/x86/entry/entry_64.S                | 286
-rw-r--r--  arch/x86/entry/entry_64_compat.S         | 102
-rw-r--r--  arch/x86/entry/syscall_32.c              |  10
-rw-r--r--  arch/x86/entry/syscall_64.c              |  13
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl   |  20
-rw-r--r--  arch/x86/entry/syscalls/syscalltbl.sh    |  58
-rw-r--r--  arch/x86/entry/vdso/vdso2c.h             |   7
-rw-r--r--  arch/x86/entry/vdso/vdso32-setup.c       |   1
-rw-r--r--  arch/x86/entry/vdso/vdso32/system_call.S |   2
-rw-r--r--  arch/x86/entry/vdso/vma.c                | 127
-rw-r--r--  arch/x86/entry/vsyscall/vsyscall_gtod.c  |   9
14 files changed, 566 insertions, 474 deletions
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index e32206e09868..9a9e5884066c 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
201 .byte 0xf1 201 .byte 0xf1
202 .endm 202 .endm
203 203
204#else /* CONFIG_X86_64 */
205
206/*
207 * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
208 * are different from the entry_32.S versions in not changing the segment
209 * registers. So only suitable for in kernel use, not when transitioning
210 * from or to user space. The resulting stack frame is not a standard
211 * pt_regs frame. The main use case is calling C code from assembler
212 * when all the registers need to be preserved.
213 */
214
215 .macro SAVE_ALL
216 pushl %eax
217 pushl %ebp
218 pushl %edi
219 pushl %esi
220 pushl %edx
221 pushl %ecx
222 pushl %ebx
223 .endm
224
225 .macro RESTORE_ALL
226 popl %ebx
227 popl %ecx
228 popl %edx
229 popl %esi
230 popl %edi
231 popl %ebp
232 popl %eax
233 .endm
234
235#endif /* CONFIG_X86_64 */ 204#endif /* CONFIG_X86_64 */
236 205
237/* 206/*
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 03663740c866..e79d93d44ecd 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -26,6 +26,7 @@
26#include <asm/traps.h> 26#include <asm/traps.h>
27#include <asm/vdso.h> 27#include <asm/vdso.h>
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/cpufeature.h>
29 30
30#define CREATE_TRACE_POINTS 31#define CREATE_TRACE_POINTS
31#include <trace/events/syscalls.h> 32#include <trace/events/syscalls.h>
@@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void)
44 CT_WARN_ON(ct_state() != CONTEXT_USER); 45 CT_WARN_ON(ct_state() != CONTEXT_USER);
45 user_exit(); 46 user_exit();
46} 47}
48#else
49static inline void enter_from_user_mode(void) {}
47#endif 50#endif
48 51
49static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) 52static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
84 87
85 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; 88 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
86 89
87#ifdef CONFIG_CONTEXT_TRACKING
88 /*
89 * If TIF_NOHZ is set, we are required to call user_exit() before
90 * doing anything that could touch RCU.
91 */
92 if (work & _TIF_NOHZ) {
93 enter_from_user_mode();
94 work &= ~_TIF_NOHZ;
95 }
96#endif
97
98#ifdef CONFIG_SECCOMP 90#ifdef CONFIG_SECCOMP
99 /* 91 /*
100 * Do seccomp first -- it should minimize exposure of other 92 * Do seccomp first -- it should minimize exposure of other
@@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
171 if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) 163 if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
172 BUG_ON(regs != task_pt_regs(current)); 164 BUG_ON(regs != task_pt_regs(current));
173 165
174 /*
175 * If we stepped into a sysenter/syscall insn, it trapped in
176 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
177 * If user-mode had set TF itself, then it's still clear from
178 * do_debug() and we need to set it again to restore the user
179 * state. If we entered on the slow path, TF was already set.
180 */
181 if (work & _TIF_SINGLESTEP)
182 regs->flags |= X86_EFLAGS_TF;
183
184#ifdef CONFIG_SECCOMP 166#ifdef CONFIG_SECCOMP
185 /* 167 /*
186 * Call seccomp_phase2 before running the other hooks so that 168 * Call seccomp_phase2 before running the other hooks so that
@@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
268/* Called with IRQs disabled. */ 250/* Called with IRQs disabled. */
269__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) 251__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
270{ 252{
253 struct thread_info *ti = pt_regs_to_thread_info(regs);
271 u32 cached_flags; 254 u32 cached_flags;
272 255
273 if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) 256 if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
275 258
276 lockdep_sys_exit(); 259 lockdep_sys_exit();
277 260
278 cached_flags = 261 cached_flags = READ_ONCE(ti->flags);
279 READ_ONCE(pt_regs_to_thread_info(regs)->flags);
280 262
281 if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) 263 if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
282 exit_to_usermode_loop(regs, cached_flags); 264 exit_to_usermode_loop(regs, cached_flags);
283 265
266#ifdef CONFIG_COMPAT
267 /*
268 * Compat syscalls set TS_COMPAT. Make sure we clear it before
269 * returning to user mode. We need to clear it *after* signal
270 * handling, because syscall restart has a fixup for compat
271 * syscalls. The fixup is exercised by the ptrace_syscall_32
272 * selftest.
273 */
274 ti->status &= ~TS_COMPAT;
275#endif
276
284 user_enter(); 277 user_enter();
285} 278}
286 279
@@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
332 if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) 325 if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
333 syscall_slow_exit_work(regs, cached_flags); 326 syscall_slow_exit_work(regs, cached_flags);
334 327
335#ifdef CONFIG_COMPAT 328 local_irq_disable();
329 prepare_exit_to_usermode(regs);
330}
331
332#ifdef CONFIG_X86_64
333__visible void do_syscall_64(struct pt_regs *regs)
334{
335 struct thread_info *ti = pt_regs_to_thread_info(regs);
336 unsigned long nr = regs->orig_ax;
337
338 enter_from_user_mode();
339 local_irq_enable();
340
341 if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
342 nr = syscall_trace_enter(regs);
343
336 /* 344 /*
337 * Compat syscalls set TS_COMPAT. Make sure we clear it before 345 * NB: Native and x32 syscalls are dispatched from the same
338 * returning to user mode. 346 * table. The only functional difference is the x32 bit in
347 * regs->orig_ax, which changes the behavior of some syscalls.
339 */ 348 */
340 ti->status &= ~TS_COMPAT; 349 if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
341#endif 350 regs->ax = sys_call_table[nr & __SYSCALL_MASK](
351 regs->di, regs->si, regs->dx,
352 regs->r10, regs->r8, regs->r9);
353 }
342 354
343 local_irq_disable(); 355 syscall_return_slowpath(regs);
344 prepare_exit_to_usermode(regs);
345} 356}
357#endif
346 358
347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 359#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
348/* 360/*
349 * Does a 32-bit syscall. Called with IRQs on and does all entry and 361 * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
350 * exit work and returns with IRQs off. This function is extremely hot 362 * all entry and exit work and returns with IRQs off. This function is
351 * in workloads that use it, and it's usually called from 363 * extremely hot in workloads that use it, and it's usually called from
352 * do_fast_syscall_32, so forcibly inline it to improve performance. 364 * do_fast_syscall_32, so forcibly inline it to improve performance.
353 */ 365 */
354#ifdef CONFIG_X86_32 366static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
355/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
356__visible
357#else
358/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
359static
360#endif
361__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
362{ 367{
363 struct thread_info *ti = pt_regs_to_thread_info(regs); 368 struct thread_info *ti = pt_regs_to_thread_info(regs);
364 unsigned int nr = (unsigned int)regs->orig_ax; 369 unsigned int nr = (unsigned int)regs->orig_ax;
@@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
393 syscall_return_slowpath(regs); 398 syscall_return_slowpath(regs);
394} 399}
395 400
396#ifdef CONFIG_X86_64 401/* Handles int $0x80 */
397/* Handles INT80 on 64-bit kernels */ 402__visible void do_int80_syscall_32(struct pt_regs *regs)
398__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
399{ 403{
404 enter_from_user_mode();
400 local_irq_enable(); 405 local_irq_enable();
401 do_syscall_32_irqs_on(regs); 406 do_syscall_32_irqs_on(regs);
402} 407}
403#endif
404 408
405/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ 409/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
406__visible long do_fast_syscall_32(struct pt_regs *regs) 410__visible long do_fast_syscall_32(struct pt_regs *regs)
@@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
420 */ 424 */
421 regs->ip = landing_pad; 425 regs->ip = landing_pad;
422 426
423 /* 427 enter_from_user_mode();
424 * Fetch EBP from where the vDSO stashed it. 428
425 *
426 * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
427 */
428 local_irq_enable(); 429 local_irq_enable();
430
431 /* Fetch EBP from where the vDSO stashed it. */
429 if ( 432 if (
430#ifdef CONFIG_X86_64 433#ifdef CONFIG_X86_64
431 /* 434 /*
@@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
443 /* User code screwed up. */ 446 /* User code screwed up. */
444 local_irq_disable(); 447 local_irq_disable();
445 regs->ax = -EFAULT; 448 regs->ax = -EFAULT;
446#ifdef CONFIG_CONTEXT_TRACKING
447 enter_from_user_mode();
448#endif
449 prepare_exit_to_usermode(regs); 449 prepare_exit_to_usermode(regs);
450 return 0; /* Keep it simple: use IRET. */ 450 return 0; /* Keep it simple: use IRET. */
451 } 451 }
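The new do_syscall_64() above dispatches through sys_call_table after masking the number with __SYSCALL_MASK (so native and x32 calls share one table) and bounds-checking it against NR_syscalls, with -ENOSYS left in pt_regs->ax when the number is out of range. A minimal user-space mock of that dispatch pattern, compilable on its own (all mock_* names and values are invented for illustration and are not kernel symbols):

#include <stdio.h>

typedef long (*mock_syscall_fn)(long, long, long, long, long, long);

static long mock_sys_getpid(long a, long b, long c, long d, long e, long f)
{
	(void)a; (void)b; (void)c; (void)d; (void)e; (void)f;
	return 1234;
}

#define MOCK_SYSCALL_MASK	0x3ff	/* stands in for __SYSCALL_MASK */
#define MOCK_NR_syscalls	1	/* stands in for NR_syscalls */

static const mock_syscall_fn mock_sys_call_table[MOCK_NR_syscalls] = {
	[0] = mock_sys_getpid,
};

static long mock_do_syscall_64(unsigned long nr, const long args[6])
{
	long ax = -38;	/* -ENOSYS, mirroring the pre-loaded pt_regs->ax */

	if ((nr & MOCK_SYSCALL_MASK) < MOCK_NR_syscalls)
		ax = mock_sys_call_table[nr & MOCK_SYSCALL_MASK](
			args[0], args[1], args[2], args[3], args[4], args[5]);
	return ax;
}

int main(void)
{
	const long args[6] = { 0 };

	printf("nr 0 -> %ld\n", mock_do_syscall_64(0, args));	/* 1234 */
	printf("nr 7 -> %ld\n", mock_do_syscall_64(7, args));	/* -38: out of range */
	return 0;
}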
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index bb3e376d0f33..10868aa734dc 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -40,7 +40,7 @@
40#include <asm/processor-flags.h> 40#include <asm/processor-flags.h>
41#include <asm/ftrace.h> 41#include <asm/ftrace.h>
42#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
43#include <asm/cpufeature.h> 43#include <asm/cpufeatures.h>
44#include <asm/alternative-asm.h> 44#include <asm/alternative-asm.h>
45#include <asm/asm.h> 45#include <asm/asm.h>
46#include <asm/smap.h> 46#include <asm/smap.h>
@@ -287,14 +287,64 @@ need_resched:
287END(resume_kernel) 287END(resume_kernel)
288#endif 288#endif
289 289
290 # SYSENTER call handler stub 290GLOBAL(__begin_SYSENTER_singlestep_region)
291/*
292 * All code from here through __end_SYSENTER_singlestep_region is subject
293 * to being single-stepped if a user program sets TF and executes SYSENTER.
294 * There is absolutely nothing that we can do to prevent this from happening
295 * (thanks Intel!). To keep our handling of this situation as simple as
296 * possible, we handle TF just like AC and NT, except that our #DB handler
297 * will ignore all of the single-step traps generated in this range.
298 */
299
300#ifdef CONFIG_XEN
301/*
302 * Xen doesn't set %esp to be precisely what the normal SYSENTER
303 * entry point expects, so fix it up before using the normal path.
304 */
305ENTRY(xen_sysenter_target)
306 addl $5*4, %esp /* remove xen-provided frame */
307 jmp sysenter_past_esp
308#endif
309
310/*
311 * 32-bit SYSENTER entry.
312 *
313 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
314 * if X86_FEATURE_SEP is available. This is the preferred system call
315 * entry on 32-bit systems.
316 *
317 * The SYSENTER instruction, in principle, should *only* occur in the
318 * vDSO. In practice, a small number of Android devices were shipped
319 * with a copy of Bionic that inlined a SYSENTER instruction. This
320 * never happened in any of Google's Bionic versions -- it only happened
321 * in a narrow range of Intel-provided versions.
322 *
323 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
324 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
325 * SYSENTER does not save anything on the stack,
326 * and does not save old EIP (!!!), ESP, or EFLAGS.
327 *
328 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
329 * user and/or vm86 state), we explicitly disable the SYSENTER
330 * instruction in vm86 mode by reprogramming the MSRs.
331 *
332 * Arguments:
333 * eax system call number
334 * ebx arg1
335 * ecx arg2
336 * edx arg3
337 * esi arg4
338 * edi arg5
339 * ebp user stack
340 * 0(%ebp) arg6
341 */
291ENTRY(entry_SYSENTER_32) 342ENTRY(entry_SYSENTER_32)
292 movl TSS_sysenter_sp0(%esp), %esp 343 movl TSS_sysenter_sp0(%esp), %esp
293sysenter_past_esp: 344sysenter_past_esp:
294 pushl $__USER_DS /* pt_regs->ss */ 345 pushl $__USER_DS /* pt_regs->ss */
295 pushl %ebp /* pt_regs->sp (stashed in bp) */ 346 pushl %ebp /* pt_regs->sp (stashed in bp) */
296 pushfl /* pt_regs->flags (except IF = 0) */ 347 pushfl /* pt_regs->flags (except IF = 0) */
297 ASM_CLAC /* Clear AC after saving FLAGS */
298 orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ 348 orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
299 pushl $__USER_CS /* pt_regs->cs */ 349 pushl $__USER_CS /* pt_regs->cs */
300 pushl $0 /* pt_regs->ip = 0 (placeholder) */ 350 pushl $0 /* pt_regs->ip = 0 (placeholder) */
@@ -302,6 +352,29 @@ sysenter_past_esp:
302 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 352 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
303 353
304 /* 354 /*
355 * SYSENTER doesn't filter flags, so we need to clear NT, AC
356 * and TF ourselves. To save a few cycles, we can check whether
357 * either was set instead of doing an unconditional popfq.
358 * This needs to happen before enabling interrupts so that
359 * we don't get preempted with NT set.
360 *
361 * If TF is set, we will single-step all the way to here -- do_debug
362 * will ignore all the traps. (Yes, this is slow, but so is
363 * single-stepping in general. This allows us to avoid having
364 * a more complicated code to handle the case where a user program
365 * forces us to single-step through the SYSENTER entry code.)
366 *
367 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
368 * out-of-line as an optimization: NT is unlikely to be set in the
369 * majority of the cases and instead of polluting the I$ unnecessarily,
370 * we're keeping that code behind a branch which will predict as
371 * not-taken and therefore its instructions won't be fetched.
372 */
373 testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
374 jnz .Lsysenter_fix_flags
375.Lsysenter_flags_fixed:
376
377 /*
305 * User mode is traced as though IRQs are on, and SYSENTER 378 * User mode is traced as though IRQs are on, and SYSENTER
306 * turned them off. 379 * turned them off.
307 */ 380 */
@@ -327,6 +400,15 @@ sysenter_past_esp:
327 popl %eax /* pt_regs->ax */ 400 popl %eax /* pt_regs->ax */
328 401
329 /* 402 /*
403 * Restore all flags except IF. (We restore IF separately because
404 * STI gives a one-instruction window in which we won't be interrupted,
405 * whereas POPF does not.)
406 */
407 addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
408 btr $X86_EFLAGS_IF_BIT, (%esp)
409 popfl
410
411 /*
330 * Return back to the vDSO, which will pop ecx and edx. 412 * Return back to the vDSO, which will pop ecx and edx.
331 * Don't bother with DS and ES (they already contain __USER_DS). 413 * Don't bother with DS and ES (they already contain __USER_DS).
332 */ 414 */
@@ -339,28 +421,63 @@ sysenter_past_esp:
339.popsection 421.popsection
340 _ASM_EXTABLE(1b, 2b) 422 _ASM_EXTABLE(1b, 2b)
341 PTGS_TO_GS_EX 423 PTGS_TO_GS_EX
424
425.Lsysenter_fix_flags:
426 pushl $X86_EFLAGS_FIXED
427 popfl
428 jmp .Lsysenter_flags_fixed
429GLOBAL(__end_SYSENTER_singlestep_region)
342ENDPROC(entry_SYSENTER_32) 430ENDPROC(entry_SYSENTER_32)
343 431
344 # system call handler stub 432/*
433 * 32-bit legacy system call entry.
434 *
435 * 32-bit x86 Linux system calls traditionally used the INT $0x80
436 * instruction. INT $0x80 lands here.
437 *
438 * This entry point can be used by any 32-bit program to perform system calls.
439 * Instances of INT $0x80 can be found inline in various programs and
440 * libraries. It is also used by the vDSO's __kernel_vsyscall
441 * fallback for hardware that doesn't support a faster entry method.
442 * Restarted 32-bit system calls also fall back to INT $0x80
443 * regardless of what instruction was originally used to do the system
444 * call. (64-bit programs can use INT $0x80 as well, but they can
445 * only run on 64-bit kernels and therefore land in
446 * entry_INT80_compat.)
447 *
448 * This is considered a slow path. It is not used by most libc
449 * implementations on modern hardware except during process startup.
450 *
451 * Arguments:
452 * eax system call number
453 * ebx arg1
454 * ecx arg2
455 * edx arg3
456 * esi arg4
457 * edi arg5
458 * ebp arg6
459 */
345ENTRY(entry_INT80_32) 460ENTRY(entry_INT80_32)
346 ASM_CLAC 461 ASM_CLAC
347 pushl %eax /* pt_regs->orig_ax */ 462 pushl %eax /* pt_regs->orig_ax */
348 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 463 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
349 464
350 /* 465 /*
351 * User mode is traced as though IRQs are on. Unlike the 64-bit 466 * User mode is traced as though IRQs are on, and the interrupt gate
352 * case, INT80 is a trap gate on 32-bit kernels, so interrupts 467 * turned them off.
353 * are already on (unless user code is messing around with iopl).
354 */ 468 */
469 TRACE_IRQS_OFF
355 470
356 movl %esp, %eax 471 movl %esp, %eax
357 call do_syscall_32_irqs_on 472 call do_int80_syscall_32
358.Lsyscall_32_done: 473.Lsyscall_32_done:
359 474
360restore_all: 475restore_all:
361 TRACE_IRQS_IRET 476 TRACE_IRQS_IRET
362restore_all_notrace: 477restore_all_notrace:
363#ifdef CONFIG_X86_ESPFIX32 478#ifdef CONFIG_X86_ESPFIX32
479 ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX
480
364 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 481 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
365 /* 482 /*
366 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we 483 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
@@ -387,19 +504,6 @@ ENTRY(iret_exc )
387 504
388#ifdef CONFIG_X86_ESPFIX32 505#ifdef CONFIG_X86_ESPFIX32
389ldt_ss: 506ldt_ss:
390#ifdef CONFIG_PARAVIRT
391 /*
392 * The kernel can't run on a non-flat stack if paravirt mode
393 * is active. Rather than try to fixup the high bits of
394 * ESP, bypass this code entirely. This may break DOSemu
395 * and/or Wine support in a paravirt VM, although the option
396 * is still available to implement the setting of the high
397 * 16-bits in the INTERRUPT_RETURN paravirt-op.
398 */
399 cmpl $0, pv_info+PARAVIRT_enabled
400 jne restore_nocheck
401#endif
402
403/* 507/*
404 * Setup and switch to ESPFIX stack 508 * Setup and switch to ESPFIX stack
405 * 509 *
@@ -632,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
632END(spurious_interrupt_bug) 736END(spurious_interrupt_bug)
633 737
634#ifdef CONFIG_XEN 738#ifdef CONFIG_XEN
635/*
636 * Xen doesn't set %esp to be precisely what the normal SYSENTER
637 * entry point expects, so fix it up before using the normal path.
638 */
639ENTRY(xen_sysenter_target)
640 addl $5*4, %esp /* remove xen-provided frame */
641 jmp sysenter_past_esp
642
643ENTRY(xen_hypervisor_callback) 739ENTRY(xen_hypervisor_callback)
644 pushl $-1 /* orig_ax = -1 => not a system call */ 740 pushl $-1 /* orig_ax = -1 => not a system call */
645 SAVE_ALL 741 SAVE_ALL
@@ -939,51 +1035,48 @@ error_code:
939 jmp ret_from_exception 1035 jmp ret_from_exception
940END(page_fault) 1036END(page_fault)
941 1037
942/*
943 * Debug traps and NMI can happen at the one SYSENTER instruction
944 * that sets up the real kernel stack. Check here, since we can't
945 * allow the wrong stack to be used.
946 *
947 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
948 * already pushed 3 words if it hits on the sysenter instruction:
949 * eflags, cs and eip.
950 *
951 * We just load the right stack, and push the three (known) values
952 * by hand onto the new stack - while updating the return eip past
953 * the instruction that would have done it for sysenter.
954 */
955.macro FIX_STACK offset ok label
956 cmpw $__KERNEL_CS, 4(%esp)
957 jne \ok
958\label:
959 movl TSS_sysenter_sp0 + \offset(%esp), %esp
960 pushfl
961 pushl $__KERNEL_CS
962 pushl $sysenter_past_esp
963.endm
964
965ENTRY(debug) 1038ENTRY(debug)
1039 /*
1040 * #DB can happen at the first instruction of
1041 * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
1042 * happens, then we will be running on a very small stack. We
1043 * need to detect this condition and switch to the thread
1044 * stack before calling any C code at all.
1045 *
1046 * If you edit this code, keep in mind that NMIs can happen in here.
1047 */
966 ASM_CLAC 1048 ASM_CLAC
967 cmpl $entry_SYSENTER_32, (%esp)
968 jne debug_stack_correct
969 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
970debug_stack_correct:
971 pushl $-1 # mark this as an int 1049 pushl $-1 # mark this as an int
972 SAVE_ALL 1050 SAVE_ALL
973 TRACE_IRQS_OFF
974 xorl %edx, %edx # error code 0 1051 xorl %edx, %edx # error code 0
975 movl %esp, %eax # pt_regs pointer 1052 movl %esp, %eax # pt_regs pointer
1053
1054 /* Are we currently on the SYSENTER stack? */
1055 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
1056 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
1057 cmpl $SIZEOF_SYSENTER_stack, %ecx
1058 jb .Ldebug_from_sysenter_stack
1059
1060 TRACE_IRQS_OFF
1061 call do_debug
1062 jmp ret_from_exception
1063
1064.Ldebug_from_sysenter_stack:
1065 /* We're on the SYSENTER stack. Switch off. */
1066 movl %esp, %ebp
1067 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1068 TRACE_IRQS_OFF
976 call do_debug 1069 call do_debug
1070 movl %ebp, %esp
977 jmp ret_from_exception 1071 jmp ret_from_exception
978END(debug) 1072END(debug)
979 1073
980/* 1074/*
981 * NMI is doubly nasty. It can happen _while_ we're handling 1075 * NMI is doubly nasty. It can happen on the first instruction of
982 * a debug fault, and the debug fault hasn't yet been able to 1076 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
983 * clear up the stack. So we first check whether we got an 1077 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
984 * NMI on the sysenter entry path, but after that we need to 1078 * switched stacks. We handle both conditions by simply checking whether we
985 * check whether we got an NMI on the debug path where the debug 1079 * interrupted kernel code running on the SYSENTER stack.
986 * fault happened on the sysenter path.
987 */ 1080 */
988ENTRY(nmi) 1081ENTRY(nmi)
989 ASM_CLAC 1082 ASM_CLAC
@@ -994,41 +1087,32 @@ ENTRY(nmi)
994 popl %eax 1087 popl %eax
995 je nmi_espfix_stack 1088 je nmi_espfix_stack
996#endif 1089#endif
997 cmpl $entry_SYSENTER_32, (%esp) 1090
998 je nmi_stack_fixup 1091 pushl %eax # pt_regs->orig_ax
999 pushl %eax
1000 movl %esp, %eax
1001 /*
1002 * Do not access memory above the end of our stack page,
1003 * it might not exist.
1004 */
1005 andl $(THREAD_SIZE-1), %eax
1006 cmpl $(THREAD_SIZE-20), %eax
1007 popl %eax
1008 jae nmi_stack_correct
1009 cmpl $entry_SYSENTER_32, 12(%esp)
1010 je nmi_debug_stack_check
1011nmi_stack_correct:
1012 pushl %eax
1013 SAVE_ALL 1092 SAVE_ALL
1014 xorl %edx, %edx # zero error code 1093 xorl %edx, %edx # zero error code
1015 movl %esp, %eax # pt_regs pointer 1094 movl %esp, %eax # pt_regs pointer
1095
1096 /* Are we currently on the SYSENTER stack? */
1097 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
1098 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
1099 cmpl $SIZEOF_SYSENTER_stack, %ecx
1100 jb .Lnmi_from_sysenter_stack
1101
1102 /* Not on SYSENTER stack. */
1016 call do_nmi 1103 call do_nmi
1017 jmp restore_all_notrace 1104 jmp restore_all_notrace
1018 1105
1019nmi_stack_fixup: 1106.Lnmi_from_sysenter_stack:
1020 FIX_STACK 12, nmi_stack_correct, 1 1107 /*
1021 jmp nmi_stack_correct 1108 * We're on the SYSENTER stack. Switch off. No one (not even debug)
1022 1109 * is using the thread stack right now, so it's safe for us to use it.
1023nmi_debug_stack_check: 1110 */
1024 cmpw $__KERNEL_CS, 16(%esp) 1111 movl %esp, %ebp
1025 jne nmi_stack_correct 1112 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1026 cmpl $debug, (%esp) 1113 call do_nmi
1027 jb nmi_stack_correct 1114 movl %ebp, %esp
1028 cmpl $debug_esp_fix_insn, (%esp) 1115 jmp restore_all_notrace
1029 ja nmi_stack_correct
1030 FIX_STACK 24, nmi_stack_correct, 1
1031 jmp nmi_stack_correct
1032 1116
1033#ifdef CONFIG_X86_ESPFIX32 1117#ifdef CONFIG_X86_ESPFIX32
1034nmi_espfix_stack: 1118nmi_espfix_stack:
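Both the reworked #DB and NMI paths above answer "did we interrupt code running on the SYSENTER stack?" with a single unsigned comparison: they compute (end of SYSENTER_stack) - %esp and treat a result below SIZEOF_SYSENTER_stack as a hit, then switch to the thread stack before calling C code. A small stand-alone C illustration of that range-check idiom (the addresses and sizes are made up):

#include <stdio.h>
#include <stdint.h>

/* Nonzero when sp lies within the stack_size bytes ending at stack_end. */
static int on_sysenter_stack(uintptr_t sp, uintptr_t stack_end, uintptr_t stack_size)
{
	/* One unsigned compare, like the asm's subl/cmpl/jb sequence. */
	return (stack_end - sp) < stack_size;
}

int main(void)
{
	uintptr_t end = 0x1000, size = 0x200;	/* mock stack just below 0x1000 */

	printf("%d\n", on_sysenter_stack(0x0f80, end, size));	/* 1: inside          */
	printf("%d\n", on_sysenter_stack(0x0c00, end, size));	/* 0: below the stack */
	printf("%d\n", on_sysenter_stack(0x2000, end, size));	/* 0: above (wraps)   */
	return 0;
}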
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9d34d3cfceb6..858b555e274b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
103/* 103/*
104 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. 104 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
105 * 105 *
106 * This is the only entry point used for 64-bit system calls. The
107 * hardware interface is reasonably well designed and the register to
108 * argument mapping Linux uses fits well with the registers that are
109 * available when SYSCALL is used.
110 *
111 * SYSCALL instructions can be found inlined in libc implementations as
112 * well as some other programs and libraries. There are also a handful
113 * of SYSCALL instructions in the vDSO used, for example, as a
114 * clock_gettimeofday fallback.
115 *
106 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, 116 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
107 * then loads new ss, cs, and rip from previously programmed MSRs. 117 * then loads new ss, cs, and rip from previously programmed MSRs.
108 * rflags gets masked by a value from another MSR (so CLD and CLAC 118 * rflags gets masked by a value from another MSR (so CLD and CLAC
@@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
145 movq %rsp, PER_CPU_VAR(rsp_scratch) 155 movq %rsp, PER_CPU_VAR(rsp_scratch)
146 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 156 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
147 157
158 TRACE_IRQS_OFF
159
148 /* Construct struct pt_regs on stack */ 160 /* Construct struct pt_regs on stack */
149 pushq $__USER_DS /* pt_regs->ss */ 161 pushq $__USER_DS /* pt_regs->ss */
150 pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ 162 pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
151 /*
152 * Re-enable interrupts.
153 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
154 * must execute atomically in the face of possible interrupt-driven
155 * task preemption. We must enable interrupts only after we're done
156 * with using rsp_scratch:
157 */
158 ENABLE_INTERRUPTS(CLBR_NONE)
159 pushq %r11 /* pt_regs->flags */ 163 pushq %r11 /* pt_regs->flags */
160 pushq $__USER_CS /* pt_regs->cs */ 164 pushq $__USER_CS /* pt_regs->cs */
161 pushq %rcx /* pt_regs->ip */ 165 pushq %rcx /* pt_regs->ip */
@@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
171 pushq %r11 /* pt_regs->r11 */ 175 pushq %r11 /* pt_regs->r11 */
172 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ 176 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
173 177
174 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 178 /*
175 jnz tracesys 179 * If we need to do entry work or if we guess we'll need to do
180 * exit work, go straight to the slow path.
181 */
182 testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
183 jnz entry_SYSCALL64_slow_path
184
176entry_SYSCALL_64_fastpath: 185entry_SYSCALL_64_fastpath:
186 /*
187 * Easy case: enable interrupts and issue the syscall. If the syscall
188 * needs pt_regs, we'll call a stub that disables interrupts again
189 * and jumps to the slow path.
190 */
191 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS(CLBR_NONE)
177#if __SYSCALL_MASK == ~0 193#if __SYSCALL_MASK == ~0
178 cmpq $__NR_syscall_max, %rax 194 cmpq $__NR_syscall_max, %rax
179#else 195#else
@@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath:
182#endif 198#endif
183 ja 1f /* return -ENOSYS (already in pt_regs->ax) */ 199 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
184 movq %r10, %rcx 200 movq %r10, %rcx
201
202 /*
203 * This call instruction is handled specially in stub_ptregs_64.
204 * It might end up jumping to the slow path. If it jumps, RAX
205 * and all argument registers are clobbered.
206 */
185 call *sys_call_table(, %rax, 8) 207 call *sys_call_table(, %rax, 8)
208.Lentry_SYSCALL_64_after_fastpath_call:
209
186 movq %rax, RAX(%rsp) 210 movq %rax, RAX(%rsp)
1871: 2111:
188/*
189 * Syscall return path ending with SYSRET (fast path).
190 * Has incompletely filled pt_regs.
191 */
192 LOCKDEP_SYS_EXIT
193 /*
194 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
195 * it is too small to ever cause noticeable irq latency.
196 */
197 DISABLE_INTERRUPTS(CLBR_NONE)
198 212
199 /* 213 /*
200 * We must check ti flags with interrupts (or at least preemption) 214 * If we get here, then we know that pt_regs is clean for SYSRET64.
201 * off because we must *never* return to userspace without 215 * If we see that no exit work is required (which we are required
202 * processing exit work that is enqueued if we're preempted here. 216 * to check with IRQs off), then we can go straight to SYSRET64.
203 * In particular, returning to userspace with any of the one-shot
204 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
205 * very bad.
206 */ 217 */
218 DISABLE_INTERRUPTS(CLBR_NONE)
219 TRACE_IRQS_OFF
207 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 220 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
208 jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ 221 jnz 1f
209 222
210 RESTORE_C_REGS_EXCEPT_RCX_R11 223 LOCKDEP_SYS_EXIT
224 TRACE_IRQS_ON /* user mode is traced as IRQs on */
211 movq RIP(%rsp), %rcx 225 movq RIP(%rsp), %rcx
212 movq EFLAGS(%rsp), %r11 226 movq EFLAGS(%rsp), %r11
227 RESTORE_C_REGS_EXCEPT_RCX_R11
213 movq RSP(%rsp), %rsp 228 movq RSP(%rsp), %rsp
214 /*
215 * 64-bit SYSRET restores rip from rcx,
216 * rflags from r11 (but RF and VM bits are forced to 0),
217 * cs and ss are loaded from MSRs.
218 * Restoration of rflags re-enables interrupts.
219 *
220 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
221 * descriptor is not reinitialized. This means that we should
222 * avoid SYSRET with SS == NULL, which could happen if we schedule,
223 * exit the kernel, and re-enter using an interrupt vector. (All
224 * interrupt entries on x86_64 set SS to NULL.) We prevent that
225 * from happening by reloading SS in __switch_to. (Actually
226 * detecting the failure in 64-bit userspace is tricky but can be
227 * done.)
228 */
229 USERGS_SYSRET64 229 USERGS_SYSRET64
230 230
231GLOBAL(int_ret_from_sys_call_irqs_off) 2311:
232 /*
233 * The fast path looked good when we started, but something changed
234 * along the way and we need to switch to the slow path. Calling
235 * raise(3) will trigger this, for example. IRQs are off.
236 */
232 TRACE_IRQS_ON 237 TRACE_IRQS_ON
233 ENABLE_INTERRUPTS(CLBR_NONE) 238 ENABLE_INTERRUPTS(CLBR_NONE)
234 jmp int_ret_from_sys_call
235
236 /* Do syscall entry tracing */
237tracesys:
238 movq %rsp, %rdi
239 movl $AUDIT_ARCH_X86_64, %esi
240 call syscall_trace_enter_phase1
241 test %rax, %rax
242 jnz tracesys_phase2 /* if needed, run the slow path */
243 RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
244 movq ORIG_RAX(%rsp), %rax
245 jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
246
247tracesys_phase2:
248 SAVE_EXTRA_REGS 239 SAVE_EXTRA_REGS
249 movq %rsp, %rdi 240 movq %rsp, %rdi
250 movl $AUDIT_ARCH_X86_64, %esi 241 call syscall_return_slowpath /* returns with IRQs disabled */
251 movq %rax, %rdx 242 jmp return_from_SYSCALL_64
252 call syscall_trace_enter_phase2
253
254 /*
255 * Reload registers from stack in case ptrace changed them.
256 * We don't reload %rax because syscall_trace_entry_phase2() returned
257 * the value it wants us to use in the table lookup.
258 */
259 RESTORE_C_REGS_EXCEPT_RAX
260 RESTORE_EXTRA_REGS
261#if __SYSCALL_MASK == ~0
262 cmpq $__NR_syscall_max, %rax
263#else
264 andl $__SYSCALL_MASK, %eax
265 cmpl $__NR_syscall_max, %eax
266#endif
267 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
268 movq %r10, %rcx /* fixup for C */
269 call *sys_call_table(, %rax, 8)
270 movq %rax, RAX(%rsp)
2711:
272 /* Use IRET because user could have changed pt_regs->foo */
273 243
274/* 244entry_SYSCALL64_slow_path:
275 * Syscall return path ending with IRET. 245 /* IRQs are off. */
276 * Has correct iret frame.
277 */
278GLOBAL(int_ret_from_sys_call)
279 SAVE_EXTRA_REGS 246 SAVE_EXTRA_REGS
280 movq %rsp, %rdi 247 movq %rsp, %rdi
281 call syscall_return_slowpath /* returns with IRQs disabled */ 248 call do_syscall_64 /* returns with IRQs disabled */
249
250return_from_SYSCALL_64:
282 RESTORE_EXTRA_REGS 251 RESTORE_EXTRA_REGS
283 TRACE_IRQS_IRETQ /* we're about to change IF */ 252 TRACE_IRQS_IRETQ /* we're about to change IF */
284 253
@@ -355,83 +324,45 @@ opportunistic_sysret_failed:
355 jmp restore_c_regs_and_iret 324 jmp restore_c_regs_and_iret
356END(entry_SYSCALL_64) 325END(entry_SYSCALL_64)
357 326
327ENTRY(stub_ptregs_64)
328 /*
329 * Syscalls marked as needing ptregs land here.
330 * If we are on the fast path, we need to save the extra regs,
331 * which we achieve by trying again on the slow path. If we are on
332 * the slow path, the extra regs are already saved.
333 *
334 * RAX stores a pointer to the C function implementing the syscall.
335 * IRQs are on.
336 */
337 cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
338 jne 1f
358 339
359 .macro FORK_LIKE func
360ENTRY(stub_\func)
361 SAVE_EXTRA_REGS 8
362 jmp sys_\func
363END(stub_\func)
364 .endm
365
366 FORK_LIKE clone
367 FORK_LIKE fork
368 FORK_LIKE vfork
369
370ENTRY(stub_execve)
371 call sys_execve
372return_from_execve:
373 testl %eax, %eax
374 jz 1f
375 /* exec failed, can use fast SYSRET code path in this case */
376 ret
3771:
378 /* must use IRET code path (pt_regs->cs may have changed) */
379 addq $8, %rsp
380 ZERO_EXTRA_REGS
381 movq %rax, RAX(%rsp)
382 jmp int_ret_from_sys_call
383END(stub_execve)
384/*
385 * Remaining execve stubs are only 7 bytes long.
386 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
387 */
388 .align 8
389GLOBAL(stub_execveat)
390 call sys_execveat
391 jmp return_from_execve
392END(stub_execveat)
393
394#if defined(CONFIG_X86_X32_ABI)
395 .align 8
396GLOBAL(stub_x32_execve)
397 call compat_sys_execve
398 jmp return_from_execve
399END(stub_x32_execve)
400 .align 8
401GLOBAL(stub_x32_execveat)
402 call compat_sys_execveat
403 jmp return_from_execve
404END(stub_x32_execveat)
405#endif
406
407/*
408 * sigreturn is special because it needs to restore all registers on return.
409 * This cannot be done with SYSRET, so use the IRET return path instead.
410 */
411ENTRY(stub_rt_sigreturn)
412 /* 340 /*
413 * SAVE_EXTRA_REGS result is not normally needed: 341 * Called from fast path -- disable IRQs again, pop return address
414 * sigreturn overwrites all pt_regs->GPREGS. 342 * and jump to slow path
415 * But sigreturn can fail (!), and there is no easy way to detect that.
416 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
417 * we SAVE_EXTRA_REGS here.
418 */ 343 */
419 SAVE_EXTRA_REGS 8 344 DISABLE_INTERRUPTS(CLBR_NONE)
420 call sys_rt_sigreturn 345 TRACE_IRQS_OFF
421return_from_stub: 346 popq %rax
422 addq $8, %rsp 347 jmp entry_SYSCALL64_slow_path
423 RESTORE_EXTRA_REGS
424 movq %rax, RAX(%rsp)
425 jmp int_ret_from_sys_call
426END(stub_rt_sigreturn)
427 348
428#ifdef CONFIG_X86_X32_ABI 3491:
429ENTRY(stub_x32_rt_sigreturn) 350 /* Called from C */
430 SAVE_EXTRA_REGS 8 351 jmp *%rax /* called from C */
431 call sys32_x32_rt_sigreturn 352END(stub_ptregs_64)
432 jmp return_from_stub 353
433END(stub_x32_rt_sigreturn) 354.macro ptregs_stub func
434#endif 355ENTRY(ptregs_\func)
356 leaq \func(%rip), %rax
357 jmp stub_ptregs_64
358END(ptregs_\func)
359.endm
360
361/* Instantiate ptregs_stub for each ptregs-using syscall */
362#define __SYSCALL_64_QUAL_(sym)
363#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
364#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
365#include <asm/syscalls_64.h>
435 366
436/* 367/*
437 * A newly forked process directly context switches into this address. 368 * A newly forked process directly context switches into this address.
@@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn)
439 * rdi: prev task we switched from 370 * rdi: prev task we switched from
440 */ 371 */
441ENTRY(ret_from_fork) 372ENTRY(ret_from_fork)
442
443 LOCK ; btr $TIF_FORK, TI_flags(%r8) 373 LOCK ; btr $TIF_FORK, TI_flags(%r8)
444 374
445 pushq $0x0002 375 pushq $0x0002
@@ -447,28 +377,32 @@ ENTRY(ret_from_fork)
447 377
448 call schedule_tail /* rdi: 'prev' task parameter */ 378 call schedule_tail /* rdi: 'prev' task parameter */
449 379
450 RESTORE_EXTRA_REGS
451
452 testb $3, CS(%rsp) /* from kernel_thread? */ 380 testb $3, CS(%rsp) /* from kernel_thread? */
381 jnz 1f
453 382
454 /* 383 /*
455 * By the time we get here, we have no idea whether our pt_regs, 384 * We came from kernel_thread. This code path is quite twisted, and
456 * ti flags, and ti status came from the 64-bit SYSCALL fast path, 385 * someone should clean it up.
457 * the slow path, or one of the 32-bit compat paths. 386 *
458 * Use IRET code path to return, since it can safely handle 387 * copy_thread_tls stashes the function pointer in RBX and the
459 * all of the above. 388 * parameter to be passed in RBP. The called function is permitted
389 * to call do_execve and thereby jump to user mode.
460 */ 390 */
461 jnz int_ret_from_sys_call 391 movq RBP(%rsp), %rdi
392 call *RBX(%rsp)
393 movl $0, RAX(%rsp)
462 394
463 /* 395 /*
464 * We came from kernel_thread 396 * Fall through as though we're exiting a syscall. This makes a
465 * nb: we depend on RESTORE_EXTRA_REGS above 397 * twisted sort of sense if we just called do_execve.
466 */ 398 */
467 movq %rbp, %rdi 399
468 call *%rbx 4001:
469 movl $0, RAX(%rsp) 401 movq %rsp, %rdi
470 RESTORE_EXTRA_REGS 402 call syscall_return_slowpath /* returns with IRQs disabled */
471 jmp int_ret_from_sys_call 403 TRACE_IRQS_ON /* user mode is traced as IRQS on */
404 SWAPGS
405 jmp restore_regs_and_iret
472END(ret_from_fork) 406END(ret_from_fork)
473 407
474/* 408/*
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 3c990eeee40b..847f2f0c31e5 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -19,12 +19,21 @@
19 .section .entry.text, "ax" 19 .section .entry.text, "ax"
20 20
21/* 21/*
22 * 32-bit SYSENTER instruction entry. 22 * 32-bit SYSENTER entry.
23 * 23 *
24 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. 24 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
25 * IF and VM in rflags are cleared (IOW: interrupts are off). 25 * on 64-bit kernels running on Intel CPUs.
26 *
27 * The SYSENTER instruction, in principle, should *only* occur in the
28 * vDSO. In practice, a small number of Android devices were shipped
29 * with a copy of Bionic that inlined a SYSENTER instruction. This
30 * never happened in any of Google's Bionic versions -- it only happened
31 * in a narrow range of Intel-provided versions.
32 *
33 * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
34 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
26 * SYSENTER does not save anything on the stack, 35 * SYSENTER does not save anything on the stack,
27 * and does not save old rip (!!!) and rflags. 36 * and does not save old RIP (!!!), RSP, or RFLAGS.
28 * 37 *
29 * Arguments: 38 * Arguments:
30 * eax system call number 39 * eax system call number
@@ -35,10 +44,6 @@
35 * edi arg5 44 * edi arg5
36 * ebp user stack 45 * ebp user stack
37 * 0(%ebp) arg6 46 * 0(%ebp) arg6
38 *
39 * This is purely a fast path. For anything complicated we use the int 0x80
40 * path below. We set up a complete hardware stack frame to share code
41 * with the int 0x80 path.
42 */ 47 */
43ENTRY(entry_SYSENTER_compat) 48ENTRY(entry_SYSENTER_compat)
44 /* Interrupts are off on entry. */ 49 /* Interrupts are off on entry. */
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
66 */ 71 */
67 pushfq /* pt_regs->flags (except IF = 0) */ 72 pushfq /* pt_regs->flags (except IF = 0) */
68 orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ 73 orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
69 ASM_CLAC /* Clear AC after saving FLAGS */
70
71 pushq $__USER32_CS /* pt_regs->cs */ 74 pushq $__USER32_CS /* pt_regs->cs */
72 xorq %r8,%r8 75 xorq %r8,%r8
73 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ 76 pushq %r8 /* pt_regs->ip = 0 (placeholder) */
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
90 cld 93 cld
91 94
92 /* 95 /*
93 * Sysenter doesn't filter flags, so we need to clear NT 96 * SYSENTER doesn't filter flags, so we need to clear NT and AC
94 * ourselves. To save a few cycles, we can check whether 97 * ourselves. To save a few cycles, we can check whether
95 * NT was set instead of doing an unconditional popfq. 98 * either was set instead of doing an unconditional popfq.
96 * This needs to happen before enabling interrupts so that 99 * This needs to happen before enabling interrupts so that
97 * we don't get preempted with NT set. 100 * we don't get preempted with NT set.
98 * 101 *
102 * If TF is set, we will single-step all the way to here -- do_debug
103 * will ignore all the traps. (Yes, this is slow, but so is
104 * single-stepping in general. This allows us to avoid having
105 * a more complicated code to handle the case where a user program
106 * forces us to single-step through the SYSENTER entry code.)
107 *
99 * NB.: .Lsysenter_fix_flags is a label with the code under it moved 108 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
100 * out-of-line as an optimization: NT is unlikely to be set in the 109 * out-of-line as an optimization: NT is unlikely to be set in the
101 * majority of the cases and instead of polluting the I$ unnecessarily, 110 * majority of the cases and instead of polluting the I$ unnecessarily,
102 * we're keeping that code behind a branch which will predict as 111 * we're keeping that code behind a branch which will predict as
103 * not-taken and therefore its instructions won't be fetched. 112 * not-taken and therefore its instructions won't be fetched.
104 */ 113 */
105 testl $X86_EFLAGS_NT, EFLAGS(%rsp) 114 testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
106 jnz .Lsysenter_fix_flags 115 jnz .Lsysenter_fix_flags
107.Lsysenter_flags_fixed: 116.Lsysenter_flags_fixed:
108 117
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
123 pushq $X86_EFLAGS_FIXED 132 pushq $X86_EFLAGS_FIXED
124 popfq 133 popfq
125 jmp .Lsysenter_flags_fixed 134 jmp .Lsysenter_flags_fixed
135GLOBAL(__end_entry_SYSENTER_compat)
126ENDPROC(entry_SYSENTER_compat) 136ENDPROC(entry_SYSENTER_compat)
127 137
128/* 138/*
129 * 32-bit SYSCALL instruction entry. 139 * 32-bit SYSCALL entry.
140 *
141 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
142 * on 64-bit kernels running on AMD CPUs.
143 *
144 * The SYSCALL instruction, in principle, should *only* occur in the
145 * vDSO. In practice, it appears that this really is the case.
146 * As evidence:
147 *
148 * - The calling convention for SYSCALL has changed several times without
149 * anyone noticing.
130 * 150 *
131 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, 151 * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything
132 * then loads new ss, cs, and rip from previously programmed MSRs. 152 * user task that did SYSCALL without immediately reloading SS
133 * rflags gets masked by a value from another MSR (so CLD and CLAC 153 * would randomly crash.
134 * are not needed). SYSCALL does not save anything on the stack
135 * and does not change rsp.
136 * 154 *
137 * Note: rflags saving+masking-with-MSR happens only in Long mode 155 * - Most programmers do not directly target AMD CPUs, and the 32-bit
156 * SYSCALL instruction does not exist on Intel CPUs. Even on AMD
157 * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
158 * because the SYSCALL instruction in legacy/native 32-bit mode (as
159 * opposed to compat mode) is sufficiently poorly designed as to be
160 * essentially unusable.
161 *
162 * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
163 * RFLAGS to R11, then loads new SS, CS, and RIP from previously
164 * programmed MSRs. RFLAGS gets masked by a value from another MSR
165 * (so CLD and CLAC are not needed). SYSCALL does not save anything on
166 * the stack and does not change RSP.
167 *
168 * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
138 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). 169 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
139 * Don't get confused: rflags saving+masking depends on Long Mode Active bit 170 * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
140 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes 171 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
141 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). 172 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
142 * 173 *
@@ -236,7 +267,21 @@ sysret32_from_system_call:
236END(entry_SYSCALL_compat) 267END(entry_SYSCALL_compat)
237 268
238/* 269/*
239 * Emulated IA32 system calls via int 0x80. 270 * 32-bit legacy system call entry.
271 *
272 * 32-bit x86 Linux system calls traditionally used the INT $0x80
273 * instruction. INT $0x80 lands here.
274 *
275 * This entry point can be used by 32-bit and 64-bit programs to perform
276 * 32-bit system calls. Instances of INT $0x80 can be found inline in
277 * various programs and libraries. It is also used by the vDSO's
278 * __kernel_vsyscall fallback for hardware that doesn't support a faster
279 * entry method. Restarted 32-bit system calls also fall back to INT
280 * $0x80 regardless of what instruction was originally used to do the
281 * system call.
282 *
283 * This is considered a slow path. It is not used by most libc
284 * implementations on modern hardware except during process startup.
240 * 285 *
241 * Arguments: 286 * Arguments:
242 * eax system call number 287 * eax system call number
@@ -245,17 +290,8 @@ END(entry_SYSCALL_compat)
245 * edx arg3 290 * edx arg3
246 * esi arg4 291 * esi arg4
247 * edi arg5 292 * edi arg5
248 * ebp arg6 (note: not saved in the stack frame, should not be touched) 293 * ebp arg6
249 *
250 * Notes:
251 * Uses the same stack frame as the x86-64 version.
252 * All registers except eax must be saved (but ptrace may violate that).
253 * Arguments are zero extended. For system calls that want sign extension and
254 * take long arguments a wrapper is needed. Most calls can just be called
255 * directly.
256 * Assumes it is only called from user space and entered with interrupts off.
257 */ 294 */
258
259ENTRY(entry_INT80_compat) 295ENTRY(entry_INT80_compat)
260 /* 296 /*
261 * Interrupts are off on entry. 297 * Interrupts are off on entry.
@@ -300,7 +336,7 @@ ENTRY(entry_INT80_compat)
300 TRACE_IRQS_OFF 336 TRACE_IRQS_OFF
301 337
302 movq %rsp, %rdi 338 movq %rsp, %rdi
303 call do_syscall_32_irqs_off 339 call do_int80_syscall_32
304.Lsyscall_32_done: 340.Lsyscall_32_done:
305 341
306 /* Go back to user mode. */ 342 /* Go back to user mode. */
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 9a6649857106..8f895ee13a1c 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -6,17 +6,11 @@
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/syscall.h> 7#include <asm/syscall.h>
8 8
9#ifdef CONFIG_IA32_EMULATION 9#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
10#define SYM(sym, compat) compat
11#else
12#define SYM(sym, compat) sym
13#endif
14
15#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
16#include <asm/syscalls_32.h> 10#include <asm/syscalls_32.h>
17#undef __SYSCALL_I386 11#undef __SYSCALL_I386
18 12
19#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), 13#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
20 14
21extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 15extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
22 16
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 41283d22be7a..9dbc5abb6162 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,19 +6,14 @@
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/syscall.h> 7#include <asm/syscall.h>
8 8
9#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 9#define __SYSCALL_64_QUAL_(sym) sym
10#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
10 11
11#ifdef CONFIG_X86_X32_ABI 12#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
12# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
13#else
14# define __SYSCALL_X32(nr, sym, compat) /* nothing */
15#endif
16
17#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
18#include <asm/syscalls_64.h> 13#include <asm/syscalls_64.h>
19#undef __SYSCALL_64 14#undef __SYSCALL_64
20 15
21#define __SYSCALL_64(nr, sym, compat) [nr] = sym, 16#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
22 17
23extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 18extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
24 19
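The qualifier plumbing above works by token pasting: an empty qualifier picks the plain sym, while the ptregs qualifier picks the ptregs_-prefixed stub that entry_64.S instantiates. A compilable user-space sketch of the same trick (the demo_* names and simplified macros are invented for illustration; the real ones are the __SYSCALL_64_QUAL_* macros shown above):

#include <stdio.h>

#define QUAL_(sym)		sym
#define QUAL_ptregs(sym)	ptregs_##sym
#define SYSCALL_64(nr, sym, qual)	[nr] = QUAL_##qual(sym),

long demo_read(void)		{ return 0; }
long demo_execve(void)		{ return 1; }
long ptregs_demo_execve(void)	{ return 2; }	/* stands in for the asm stub */

static long (*const table[])(void) = {
	SYSCALL_64(0, demo_read, )		/* -> [0] = demo_read,          */
	SYSCALL_64(1, demo_execve, ptregs)	/* -> [1] = ptregs_demo_execve, */
};

int main(void)
{
	printf("%ld %ld\n", table[0](), table[1]());	/* prints "0 2" */
	return 0;
}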
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index dc1040a50bdc..2e5b565adacc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -21,7 +21,7 @@
2112 common brk sys_brk 2112 common brk sys_brk
2213 64 rt_sigaction sys_rt_sigaction 2213 64 rt_sigaction sys_rt_sigaction
2314 common rt_sigprocmask sys_rt_sigprocmask 2314 common rt_sigprocmask sys_rt_sigprocmask
2415 64 rt_sigreturn stub_rt_sigreturn 2415 64 rt_sigreturn sys_rt_sigreturn/ptregs
2516 64 ioctl sys_ioctl 2516 64 ioctl sys_ioctl
2617 common pread64 sys_pread64 2617 common pread64 sys_pread64
2718 common pwrite64 sys_pwrite64 2718 common pwrite64 sys_pwrite64
@@ -62,10 +62,10 @@
6253 common socketpair sys_socketpair 6253 common socketpair sys_socketpair
6354 64 setsockopt sys_setsockopt 6354 64 setsockopt sys_setsockopt
6455 64 getsockopt sys_getsockopt 6455 64 getsockopt sys_getsockopt
6556 common clone stub_clone 6556 common clone sys_clone/ptregs
6657 common fork stub_fork 6657 common fork sys_fork/ptregs
6758 common vfork stub_vfork 6758 common vfork sys_vfork/ptregs
6859 64 execve stub_execve 6859 64 execve sys_execve/ptregs
6960 common exit sys_exit 6960 common exit sys_exit
7061 common wait4 sys_wait4 7061 common wait4 sys_wait4
7162 common kill sys_kill 7162 common kill sys_kill
@@ -178,7 +178,7 @@
178169 common reboot sys_reboot 178169 common reboot sys_reboot
179170 common sethostname sys_sethostname 179170 common sethostname sys_sethostname
180171 common setdomainname sys_setdomainname 180171 common setdomainname sys_setdomainname
181172 common iopl sys_iopl 181172 common iopl sys_iopl/ptregs
182173 common ioperm sys_ioperm 182173 common ioperm sys_ioperm
183174 64 create_module 183174 64 create_module
184175 common init_module sys_init_module 184175 common init_module sys_init_module
@@ -328,7 +328,7 @@
328319 common memfd_create sys_memfd_create 328319 common memfd_create sys_memfd_create
329320 common kexec_file_load sys_kexec_file_load 329320 common kexec_file_load sys_kexec_file_load
330321 common bpf sys_bpf 330321 common bpf sys_bpf
331322 64 execveat stub_execveat 331322 64 execveat sys_execveat/ptregs
332323 common userfaultfd sys_userfaultfd 332323 common userfaultfd sys_userfaultfd
333324 common membarrier sys_membarrier 333324 common membarrier sys_membarrier
334325 common mlock2 sys_mlock2 334325 common mlock2 sys_mlock2
@@ -339,14 +339,14 @@
339# for native 64-bit operation. 339# for native 64-bit operation.
340# 340#
341512 x32 rt_sigaction compat_sys_rt_sigaction 341512 x32 rt_sigaction compat_sys_rt_sigaction
342513 x32 rt_sigreturn stub_x32_rt_sigreturn 342513 x32 rt_sigreturn sys32_x32_rt_sigreturn
343514 x32 ioctl compat_sys_ioctl 343514 x32 ioctl compat_sys_ioctl
344515 x32 readv compat_sys_readv 344515 x32 readv compat_sys_readv
345516 x32 writev compat_sys_writev 345516 x32 writev compat_sys_writev
346517 x32 recvfrom compat_sys_recvfrom 346517 x32 recvfrom compat_sys_recvfrom
347518 x32 sendmsg compat_sys_sendmsg 347518 x32 sendmsg compat_sys_sendmsg
348519 x32 recvmsg compat_sys_recvmsg 348519 x32 recvmsg compat_sys_recvmsg
349520 x32 execve stub_x32_execve 349520 x32 execve compat_sys_execve/ptregs
350521 x32 ptrace compat_sys_ptrace 350521 x32 ptrace compat_sys_ptrace
351522 x32 rt_sigpending compat_sys_rt_sigpending 351522 x32 rt_sigpending compat_sys_rt_sigpending
352523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait 352523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
@@ -371,4 +371,4 @@
371542 x32 getsockopt compat_sys_getsockopt 371542 x32 getsockopt compat_sys_getsockopt
372543 x32 io_setup compat_sys_io_setup 372543 x32 io_setup compat_sys_io_setup
373544 x32 io_submit compat_sys_io_submit 373544 x32 io_submit compat_sys_io_submit
374545 x32 execveat stub_x32_execveat 374545 x32 execveat compat_sys_execveat/ptregs
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 0e7f8ec071e7..cd3d3015d7df 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -3,13 +3,63 @@
3in="$1" 3in="$1"
4out="$2" 4out="$2"
5 5
6syscall_macro() {
7 abi="$1"
8 nr="$2"
9 entry="$3"
10
11 # Entry can be either just a function name or "function/qualifier"
12 real_entry="${entry%%/*}"
13 qualifier="${entry:${#real_entry}}" # Strip the function name
14 qualifier="${qualifier:1}" # Strip the slash, if any
15
16 echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
17}
18
19emit() {
20 abi="$1"
21 nr="$2"
22 entry="$3"
23 compat="$4"
24
25 if [ "$abi" == "64" -a -n "$compat" ]; then
26 echo "a compat entry for a 64-bit syscall makes no sense" >&2
27 exit 1
28 fi
29
30 if [ -z "$compat" ]; then
31 if [ -n "$entry" ]; then
32 syscall_macro "$abi" "$nr" "$entry"
33 fi
34 else
35 echo "#ifdef CONFIG_X86_32"
36 if [ -n "$entry" ]; then
37 syscall_macro "$abi" "$nr" "$entry"
38 fi
39 echo "#else"
40 syscall_macro "$abi" "$nr" "$compat"
41 echo "#endif"
42 fi
43}
44
6grep '^[0-9]' "$in" | sort -n | ( 45grep '^[0-9]' "$in" | sort -n | (
7 while read nr abi name entry compat; do 46 while read nr abi name entry compat; do
8 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` 47 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
9 if [ -n "$compat" ]; then 48 if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
10 echo "__SYSCALL_${abi}($nr, $entry, $compat)" 49 # COMMON is the same as 64, except that we don't expect X32
11 elif [ -n "$entry" ]; then 50 # programs to use it. Our expectation has nothing to do with
12 echo "__SYSCALL_${abi}($nr, $entry, $entry)" 51 # any generated code, so treat them the same.
52 emit 64 "$nr" "$entry" "$compat"
53 elif [ "$abi" == "X32" ]; then
54 # X32 is equivalent to 64 on an X32-compatible kernel.
55 echo "#ifdef CONFIG_X86_X32_ABI"
56 emit 64 "$nr" "$entry" "$compat"
57 echo "#endif"
58 elif [ "$abi" == "I386" ]; then
59 emit "$abi" "$nr" "$entry" "$compat"
60 else
61 echo "Unknown abi $abi" >&2
62 exit 1
13 fi 63 fi
14 done 64 done
15) > "$out" 65) > "$out"
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 3f69326ed545..63a03bb91497 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
150 } 150 }
151 fprintf(outfile, "\n};\n\n"); 151 fprintf(outfile, "\n};\n\n");
152 152
153 fprintf(outfile, "static struct page *pages[%lu];\n\n",
154 mapping_size / 4096);
155
156 fprintf(outfile, "const struct vdso_image %s = {\n", name); 153 fprintf(outfile, "const struct vdso_image %s = {\n", name);
157 fprintf(outfile, "\t.data = raw_data,\n"); 154 fprintf(outfile, "\t.data = raw_data,\n");
158 fprintf(outfile, "\t.size = %lu,\n", mapping_size); 155 fprintf(outfile, "\t.size = %lu,\n", mapping_size);
159 fprintf(outfile, "\t.text_mapping = {\n");
160 fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
161 fprintf(outfile, "\t\t.pages = pages,\n");
162 fprintf(outfile, "\t},\n");
163 if (alt_sec) { 156 if (alt_sec) {
164 fprintf(outfile, "\t.alt = %lu,\n", 157 fprintf(outfile, "\t.alt = %lu,\n",
165 (unsigned long)GET_LE(&alt_sec->sh_offset)); 158 (unsigned long)GET_LE(&alt_sec->sh_offset));
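
Since page insertion is now handled at fault time (see the vma.c changes below), vdso2c no longer needs to emit a static pages[] array or a per-image text_mapping initializer; the generated descriptor shrinks to the raw image plus its sizes and symbol offsets. The shape of the generated initializer after this change is roughly the following, with made-up placeholder values:

	/* Shape only; the real vdso-image-*.c files are generated by vdso2c. */
	const struct vdso_image vdso_image_64 = {
		.data = raw_data,		/* the embedded vDSO ELF image */
		.size = 8192,			/* page-aligned mapping size */
		.alt = 3112,			/* .altinstructions offset and length */
		.alt_len = 64,
		.sym_vvar_start = -12288,	/* vvar area sits below the text */
		.sym_vvar_page = -12288,
		.sym_hpet_page = -8192,
		.sym_pvclock_page = -4096,
	};
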
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 08a317a9ae4b..7853b53959cd 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -11,7 +11,6 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/mm_types.h> 12#include <linux/mm_types.h>
13 13
14#include <asm/cpufeature.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/vdso.h> 15#include <asm/vdso.h>
17 16
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 3a1d9297074b..0109ac6cb79c 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -3,7 +3,7 @@
3*/ 3*/
4 4
5#include <asm/dwarf2.h> 5#include <asm/dwarf2.h>
6#include <asm/cpufeature.h> 6#include <asm/cpufeatures.h>
7#include <asm/alternative-asm.h> 7#include <asm/alternative-asm.h>
8 8
9/* 9/*
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index b8f69e264ac4..10f704584922 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -20,6 +20,7 @@
20#include <asm/page.h> 20#include <asm/page.h>
21#include <asm/hpet.h> 21#include <asm/hpet.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/cpufeature.h>
23 24
24#if defined(CONFIG_X86_64) 25#if defined(CONFIG_X86_64)
25unsigned int __read_mostly vdso64_enabled = 1; 26unsigned int __read_mostly vdso64_enabled = 1;
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1;
27 28
28void __init init_vdso_image(const struct vdso_image *image) 29void __init init_vdso_image(const struct vdso_image *image)
29{ 30{
30 int i;
31 int npages = (image->size) / PAGE_SIZE;
32
33 BUG_ON(image->size % PAGE_SIZE != 0); 31 BUG_ON(image->size % PAGE_SIZE != 0);
34 for (i = 0; i < npages; i++)
35 image->text_mapping.pages[i] =
36 virt_to_page(image->data + i*PAGE_SIZE);
37 32
38 apply_alternatives((struct alt_instr *)(image->data + image->alt), 33 apply_alternatives((struct alt_instr *)(image->data + image->alt),
39 (struct alt_instr *)(image->data + image->alt + 34 (struct alt_instr *)(image->data + image->alt +
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
90#endif 85#endif
91} 86}
92 87
88static int vdso_fault(const struct vm_special_mapping *sm,
89 struct vm_area_struct *vma, struct vm_fault *vmf)
90{
91 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
92
93 if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
94 return VM_FAULT_SIGBUS;
95
96 vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
97 get_page(vmf->page);
98 return 0;
99}
100
101static const struct vm_special_mapping text_mapping = {
102 .name = "[vdso]",
103 .fault = vdso_fault,
104};
105
106static int vvar_fault(const struct vm_special_mapping *sm,
107 struct vm_area_struct *vma, struct vm_fault *vmf)
108{
109 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
110 long sym_offset;
111 int ret = -EFAULT;
112
113 if (!image)
114 return VM_FAULT_SIGBUS;
115
116 sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
117 image->sym_vvar_start;
118
119 /*
120 * Sanity check: a symbol offset of zero means that the page
121 * does not exist for this vdso image, not that the page is at
122 * offset zero relative to the text mapping. This should be
123 * impossible here, because sym_offset should only be zero for
124 * the page past the end of the vvar mapping.
125 */
126 if (sym_offset == 0)
127 return VM_FAULT_SIGBUS;
128
129 if (sym_offset == image->sym_vvar_page) {
130 ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
131 __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
132 } else if (sym_offset == image->sym_hpet_page) {
133#ifdef CONFIG_HPET_TIMER
134 if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
135 ret = vm_insert_pfn_prot(
136 vma,
137 (unsigned long)vmf->virtual_address,
138 hpet_address >> PAGE_SHIFT,
139 pgprot_noncached(PAGE_READONLY));
140 }
141#endif
142 } else if (sym_offset == image->sym_pvclock_page) {
143 struct pvclock_vsyscall_time_info *pvti =
144 pvclock_pvti_cpu0_va();
145 if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
146 ret = vm_insert_pfn(
147 vma,
148 (unsigned long)vmf->virtual_address,
149 __pa(pvti) >> PAGE_SHIFT);
150 }
151 }
152
153 if (ret == 0 || ret == -EBUSY)
154 return VM_FAULT_NOPAGE;
155
156 return VM_FAULT_SIGBUS;
157}
158
93static int map_vdso(const struct vdso_image *image, bool calculate_addr) 159static int map_vdso(const struct vdso_image *image, bool calculate_addr)
94{ 160{
95 struct mm_struct *mm = current->mm; 161 struct mm_struct *mm = current->mm;
96 struct vm_area_struct *vma; 162 struct vm_area_struct *vma;
97 unsigned long addr, text_start; 163 unsigned long addr, text_start;
98 int ret = 0; 164 int ret = 0;
99 static struct page *no_pages[] = {NULL}; 165 static const struct vm_special_mapping vvar_mapping = {
100 static struct vm_special_mapping vvar_mapping = {
101 .name = "[vvar]", 166 .name = "[vvar]",
102 .pages = no_pages, 167 .fault = vvar_fault,
103 }; 168 };
104 struct pvclock_vsyscall_time_info *pvti;
105 169
106 if (calculate_addr) { 170 if (calculate_addr) {
107 addr = vdso_addr(current->mm->start_stack, 171 addr = vdso_addr(current->mm->start_stack,
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
121 185
122 text_start = addr - image->sym_vvar_start; 186 text_start = addr - image->sym_vvar_start;
123 current->mm->context.vdso = (void __user *)text_start; 187 current->mm->context.vdso = (void __user *)text_start;
188 current->mm->context.vdso_image = image;
124 189
125 /* 190 /*
126 * MAYWRITE to allow gdb to COW and set breakpoints 191 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
130 image->size, 195 image->size,
131 VM_READ|VM_EXEC| 196 VM_READ|VM_EXEC|
132 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 197 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
133 &image->text_mapping); 198 &text_mapping);
134 199
135 if (IS_ERR(vma)) { 200 if (IS_ERR(vma)) {
136 ret = PTR_ERR(vma); 201 ret = PTR_ERR(vma);
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
140 vma = _install_special_mapping(mm, 205 vma = _install_special_mapping(mm,
141 addr, 206 addr,
142 -image->sym_vvar_start, 207 -image->sym_vvar_start,
143 VM_READ|VM_MAYREAD, 208 VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
209 VM_PFNMAP,
144 &vvar_mapping); 210 &vvar_mapping);
145 211
146 if (IS_ERR(vma)) { 212 if (IS_ERR(vma)) {
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
148 goto up_fail; 214 goto up_fail;
149 } 215 }
150 216
151 if (image->sym_vvar_page)
152 ret = remap_pfn_range(vma,
153 text_start + image->sym_vvar_page,
154 __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
155 PAGE_SIZE,
156 PAGE_READONLY);
157
158 if (ret)
159 goto up_fail;
160
161#ifdef CONFIG_HPET_TIMER
162 if (hpet_address && image->sym_hpet_page) {
163 ret = io_remap_pfn_range(vma,
164 text_start + image->sym_hpet_page,
165 hpet_address >> PAGE_SHIFT,
166 PAGE_SIZE,
167 pgprot_noncached(PAGE_READONLY));
168
169 if (ret)
170 goto up_fail;
171 }
172#endif
173
174 pvti = pvclock_pvti_cpu0_va();
175 if (pvti && image->sym_pvclock_page) {
176 ret = remap_pfn_range(vma,
177 text_start + image->sym_pvclock_page,
178 __pa(pvti) >> PAGE_SHIFT,
179 PAGE_SIZE,
180 PAGE_READONLY);
181
182 if (ret)
183 goto up_fail;
184 }
185
186up_fail: 217up_fail:
187 if (ret) 218 if (ret)
188 current->mm->context.vdso = NULL; 219 current->mm->context.vdso = NULL;
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg)
254#ifdef CONFIG_NUMA 285#ifdef CONFIG_NUMA
255 node = cpu_to_node(cpu); 286 node = cpu_to_node(cpu);
256#endif 287#endif
257 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 288 if (static_cpu_has(X86_FEATURE_RDTSCP))
258 write_rdtscp_aux((node << 12) | cpu); 289 write_rdtscp_aux((node << 12) | cpu);
259 290
260 /* 291 /*
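
The vma.c changes replace the eager remap_pfn_range()/io_remap_pfn_range() calls done at mmap time with .fault callbacks on the two special mappings: vDSO text pages are looked up from mm->context.vdso_image on first touch, and the vvar/HPET/pvclock pages are PFN-inserted only if their clock source is actually in use. The same pattern works for any special mapping, not just the vDSO; a minimal, self-contained sketch of it (all names here are illustrative, nothing below is part of the patch):

	#include <linux/err.h>
	#include <linux/init.h>
	#include <linux/mm.h>
	#include <linux/mm_types.h>

	/* One backing page; to be allocated from the caller's init path. */
	static struct page *demo_page;

	static int __init demo_page_init(void)
	{
		demo_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		return demo_page ? 0 : -ENOMEM;
	}

	/* Hand out the backing page lazily, on first access to the VMA. */
	static int demo_fault(const struct vm_special_mapping *sm,
			      struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		if (vmf->pgoff != 0)		/* only a single page is mapped */
			return VM_FAULT_SIGBUS;

		get_page(demo_page);
		vmf->page = demo_page;
		return 0;
	}

	static const struct vm_special_mapping demo_mapping = {
		.name	= "[demo]",
		.fault	= demo_fault,
	};

	/* Map one read-only page at addr in mm, the way map_vdso() does. */
	static int demo_map(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma;

		vma = _install_special_mapping(mm, addr, PAGE_SIZE,
					       VM_READ | VM_MAYREAD,
					       &demo_mapping);
		return IS_ERR(vma) ? PTR_ERR(vma) : 0;
	}

The point of the switch is that nothing gets pinned or PFN-mapped until a task actually touches the pages, which is also why the vvar VMA gains VM_IO|VM_DONTDUMP|VM_PFNMAP above and the old up-front remapping block could be deleted outright.
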
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index 51e330416995..0fb3a104ac62 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -16,6 +16,8 @@
16#include <asm/vgtod.h> 16#include <asm/vgtod.h>
17#include <asm/vvar.h> 17#include <asm/vvar.h>
18 18
19int vclocks_used __read_mostly;
20
19DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); 21DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
20 22
21void update_vsyscall_tz(void) 23void update_vsyscall_tz(void)
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void)
26 28
27void update_vsyscall(struct timekeeper *tk) 29void update_vsyscall(struct timekeeper *tk)
28{ 30{
31 int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
29 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; 32 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
30 33
34 /* Mark the new vclock used. */
35 BUILD_BUG_ON(VCLOCK_MAX >= 32);
36 WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
37
31 gtod_write_begin(vdata); 38 gtod_write_begin(vdata);
32 39
33 /* copy vsyscall data */ 40 /* copy vsyscall data */
34 vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; 41 vdata->vclock_mode = vclock_mode;
35 vdata->cycle_last = tk->tkr_mono.cycle_last; 42 vdata->cycle_last = tk->tkr_mono.cycle_last;
36 vdata->mask = tk->tkr_mono.mask; 43 vdata->mask = tk->tkr_mono.mask;
37 vdata->mult = tk->tkr_mono.mult; 44 vdata->mult = tk->tkr_mono.mult;
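
update_vsyscall() now records every vclock mode the timekeeper has ever selected in the vclocks_used bitmask, which is what the vvar_fault() handler above consults via vclock_was_used() before PFN-mapping the HPET or pvclock page. That helper is not part of this diff; a minimal sketch consistent with the bitmask written here, assuming it lives next to the vclocks_used declaration:

	#include <linux/compiler.h>
	#include <linux/types.h>

	extern int vclocks_used;

	/* Sketch only: test the bit update_vsyscall() sets for each mode. */
	static inline bool vclock_was_used(int vclock)
	{
		return READ_ONCE(vclocks_used) & (1 << vclock);
	}

The BUILD_BUG_ON(VCLOCK_MAX >= 32) above is what keeps this one-bit-per-mode-in-an-int scheme from silently overflowing.
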