author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-15 12:32:27 -0400
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-15 12:32:27 -0400
commit | ba33ea811e1ff6726abb7f8f96df38c2d7b50304
tree | 29134e5cc7c19c8e520cb9336b476144d3d1252f /arch/x86/entry
parent | e23604edac2a7be6a8808a5d13fac6b9df4eb9a8
parent | d05004944206cbbf1c453e179768163731c7c6f1
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm updates from Ingo Molnar:
"This is another big update. Main changes are:
- lots of x86 system call (and other traps/exceptions) entry code
enhancements. In particular the complex parts of the 64-bit entry
code have been migrated to C code as well, and a number of dusty
corners have been refreshed. (Andy Lutomirski)
- vDSO special mapping robustification and general cleanups (Andy
Lutomirski)
- cpufeature refactoring, cleanups and speedups (Borislav Petkov)
- lots of other changes ..."
* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
x86/cpufeature: Enable new AVX-512 features
x86/entry/traps: Show unhandled signal for i386 in do_trap()
x86/entry: Call enter_from_user_mode() with IRQs off
x86/entry/32: Change INT80 to be an interrupt gate
x86/entry: Improve system call entry comments
x86/entry: Remove TIF_SINGLESTEP entry work
x86/entry/32: Add and check a stack canary for the SYSENTER stack
x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup
x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed
x86/entry: Vastly simplify SYSENTER TF (single-step) handling
x86/entry/traps: Clear DR6 early in do_debug() and improve the comment
x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions
x86/entry/32: Restore FLAGS on SYSEXIT
x86/entry/32: Filter NT and speed up AC filtering in SYSENTER
x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test
selftests/x86: In syscall_nt, test NT|TF as well
x86/asm-offsets: Remove PARAVIRT_enabled
x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled
uprobes: __create_xol_area() must nullify xol_mapping.fault
x86/cpufeature: Create a new synthetic cpu capability for machine check recovery
...
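
The first bullet above is easiest to see in arch/x86/entry/common.c: the 64-bit syscall slow path is now a C function, do_syscall_64(), which entry_SYSCALL_64 reaches via entry_SYSCALL64_slow_path whenever entry or exit work is pending. Reassembled here from the side-by-side hunks below as a reading aid (not an additional patch; all identifiers are the kernel's own from common.c), the new dispatcher is:

#ifdef CONFIG_X86_64
__visible void do_syscall_64(struct pt_regs *regs)
{
	struct thread_info *ti = pt_regs_to_thread_info(regs);
	unsigned long nr = regs->orig_ax;

	enter_from_user_mode();
	local_irq_enable();

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	/*
	 * NB: Native and x32 syscalls are dispatched from the same
	 * table.  The only functional difference is the x32 bit in
	 * regs->orig_ax, which changes the behavior of some syscalls.
	 */
	if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
		regs->ax = sys_call_table[nr & __SYSCALL_MASK](
			regs->di, regs->si, regs->dx,
			regs->r10, regs->r8, regs->r9);
	}

	syscall_return_slowpath(regs);
}
#endif

The assembly fast path in entry_64.S still dispatches through sys_call_table directly and only falls back to this function when the TIF work flags demand it, which is what lets the old tracesys/int_ret_from_sys_call assembly in the entry_64.S hunks below be deleted.
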
Diffstat (limited to 'arch/x86/entry')
-rw-r--r-- | arch/x86/entry/calling.h | 31
-rw-r--r-- | arch/x86/entry/common.c | 106
-rw-r--r-- | arch/x86/entry/entry_32.S | 268
-rw-r--r-- | arch/x86/entry/entry_64.S | 286
-rw-r--r-- | arch/x86/entry/entry_64_compat.S | 102
-rw-r--r-- | arch/x86/entry/syscall_32.c | 10
-rw-r--r-- | arch/x86/entry/syscall_64.c | 13
-rw-r--r-- | arch/x86/entry/syscalls/syscall_64.tbl | 20
-rw-r--r-- | arch/x86/entry/syscalls/syscalltbl.sh | 58
-rw-r--r-- | arch/x86/entry/vdso/vdso2c.h | 7
-rw-r--r-- | arch/x86/entry/vdso/vdso32-setup.c | 1
-rw-r--r-- | arch/x86/entry/vdso/vdso32/system_call.S | 2
-rw-r--r-- | arch/x86/entry/vdso/vma.c | 127
-rw-r--r-- | arch/x86/entry/vsyscall/vsyscall_gtod.c | 9
14 files changed, 566 insertions, 474 deletions
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index e32206e09868..9a9e5884066c 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
201 | .byte 0xf1 | 201 | .byte 0xf1 |
202 | .endm | 202 | .endm |
203 | 203 | ||
204 | #else /* CONFIG_X86_64 */ | ||
205 | |||
206 | /* | ||
207 | * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These | ||
208 | * are different from the entry_32.S versions in not changing the segment | ||
209 | * registers. So only suitable for in kernel use, not when transitioning | ||
210 | * from or to user space. The resulting stack frame is not a standard | ||
211 | * pt_regs frame. The main use case is calling C code from assembler | ||
212 | * when all the registers need to be preserved. | ||
213 | */ | ||
214 | |||
215 | .macro SAVE_ALL | ||
216 | pushl %eax | ||
217 | pushl %ebp | ||
218 | pushl %edi | ||
219 | pushl %esi | ||
220 | pushl %edx | ||
221 | pushl %ecx | ||
222 | pushl %ebx | ||
223 | .endm | ||
224 | |||
225 | .macro RESTORE_ALL | ||
226 | popl %ebx | ||
227 | popl %ecx | ||
228 | popl %edx | ||
229 | popl %esi | ||
230 | popl %edi | ||
231 | popl %ebp | ||
232 | popl %eax | ||
233 | .endm | ||
234 | |||
235 | #endif /* CONFIG_X86_64 */ | 204 | #endif /* CONFIG_X86_64 */ |
236 | 205 | ||
237 | /* | 206 | /* |
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 03663740c866..e79d93d44ecd 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -26,6 +26,7 @@
26 | #include <asm/traps.h> | 26 | #include <asm/traps.h> |
27 | #include <asm/vdso.h> | 27 | #include <asm/vdso.h> |
28 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
29 | #include <asm/cpufeature.h> | ||
29 | 30 | ||
30 | #define CREATE_TRACE_POINTS | 31 | #define CREATE_TRACE_POINTS |
31 | #include <trace/events/syscalls.h> | 32 | #include <trace/events/syscalls.h> |
@@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void)
44 | CT_WARN_ON(ct_state() != CONTEXT_USER); | 45 | CT_WARN_ON(ct_state() != CONTEXT_USER); |
45 | user_exit(); | 46 | user_exit(); |
46 | } | 47 | } |
48 | #else | ||
49 | static inline void enter_from_user_mode(void) {} | ||
47 | #endif | 50 | #endif |
48 | 51 | ||
49 | static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) | 52 | static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) |
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
84 | 87 | ||
85 | work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; | 88 | work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; |
86 | 89 | ||
87 | #ifdef CONFIG_CONTEXT_TRACKING | ||
88 | /* | ||
89 | * If TIF_NOHZ is set, we are required to call user_exit() before | ||
90 | * doing anything that could touch RCU. | ||
91 | */ | ||
92 | if (work & _TIF_NOHZ) { | ||
93 | enter_from_user_mode(); | ||
94 | work &= ~_TIF_NOHZ; | ||
95 | } | ||
96 | #endif | ||
97 | |||
98 | #ifdef CONFIG_SECCOMP | 90 | #ifdef CONFIG_SECCOMP |
99 | /* | 91 | /* |
100 | * Do seccomp first -- it should minimize exposure of other | 92 | * Do seccomp first -- it should minimize exposure of other |
@@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
171 | if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) | 163 | if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) |
172 | BUG_ON(regs != task_pt_regs(current)); | 164 | BUG_ON(regs != task_pt_regs(current)); |
173 | 165 | ||
174 | /* | ||
175 | * If we stepped into a sysenter/syscall insn, it trapped in | ||
176 | * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. | ||
177 | * If user-mode had set TF itself, then it's still clear from | ||
178 | * do_debug() and we need to set it again to restore the user | ||
179 | * state. If we entered on the slow path, TF was already set. | ||
180 | */ | ||
181 | if (work & _TIF_SINGLESTEP) | ||
182 | regs->flags |= X86_EFLAGS_TF; | ||
183 | |||
184 | #ifdef CONFIG_SECCOMP | 166 | #ifdef CONFIG_SECCOMP |
185 | /* | 167 | /* |
186 | * Call seccomp_phase2 before running the other hooks so that | 168 | * Call seccomp_phase2 before running the other hooks so that |
@@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
268 | /* Called with IRQs disabled. */ | 250 | /* Called with IRQs disabled. */ |
269 | __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) | 251 | __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) |
270 | { | 252 | { |
253 | struct thread_info *ti = pt_regs_to_thread_info(regs); | ||
271 | u32 cached_flags; | 254 | u32 cached_flags; |
272 | 255 | ||
273 | if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) | 256 | if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) |
@@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
275 | 258 | ||
276 | lockdep_sys_exit(); | 259 | lockdep_sys_exit(); |
277 | 260 | ||
278 | cached_flags = | 261 | cached_flags = READ_ONCE(ti->flags); |
279 | READ_ONCE(pt_regs_to_thread_info(regs)->flags); | ||
280 | 262 | ||
281 | if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) | 263 | if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) |
282 | exit_to_usermode_loop(regs, cached_flags); | 264 | exit_to_usermode_loop(regs, cached_flags); |
283 | 265 | ||
266 | #ifdef CONFIG_COMPAT | ||
267 | /* | ||
268 | * Compat syscalls set TS_COMPAT. Make sure we clear it before | ||
269 | * returning to user mode. We need to clear it *after* signal | ||
270 | * handling, because syscall restart has a fixup for compat | ||
271 | * syscalls. The fixup is exercised by the ptrace_syscall_32 | ||
272 | * selftest. | ||
273 | */ | ||
274 | ti->status &= ~TS_COMPAT; | ||
275 | #endif | ||
276 | |||
284 | user_enter(); | 277 | user_enter(); |
285 | } | 278 | } |
286 | 279 | ||
@@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
332 | if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) | 325 | if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) |
333 | syscall_slow_exit_work(regs, cached_flags); | 326 | syscall_slow_exit_work(regs, cached_flags); |
334 | 327 | ||
335 | #ifdef CONFIG_COMPAT | 328 | local_irq_disable(); |
329 | prepare_exit_to_usermode(regs); | ||
330 | } | ||
331 | |||
332 | #ifdef CONFIG_X86_64 | ||
333 | __visible void do_syscall_64(struct pt_regs *regs) | ||
334 | { | ||
335 | struct thread_info *ti = pt_regs_to_thread_info(regs); | ||
336 | unsigned long nr = regs->orig_ax; | ||
337 | |||
338 | enter_from_user_mode(); | ||
339 | local_irq_enable(); | ||
340 | |||
341 | if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) | ||
342 | nr = syscall_trace_enter(regs); | ||
343 | |||
336 | /* | 344 | /* |
337 | * Compat syscalls set TS_COMPAT. Make sure we clear it before | 345 | * NB: Native and x32 syscalls are dispatched from the same |
338 | * returning to user mode. | 346 | * table. The only functional difference is the x32 bit in |
347 | * regs->orig_ax, which changes the behavior of some syscalls. | ||
339 | */ | 348 | */ |
340 | ti->status &= ~TS_COMPAT; | 349 | if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { |
341 | #endif | 350 | regs->ax = sys_call_table[nr & __SYSCALL_MASK]( |
351 | regs->di, regs->si, regs->dx, | ||
352 | regs->r10, regs->r8, regs->r9); | ||
353 | } | ||
342 | 354 | ||
343 | local_irq_disable(); | 355 | syscall_return_slowpath(regs); |
344 | prepare_exit_to_usermode(regs); | ||
345 | } | 356 | } |
357 | #endif | ||
346 | 358 | ||
347 | #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) | 359 | #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) |
348 | /* | 360 | /* |
349 | * Does a 32-bit syscall. Called with IRQs on and does all entry and | 361 | * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does |
350 | * exit work and returns with IRQs off. This function is extremely hot | 362 | * all entry and exit work and returns with IRQs off. This function is |
351 | * in workloads that use it, and it's usually called from | 363 | * extremely hot in workloads that use it, and it's usually called from |
352 | * do_fast_syscall_32, so forcibly inline it to improve performance. | 364 | * do_fast_syscall_32, so forcibly inline it to improve performance. |
353 | */ | 365 | */ |
354 | #ifdef CONFIG_X86_32 | 366 | static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) |
355 | /* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ | ||
356 | __visible | ||
357 | #else | ||
358 | /* 64-bit kernels use do_syscall_32_irqs_off() instead. */ | ||
359 | static | ||
360 | #endif | ||
361 | __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) | ||
362 | { | 367 | { |
363 | struct thread_info *ti = pt_regs_to_thread_info(regs); | 368 | struct thread_info *ti = pt_regs_to_thread_info(regs); |
364 | unsigned int nr = (unsigned int)regs->orig_ax; | 369 | unsigned int nr = (unsigned int)regs->orig_ax; |
@@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
393 | syscall_return_slowpath(regs); | 398 | syscall_return_slowpath(regs); |
394 | } | 399 | } |
395 | 400 | ||
396 | #ifdef CONFIG_X86_64 | 401 | /* Handles int $0x80 */ |
397 | /* Handles INT80 on 64-bit kernels */ | 402 | __visible void do_int80_syscall_32(struct pt_regs *regs) |
398 | __visible void do_syscall_32_irqs_off(struct pt_regs *regs) | ||
399 | { | 403 | { |
404 | enter_from_user_mode(); | ||
400 | local_irq_enable(); | 405 | local_irq_enable(); |
401 | do_syscall_32_irqs_on(regs); | 406 | do_syscall_32_irqs_on(regs); |
402 | } | 407 | } |
403 | #endif | ||
404 | 408 | ||
405 | /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ | 409 | /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ |
406 | __visible long do_fast_syscall_32(struct pt_regs *regs) | 410 | __visible long do_fast_syscall_32(struct pt_regs *regs) |
@@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
420 | */ | 424 | */ |
421 | regs->ip = landing_pad; | 425 | regs->ip = landing_pad; |
422 | 426 | ||
423 | /* | 427 | enter_from_user_mode(); |
424 | * Fetch EBP from where the vDSO stashed it. | 428 | |
425 | * | ||
426 | * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! | ||
427 | */ | ||
428 | local_irq_enable(); | 429 | local_irq_enable(); |
430 | |||
431 | /* Fetch EBP from where the vDSO stashed it. */ | ||
429 | if ( | 432 | if ( |
430 | #ifdef CONFIG_X86_64 | 433 | #ifdef CONFIG_X86_64 |
431 | /* | 434 | /* |
@@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
443 | /* User code screwed up. */ | 446 | /* User code screwed up. */ |
444 | local_irq_disable(); | 447 | local_irq_disable(); |
445 | regs->ax = -EFAULT; | 448 | regs->ax = -EFAULT; |
446 | #ifdef CONFIG_CONTEXT_TRACKING | ||
447 | enter_from_user_mode(); | ||
448 | #endif | ||
449 | prepare_exit_to_usermode(regs); | 449 | prepare_exit_to_usermode(regs); |
450 | return 0; /* Keep it simple: use IRET. */ | 450 | return 0; /* Keep it simple: use IRET. */ |
451 | } | 451 | } |
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index bb3e376d0f33..10868aa734dc 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -40,7 +40,7 @@
40 | #include <asm/processor-flags.h> | 40 | #include <asm/processor-flags.h> |
41 | #include <asm/ftrace.h> | 41 | #include <asm/ftrace.h> |
42 | #include <asm/irq_vectors.h> | 42 | #include <asm/irq_vectors.h> |
43 | #include <asm/cpufeature.h> | 43 | #include <asm/cpufeatures.h> |
44 | #include <asm/alternative-asm.h> | 44 | #include <asm/alternative-asm.h> |
45 | #include <asm/asm.h> | 45 | #include <asm/asm.h> |
46 | #include <asm/smap.h> | 46 | #include <asm/smap.h> |
@@ -287,14 +287,64 @@ need_resched:
287 | END(resume_kernel) | 287 | END(resume_kernel) |
288 | #endif | 288 | #endif |
289 | 289 | ||
290 | # SYSENTER call handler stub | 290 | GLOBAL(__begin_SYSENTER_singlestep_region) |
291 | /* | ||
292 | * All code from here through __end_SYSENTER_singlestep_region is subject | ||
293 | * to being single-stepped if a user program sets TF and executes SYSENTER. | ||
294 | * There is absolutely nothing that we can do to prevent this from happening | ||
295 | * (thanks Intel!). To keep our handling of this situation as simple as | ||
296 | * possible, we handle TF just like AC and NT, except that our #DB handler | ||
297 | * will ignore all of the single-step traps generated in this range. | ||
298 | */ | ||
299 | |||
300 | #ifdef CONFIG_XEN | ||
301 | /* | ||
302 | * Xen doesn't set %esp to be precisely what the normal SYSENTER | ||
303 | * entry point expects, so fix it up before using the normal path. | ||
304 | */ | ||
305 | ENTRY(xen_sysenter_target) | ||
306 | addl $5*4, %esp /* remove xen-provided frame */ | ||
307 | jmp sysenter_past_esp | ||
308 | #endif | ||
309 | |||
310 | /* | ||
311 | * 32-bit SYSENTER entry. | ||
312 | * | ||
313 | * 32-bit system calls through the vDSO's __kernel_vsyscall enter here | ||
314 | * if X86_FEATURE_SEP is available. This is the preferred system call | ||
315 | * entry on 32-bit systems. | ||
316 | * | ||
317 | * The SYSENTER instruction, in principle, should *only* occur in the | ||
318 | * vDSO. In practice, a small number of Android devices were shipped | ||
319 | * with a copy of Bionic that inlined a SYSENTER instruction. This | ||
320 | * never happened in any of Google's Bionic versions -- it only happened | ||
321 | * in a narrow range of Intel-provided versions. | ||
322 | * | ||
323 | * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs. | ||
324 | * IF and VM in RFLAGS are cleared (IOW: interrupts are off). | ||
325 | * SYSENTER does not save anything on the stack, | ||
326 | * and does not save old EIP (!!!), ESP, or EFLAGS. | ||
327 | * | ||
328 | * To avoid losing track of EFLAGS.VM (and thus potentially corrupting | ||
329 | * user and/or vm86 state), we explicitly disable the SYSENTER | ||
330 | * instruction in vm86 mode by reprogramming the MSRs. | ||
331 | * | ||
332 | * Arguments: | ||
333 | * eax system call number | ||
334 | * ebx arg1 | ||
335 | * ecx arg2 | ||
336 | * edx arg3 | ||
337 | * esi arg4 | ||
338 | * edi arg5 | ||
339 | * ebp user stack | ||
340 | * 0(%ebp) arg6 | ||
341 | */ | ||
291 | ENTRY(entry_SYSENTER_32) | 342 | ENTRY(entry_SYSENTER_32) |
292 | movl TSS_sysenter_sp0(%esp), %esp | 343 | movl TSS_sysenter_sp0(%esp), %esp |
293 | sysenter_past_esp: | 344 | sysenter_past_esp: |
294 | pushl $__USER_DS /* pt_regs->ss */ | 345 | pushl $__USER_DS /* pt_regs->ss */ |
295 | pushl %ebp /* pt_regs->sp (stashed in bp) */ | 346 | pushl %ebp /* pt_regs->sp (stashed in bp) */ |
296 | pushfl /* pt_regs->flags (except IF = 0) */ | 347 | pushfl /* pt_regs->flags (except IF = 0) */ |
297 | ASM_CLAC /* Clear AC after saving FLAGS */ | ||
298 | orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ | 348 | orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ |
299 | pushl $__USER_CS /* pt_regs->cs */ | 349 | pushl $__USER_CS /* pt_regs->cs */ |
300 | pushl $0 /* pt_regs->ip = 0 (placeholder) */ | 350 | pushl $0 /* pt_regs->ip = 0 (placeholder) */ |
@@ -302,6 +352,29 @@ sysenter_past_esp:
302 | SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ | 352 | SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ |
303 | 353 | ||
304 | /* | 354 | /* |
355 | * SYSENTER doesn't filter flags, so we need to clear NT, AC | ||
356 | * and TF ourselves. To save a few cycles, we can check whether | ||
357 | * either was set instead of doing an unconditional popfq. | ||
358 | * This needs to happen before enabling interrupts so that | ||
359 | * we don't get preempted with NT set. | ||
360 | * | ||
361 | * If TF is set, we will single-step all the way to here -- do_debug | ||
362 | * will ignore all the traps. (Yes, this is slow, but so is | ||
363 | * single-stepping in general. This allows us to avoid having | ||
364 | * a more complicated code to handle the case where a user program | ||
365 | * forces us to single-step through the SYSENTER entry code.) | ||
366 | * | ||
367 | * NB.: .Lsysenter_fix_flags is a label with the code under it moved | ||
368 | * out-of-line as an optimization: NT is unlikely to be set in the | ||
369 | * majority of the cases and instead of polluting the I$ unnecessarily, | ||
370 | * we're keeping that code behind a branch which will predict as | ||
371 | * not-taken and therefore its instructions won't be fetched. | ||
372 | */ | ||
373 | testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp) | ||
374 | jnz .Lsysenter_fix_flags | ||
375 | .Lsysenter_flags_fixed: | ||
376 | |||
377 | /* | ||
305 | * User mode is traced as though IRQs are on, and SYSENTER | 378 | * User mode is traced as though IRQs are on, and SYSENTER |
306 | * turned them off. | 379 | * turned them off. |
307 | */ | 380 | */ |
@@ -327,6 +400,15 @@ sysenter_past_esp:
327 | popl %eax /* pt_regs->ax */ | 400 | popl %eax /* pt_regs->ax */ |
328 | 401 | ||
329 | /* | 402 | /* |
403 | * Restore all flags except IF. (We restore IF separately because | ||
404 | * STI gives a one-instruction window in which we won't be interrupted, | ||
405 | * whereas POPF does not.) | ||
406 | */ | ||
407 | addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ | ||
408 | btr $X86_EFLAGS_IF_BIT, (%esp) | ||
409 | popfl | ||
410 | |||
411 | /* | ||
330 | * Return back to the vDSO, which will pop ecx and edx. | 412 | * Return back to the vDSO, which will pop ecx and edx. |
331 | * Don't bother with DS and ES (they already contain __USER_DS). | 413 | * Don't bother with DS and ES (they already contain __USER_DS). |
332 | */ | 414 | */ |
@@ -339,28 +421,63 @@ sysenter_past_esp:
339 | .popsection | 421 | .popsection |
340 | _ASM_EXTABLE(1b, 2b) | 422 | _ASM_EXTABLE(1b, 2b) |
341 | PTGS_TO_GS_EX | 423 | PTGS_TO_GS_EX |
424 | |||
425 | .Lsysenter_fix_flags: | ||
426 | pushl $X86_EFLAGS_FIXED | ||
427 | popfl | ||
428 | jmp .Lsysenter_flags_fixed | ||
429 | GLOBAL(__end_SYSENTER_singlestep_region) | ||
342 | ENDPROC(entry_SYSENTER_32) | 430 | ENDPROC(entry_SYSENTER_32) |
343 | 431 | ||
344 | # system call handler stub | 432 | /* |
433 | * 32-bit legacy system call entry. | ||
434 | * | ||
435 | * 32-bit x86 Linux system calls traditionally used the INT $0x80 | ||
436 | * instruction. INT $0x80 lands here. | ||
437 | * | ||
438 | * This entry point can be used by any 32-bit program to perform system calls. | ||
439 | * Instances of INT $0x80 can be found inline in various programs and | ||
440 | * libraries. It is also used by the vDSO's __kernel_vsyscall | ||
441 | * fallback for hardware that doesn't support a faster entry method. | ||
442 | * Restarted 32-bit system calls also fall back to INT $0x80 | ||
443 | * regardless of what instruction was originally used to do the system | ||
444 | * call. (64-bit programs can use INT $0x80 as well, but they can | ||
445 | * only run on 64-bit kernels and therefore land in | ||
446 | * entry_INT80_compat.) | ||
447 | * | ||
448 | * This is considered a slow path. It is not used by most libc | ||
449 | * implementations on modern hardware except during process startup. | ||
450 | * | ||
451 | * Arguments: | ||
452 | * eax system call number | ||
453 | * ebx arg1 | ||
454 | * ecx arg2 | ||
455 | * edx arg3 | ||
456 | * esi arg4 | ||
457 | * edi arg5 | ||
458 | * ebp arg6 | ||
459 | */ | ||
345 | ENTRY(entry_INT80_32) | 460 | ENTRY(entry_INT80_32) |
346 | ASM_CLAC | 461 | ASM_CLAC |
347 | pushl %eax /* pt_regs->orig_ax */ | 462 | pushl %eax /* pt_regs->orig_ax */ |
348 | SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ | 463 | SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ |
349 | 464 | ||
350 | /* | 465 | /* |
351 | * User mode is traced as though IRQs are on. Unlike the 64-bit | 466 | * User mode is traced as though IRQs are on, and the interrupt gate |
352 | * case, INT80 is a trap gate on 32-bit kernels, so interrupts | 467 | * turned them off. |
353 | * are already on (unless user code is messing around with iopl). | ||
354 | */ | 468 | */ |
469 | TRACE_IRQS_OFF | ||
355 | 470 | ||
356 | movl %esp, %eax | 471 | movl %esp, %eax |
357 | call do_syscall_32_irqs_on | 472 | call do_int80_syscall_32 |
358 | .Lsyscall_32_done: | 473 | .Lsyscall_32_done: |
359 | 474 | ||
360 | restore_all: | 475 | restore_all: |
361 | TRACE_IRQS_IRET | 476 | TRACE_IRQS_IRET |
362 | restore_all_notrace: | 477 | restore_all_notrace: |
363 | #ifdef CONFIG_X86_ESPFIX32 | 478 | #ifdef CONFIG_X86_ESPFIX32 |
479 | ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX | ||
480 | |||
364 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS | 481 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS |
365 | /* | 482 | /* |
366 | * Warning: PT_OLDSS(%esp) contains the wrong/random values if we | 483 | * Warning: PT_OLDSS(%esp) contains the wrong/random values if we |
@@ -387,19 +504,6 @@ ENTRY(iret_exc )
387 | 504 | ||
388 | #ifdef CONFIG_X86_ESPFIX32 | 505 | #ifdef CONFIG_X86_ESPFIX32 |
389 | ldt_ss: | 506 | ldt_ss: |
390 | #ifdef CONFIG_PARAVIRT | ||
391 | /* | ||
392 | * The kernel can't run on a non-flat stack if paravirt mode | ||
393 | * is active. Rather than try to fixup the high bits of | ||
394 | * ESP, bypass this code entirely. This may break DOSemu | ||
395 | * and/or Wine support in a paravirt VM, although the option | ||
396 | * is still available to implement the setting of the high | ||
397 | * 16-bits in the INTERRUPT_RETURN paravirt-op. | ||
398 | */ | ||
399 | cmpl $0, pv_info+PARAVIRT_enabled | ||
400 | jne restore_nocheck | ||
401 | #endif | ||
402 | |||
403 | /* | 507 | /* |
404 | * Setup and switch to ESPFIX stack | 508 | * Setup and switch to ESPFIX stack |
405 | * | 509 | * |
@@ -632,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
632 | END(spurious_interrupt_bug) | 736 | END(spurious_interrupt_bug) |
633 | 737 | ||
634 | #ifdef CONFIG_XEN | 738 | #ifdef CONFIG_XEN |
635 | /* | ||
636 | * Xen doesn't set %esp to be precisely what the normal SYSENTER | ||
637 | * entry point expects, so fix it up before using the normal path. | ||
638 | */ | ||
639 | ENTRY(xen_sysenter_target) | ||
640 | addl $5*4, %esp /* remove xen-provided frame */ | ||
641 | jmp sysenter_past_esp | ||
642 | |||
643 | ENTRY(xen_hypervisor_callback) | 739 | ENTRY(xen_hypervisor_callback) |
644 | pushl $-1 /* orig_ax = -1 => not a system call */ | 740 | pushl $-1 /* orig_ax = -1 => not a system call */ |
645 | SAVE_ALL | 741 | SAVE_ALL |
@@ -939,51 +1035,48 @@ error_code:
939 | jmp ret_from_exception | 1035 | jmp ret_from_exception |
940 | END(page_fault) | 1036 | END(page_fault) |
941 | 1037 | ||
942 | /* | ||
943 | * Debug traps and NMI can happen at the one SYSENTER instruction | ||
944 | * that sets up the real kernel stack. Check here, since we can't | ||
945 | * allow the wrong stack to be used. | ||
946 | * | ||
947 | * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have | ||
948 | * already pushed 3 words if it hits on the sysenter instruction: | ||
949 | * eflags, cs and eip. | ||
950 | * | ||
951 | * We just load the right stack, and push the three (known) values | ||
952 | * by hand onto the new stack - while updating the return eip past | ||
953 | * the instruction that would have done it for sysenter. | ||
954 | */ | ||
955 | .macro FIX_STACK offset ok label | ||
956 | cmpw $__KERNEL_CS, 4(%esp) | ||
957 | jne \ok | ||
958 | \label: | ||
959 | movl TSS_sysenter_sp0 + \offset(%esp), %esp | ||
960 | pushfl | ||
961 | pushl $__KERNEL_CS | ||
962 | pushl $sysenter_past_esp | ||
963 | .endm | ||
964 | |||
965 | ENTRY(debug) | 1038 | ENTRY(debug) |
1039 | /* | ||
1040 | * #DB can happen at the first instruction of | ||
1041 | * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this | ||
1042 | * happens, then we will be running on a very small stack. We | ||
1043 | * need to detect this condition and switch to the thread | ||
1044 | * stack before calling any C code at all. | ||
1045 | * | ||
1046 | * If you edit this code, keep in mind that NMIs can happen in here. | ||
1047 | */ | ||
966 | ASM_CLAC | 1048 | ASM_CLAC |
967 | cmpl $entry_SYSENTER_32, (%esp) | ||
968 | jne debug_stack_correct | ||
969 | FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn | ||
970 | debug_stack_correct: | ||
971 | pushl $-1 # mark this as an int | 1049 | pushl $-1 # mark this as an int |
972 | SAVE_ALL | 1050 | SAVE_ALL |
973 | TRACE_IRQS_OFF | ||
974 | xorl %edx, %edx # error code 0 | 1051 | xorl %edx, %edx # error code 0 |
975 | movl %esp, %eax # pt_regs pointer | 1052 | movl %esp, %eax # pt_regs pointer |
1053 | |||
1054 | /* Are we currently on the SYSENTER stack? */ | ||
1055 | PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) | ||
1056 | subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ | ||
1057 | cmpl $SIZEOF_SYSENTER_stack, %ecx | ||
1058 | jb .Ldebug_from_sysenter_stack | ||
1059 | |||
1060 | TRACE_IRQS_OFF | ||
1061 | call do_debug | ||
1062 | jmp ret_from_exception | ||
1063 | |||
1064 | .Ldebug_from_sysenter_stack: | ||
1065 | /* We're on the SYSENTER stack. Switch off. */ | ||
1066 | movl %esp, %ebp | ||
1067 | movl PER_CPU_VAR(cpu_current_top_of_stack), %esp | ||
1068 | TRACE_IRQS_OFF | ||
976 | call do_debug | 1069 | call do_debug |
1070 | movl %ebp, %esp | ||
977 | jmp ret_from_exception | 1071 | jmp ret_from_exception |
978 | END(debug) | 1072 | END(debug) |
979 | 1073 | ||
980 | /* | 1074 | /* |
981 | * NMI is doubly nasty. It can happen _while_ we're handling | 1075 | * NMI is doubly nasty. It can happen on the first instruction of |
982 | * a debug fault, and the debug fault hasn't yet been able to | 1076 | * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning |
983 | * clear up the stack. So we first check whether we got an | 1077 | * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 |
984 | * NMI on the sysenter entry path, but after that we need to | 1078 | * switched stacks. We handle both conditions by simply checking whether we |
985 | * check whether we got an NMI on the debug path where the debug | 1079 | * interrupted kernel code running on the SYSENTER stack. |
986 | * fault happened on the sysenter path. | ||
987 | */ | 1080 | */ |
988 | ENTRY(nmi) | 1081 | ENTRY(nmi) |
989 | ASM_CLAC | 1082 | ASM_CLAC |
@@ -994,41 +1087,32 @@ ENTRY(nmi)
994 | popl %eax | 1087 | popl %eax |
995 | je nmi_espfix_stack | 1088 | je nmi_espfix_stack |
996 | #endif | 1089 | #endif |
997 | cmpl $entry_SYSENTER_32, (%esp) | 1090 | |
998 | je nmi_stack_fixup | 1091 | pushl %eax # pt_regs->orig_ax |
999 | pushl %eax | ||
1000 | movl %esp, %eax | ||
1001 | /* | ||
1002 | * Do not access memory above the end of our stack page, | ||
1003 | * it might not exist. | ||
1004 | */ | ||
1005 | andl $(THREAD_SIZE-1), %eax | ||
1006 | cmpl $(THREAD_SIZE-20), %eax | ||
1007 | popl %eax | ||
1008 | jae nmi_stack_correct | ||
1009 | cmpl $entry_SYSENTER_32, 12(%esp) | ||
1010 | je nmi_debug_stack_check | ||
1011 | nmi_stack_correct: | ||
1012 | pushl %eax | ||
1013 | SAVE_ALL | 1092 | SAVE_ALL |
1014 | xorl %edx, %edx # zero error code | 1093 | xorl %edx, %edx # zero error code |
1015 | movl %esp, %eax # pt_regs pointer | 1094 | movl %esp, %eax # pt_regs pointer |
1095 | |||
1096 | /* Are we currently on the SYSENTER stack? */ | ||
1097 | PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) | ||
1098 | subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ | ||
1099 | cmpl $SIZEOF_SYSENTER_stack, %ecx | ||
1100 | jb .Lnmi_from_sysenter_stack | ||
1101 | |||
1102 | /* Not on SYSENTER stack. */ | ||
1016 | call do_nmi | 1103 | call do_nmi |
1017 | jmp restore_all_notrace | 1104 | jmp restore_all_notrace |
1018 | 1105 | ||
1019 | nmi_stack_fixup: | 1106 | .Lnmi_from_sysenter_stack: |
1020 | FIX_STACK 12, nmi_stack_correct, 1 | 1107 | /* |
1021 | jmp nmi_stack_correct | 1108 | * We're on the SYSENTER stack. Switch off. No one (not even debug) |
1022 | 1109 | * is using the thread stack right now, so it's safe for us to use it. | |
1023 | nmi_debug_stack_check: | 1110 | */ |
1024 | cmpw $__KERNEL_CS, 16(%esp) | 1111 | movl %esp, %ebp |
1025 | jne nmi_stack_correct | 1112 | movl PER_CPU_VAR(cpu_current_top_of_stack), %esp |
1026 | cmpl $debug, (%esp) | 1113 | call do_nmi |
1027 | jb nmi_stack_correct | 1114 | movl %ebp, %esp |
1028 | cmpl $debug_esp_fix_insn, (%esp) | 1115 | jmp restore_all_notrace |
1029 | ja nmi_stack_correct | ||
1030 | FIX_STACK 24, nmi_stack_correct, 1 | ||
1031 | jmp nmi_stack_correct | ||
1032 | 1116 | ||
1033 | #ifdef CONFIG_X86_ESPFIX32 | 1117 | #ifdef CONFIG_X86_ESPFIX32 |
1034 | nmi_espfix_stack: | 1118 | nmi_espfix_stack: |
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9d34d3cfceb6..858b555e274b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
103 | /* | 103 | /* |
104 | * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. | 104 | * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. |
105 | * | 105 | * |
106 | * This is the only entry point used for 64-bit system calls. The | ||
107 | * hardware interface is reasonably well designed and the register to | ||
108 | * argument mapping Linux uses fits well with the registers that are | ||
109 | * available when SYSCALL is used. | ||
110 | * | ||
111 | * SYSCALL instructions can be found inlined in libc implementations as | ||
112 | * well as some other programs and libraries. There are also a handful | ||
113 | * of SYSCALL instructions in the vDSO used, for example, as a | ||
114 | * clock_gettimeofday fallback. | ||
115 | * | ||
106 | * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, | 116 | * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
107 | * then loads new ss, cs, and rip from previously programmed MSRs. | 117 | * then loads new ss, cs, and rip from previously programmed MSRs. |
108 | * rflags gets masked by a value from another MSR (so CLD and CLAC | 118 | * rflags gets masked by a value from another MSR (so CLD and CLAC |
@@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
145 | movq %rsp, PER_CPU_VAR(rsp_scratch) | 155 | movq %rsp, PER_CPU_VAR(rsp_scratch) |
146 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp | 156 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
147 | 157 | ||
158 | TRACE_IRQS_OFF | ||
159 | |||
148 | /* Construct struct pt_regs on stack */ | 160 | /* Construct struct pt_regs on stack */ |
149 | pushq $__USER_DS /* pt_regs->ss */ | 161 | pushq $__USER_DS /* pt_regs->ss */ |
150 | pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ | 162 | pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ |
151 | /* | ||
152 | * Re-enable interrupts. | ||
153 | * We use 'rsp_scratch' as a scratch space, hence irq-off block above | ||
154 | * must execute atomically in the face of possible interrupt-driven | ||
155 | * task preemption. We must enable interrupts only after we're done | ||
156 | * with using rsp_scratch: | ||
157 | */ | ||
158 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
159 | pushq %r11 /* pt_regs->flags */ | 163 | pushq %r11 /* pt_regs->flags */ |
160 | pushq $__USER_CS /* pt_regs->cs */ | 164 | pushq $__USER_CS /* pt_regs->cs */ |
161 | pushq %rcx /* pt_regs->ip */ | 165 | pushq %rcx /* pt_regs->ip */ |
@@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
171 | pushq %r11 /* pt_regs->r11 */ | 175 | pushq %r11 /* pt_regs->r11 */ |
172 | sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ | 176 | sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ |
173 | 177 | ||
174 | testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) | 178 | /* |
175 | jnz tracesys | 179 | * If we need to do entry work or if we guess we'll need to do |
180 | * exit work, go straight to the slow path. | ||
181 | */ | ||
182 | testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) | ||
183 | jnz entry_SYSCALL64_slow_path | ||
184 | |||
176 | entry_SYSCALL_64_fastpath: | 185 | entry_SYSCALL_64_fastpath: |
186 | /* | ||
187 | * Easy case: enable interrupts and issue the syscall. If the syscall | ||
188 | * needs pt_regs, we'll call a stub that disables interrupts again | ||
189 | * and jumps to the slow path. | ||
190 | */ | ||
191 | TRACE_IRQS_ON | ||
192 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
177 | #if __SYSCALL_MASK == ~0 | 193 | #if __SYSCALL_MASK == ~0 |
178 | cmpq $__NR_syscall_max, %rax | 194 | cmpq $__NR_syscall_max, %rax |
179 | #else | 195 | #else |
@@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath:
182 | #endif | 198 | #endif |
183 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ | 199 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ |
184 | movq %r10, %rcx | 200 | movq %r10, %rcx |
201 | |||
202 | /* | ||
203 | * This call instruction is handled specially in stub_ptregs_64. | ||
204 | * It might end up jumping to the slow path. If it jumps, RAX | ||
205 | * and all argument registers are clobbered. | ||
206 | */ | ||
185 | call *sys_call_table(, %rax, 8) | 207 | call *sys_call_table(, %rax, 8) |
208 | .Lentry_SYSCALL_64_after_fastpath_call: | ||
209 | |||
186 | movq %rax, RAX(%rsp) | 210 | movq %rax, RAX(%rsp) |
187 | 1: | 211 | 1: |
188 | /* | ||
189 | * Syscall return path ending with SYSRET (fast path). | ||
190 | * Has incompletely filled pt_regs. | ||
191 | */ | ||
192 | LOCKDEP_SYS_EXIT | ||
193 | /* | ||
194 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
195 | * it is too small to ever cause noticeable irq latency. | ||
196 | */ | ||
197 | DISABLE_INTERRUPTS(CLBR_NONE) | ||
198 | 212 | ||
199 | /* | 213 | /* |
200 | * We must check ti flags with interrupts (or at least preemption) | 214 | * If we get here, then we know that pt_regs is clean for SYSRET64. |
201 | * off because we must *never* return to userspace without | 215 | * If we see that no exit work is required (which we are required |
202 | * processing exit work that is enqueued if we're preempted here. | 216 | * to check with IRQs off), then we can go straight to SYSRET64. |
203 | * In particular, returning to userspace with any of the one-shot | ||
204 | * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is | ||
205 | * very bad. | ||
206 | */ | 217 | */ |
218 | DISABLE_INTERRUPTS(CLBR_NONE) | ||
219 | TRACE_IRQS_OFF | ||
207 | testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) | 220 | testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
208 | jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ | 221 | jnz 1f |
209 | 222 | ||
210 | RESTORE_C_REGS_EXCEPT_RCX_R11 | 223 | LOCKDEP_SYS_EXIT |
224 | TRACE_IRQS_ON /* user mode is traced as IRQs on */ | ||
211 | movq RIP(%rsp), %rcx | 225 | movq RIP(%rsp), %rcx |
212 | movq EFLAGS(%rsp), %r11 | 226 | movq EFLAGS(%rsp), %r11 |
227 | RESTORE_C_REGS_EXCEPT_RCX_R11 | ||
213 | movq RSP(%rsp), %rsp | 228 | movq RSP(%rsp), %rsp |
214 | /* | ||
215 | * 64-bit SYSRET restores rip from rcx, | ||
216 | * rflags from r11 (but RF and VM bits are forced to 0), | ||
217 | * cs and ss are loaded from MSRs. | ||
218 | * Restoration of rflags re-enables interrupts. | ||
219 | * | ||
220 | * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss | ||
221 | * descriptor is not reinitialized. This means that we should | ||
222 | * avoid SYSRET with SS == NULL, which could happen if we schedule, | ||
223 | * exit the kernel, and re-enter using an interrupt vector. (All | ||
224 | * interrupt entries on x86_64 set SS to NULL.) We prevent that | ||
225 | * from happening by reloading SS in __switch_to. (Actually | ||
226 | * detecting the failure in 64-bit userspace is tricky but can be | ||
227 | * done.) | ||
228 | */ | ||
229 | USERGS_SYSRET64 | 229 | USERGS_SYSRET64 |
230 | 230 | ||
231 | GLOBAL(int_ret_from_sys_call_irqs_off) | 231 | 1: |
232 | /* | ||
233 | * The fast path looked good when we started, but something changed | ||
234 | * along the way and we need to switch to the slow path. Calling | ||
235 | * raise(3) will trigger this, for example. IRQs are off. | ||
236 | */ | ||
232 | TRACE_IRQS_ON | 237 | TRACE_IRQS_ON |
233 | ENABLE_INTERRUPTS(CLBR_NONE) | 238 | ENABLE_INTERRUPTS(CLBR_NONE) |
234 | jmp int_ret_from_sys_call | ||
235 | |||
236 | /* Do syscall entry tracing */ | ||
237 | tracesys: | ||
238 | movq %rsp, %rdi | ||
239 | movl $AUDIT_ARCH_X86_64, %esi | ||
240 | call syscall_trace_enter_phase1 | ||
241 | test %rax, %rax | ||
242 | jnz tracesys_phase2 /* if needed, run the slow path */ | ||
243 | RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ | ||
244 | movq ORIG_RAX(%rsp), %rax | ||
245 | jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ | ||
246 | |||
247 | tracesys_phase2: | ||
248 | SAVE_EXTRA_REGS | 239 | SAVE_EXTRA_REGS |
249 | movq %rsp, %rdi | 240 | movq %rsp, %rdi |
250 | movl $AUDIT_ARCH_X86_64, %esi | 241 | call syscall_return_slowpath /* returns with IRQs disabled */ |
251 | movq %rax, %rdx | 242 | jmp return_from_SYSCALL_64 |
252 | call syscall_trace_enter_phase2 | ||
253 | |||
254 | /* | ||
255 | * Reload registers from stack in case ptrace changed them. | ||
256 | * We don't reload %rax because syscall_trace_entry_phase2() returned | ||
257 | * the value it wants us to use in the table lookup. | ||
258 | */ | ||
259 | RESTORE_C_REGS_EXCEPT_RAX | ||
260 | RESTORE_EXTRA_REGS | ||
261 | #if __SYSCALL_MASK == ~0 | ||
262 | cmpq $__NR_syscall_max, %rax | ||
263 | #else | ||
264 | andl $__SYSCALL_MASK, %eax | ||
265 | cmpl $__NR_syscall_max, %eax | ||
266 | #endif | ||
267 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ | ||
268 | movq %r10, %rcx /* fixup for C */ | ||
269 | call *sys_call_table(, %rax, 8) | ||
270 | movq %rax, RAX(%rsp) | ||
271 | 1: | ||
272 | /* Use IRET because user could have changed pt_regs->foo */ | ||
273 | 243 | ||
274 | /* | 244 | entry_SYSCALL64_slow_path: |
275 | * Syscall return path ending with IRET. | 245 | /* IRQs are off. */ |
276 | * Has correct iret frame. | ||
277 | */ | ||
278 | GLOBAL(int_ret_from_sys_call) | ||
279 | SAVE_EXTRA_REGS | 246 | SAVE_EXTRA_REGS |
280 | movq %rsp, %rdi | 247 | movq %rsp, %rdi |
281 | call syscall_return_slowpath /* returns with IRQs disabled */ | 248 | call do_syscall_64 /* returns with IRQs disabled */ |
249 | |||
250 | return_from_SYSCALL_64: | ||
282 | RESTORE_EXTRA_REGS | 251 | RESTORE_EXTRA_REGS |
283 | TRACE_IRQS_IRETQ /* we're about to change IF */ | 252 | TRACE_IRQS_IRETQ /* we're about to change IF */ |
284 | 253 | ||
@@ -355,83 +324,45 @@ opportunistic_sysret_failed:
355 | jmp restore_c_regs_and_iret | 324 | jmp restore_c_regs_and_iret |
356 | END(entry_SYSCALL_64) | 325 | END(entry_SYSCALL_64) |
357 | 326 | ||
327 | ENTRY(stub_ptregs_64) | ||
328 | /* | ||
329 | * Syscalls marked as needing ptregs land here. | ||
330 | * If we are on the fast path, we need to save the extra regs, | ||
331 | * which we achieve by trying again on the slow path. If we are on | ||
332 | * the slow path, the extra regs are already saved. | ||
333 | * | ||
334 | * RAX stores a pointer to the C function implementing the syscall. | ||
335 | * IRQs are on. | ||
336 | */ | ||
337 | cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) | ||
338 | jne 1f | ||
358 | 339 | ||
359 | .macro FORK_LIKE func | ||
360 | ENTRY(stub_\func) | ||
361 | SAVE_EXTRA_REGS 8 | ||
362 | jmp sys_\func | ||
363 | END(stub_\func) | ||
364 | .endm | ||
365 | |||
366 | FORK_LIKE clone | ||
367 | FORK_LIKE fork | ||
368 | FORK_LIKE vfork | ||
369 | |||
370 | ENTRY(stub_execve) | ||
371 | call sys_execve | ||
372 | return_from_execve: | ||
373 | testl %eax, %eax | ||
374 | jz 1f | ||
375 | /* exec failed, can use fast SYSRET code path in this case */ | ||
376 | ret | ||
377 | 1: | ||
378 | /* must use IRET code path (pt_regs->cs may have changed) */ | ||
379 | addq $8, %rsp | ||
380 | ZERO_EXTRA_REGS | ||
381 | movq %rax, RAX(%rsp) | ||
382 | jmp int_ret_from_sys_call | ||
383 | END(stub_execve) | ||
384 | /* | ||
385 | * Remaining execve stubs are only 7 bytes long. | ||
386 | * ENTRY() often aligns to 16 bytes, which in this case has no benefits. | ||
387 | */ | ||
388 | .align 8 | ||
389 | GLOBAL(stub_execveat) | ||
390 | call sys_execveat | ||
391 | jmp return_from_execve | ||
392 | END(stub_execveat) | ||
393 | |||
394 | #if defined(CONFIG_X86_X32_ABI) | ||
395 | .align 8 | ||
396 | GLOBAL(stub_x32_execve) | ||
397 | call compat_sys_execve | ||
398 | jmp return_from_execve | ||
399 | END(stub_x32_execve) | ||
400 | .align 8 | ||
401 | GLOBAL(stub_x32_execveat) | ||
402 | call compat_sys_execveat | ||
403 | jmp return_from_execve | ||
404 | END(stub_x32_execveat) | ||
405 | #endif | ||
406 | |||
407 | /* | ||
408 | * sigreturn is special because it needs to restore all registers on return. | ||
409 | * This cannot be done with SYSRET, so use the IRET return path instead. | ||
410 | */ | ||
411 | ENTRY(stub_rt_sigreturn) | ||
412 | /* | 340 | /* |
413 | * SAVE_EXTRA_REGS result is not normally needed: | 341 | * Called from fast path -- disable IRQs again, pop return address |
414 | * sigreturn overwrites all pt_regs->GPREGS. | 342 | * and jump to slow path |
415 | * But sigreturn can fail (!), and there is no easy way to detect that. | ||
416 | * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, | ||
417 | * we SAVE_EXTRA_REGS here. | ||
418 | */ | 343 | */ |
419 | SAVE_EXTRA_REGS 8 | 344 | DISABLE_INTERRUPTS(CLBR_NONE) |
420 | call sys_rt_sigreturn | 345 | TRACE_IRQS_OFF |
421 | return_from_stub: | 346 | popq %rax |
422 | addq $8, %rsp | 347 | jmp entry_SYSCALL64_slow_path |
423 | RESTORE_EXTRA_REGS | ||
424 | movq %rax, RAX(%rsp) | ||
425 | jmp int_ret_from_sys_call | ||
426 | END(stub_rt_sigreturn) | ||
427 | 348 | ||
428 | #ifdef CONFIG_X86_X32_ABI | 349 | 1: |
429 | ENTRY(stub_x32_rt_sigreturn) | 350 | /* Called from C */ |
430 | SAVE_EXTRA_REGS 8 | 351 | jmp *%rax /* called from C */ |
431 | call sys32_x32_rt_sigreturn | 352 | END(stub_ptregs_64) |
432 | jmp return_from_stub | 353 | |
433 | END(stub_x32_rt_sigreturn) | 354 | .macro ptregs_stub func |
434 | #endif | 355 | ENTRY(ptregs_\func) |
356 | leaq \func(%rip), %rax | ||
357 | jmp stub_ptregs_64 | ||
358 | END(ptregs_\func) | ||
359 | .endm | ||
360 | |||
361 | /* Instantiate ptregs_stub for each ptregs-using syscall */ | ||
362 | #define __SYSCALL_64_QUAL_(sym) | ||
363 | #define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym | ||
364 | #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) | ||
365 | #include <asm/syscalls_64.h> | ||
435 | 366 | ||
436 | /* | 367 | /* |
437 | * A newly forked process directly context switches into this address. | 368 | * A newly forked process directly context switches into this address. |
@@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn)
439 | * rdi: prev task we switched from | 370 | * rdi: prev task we switched from |
440 | */ | 371 | */ |
441 | ENTRY(ret_from_fork) | 372 | ENTRY(ret_from_fork) |
442 | |||
443 | LOCK ; btr $TIF_FORK, TI_flags(%r8) | 373 | LOCK ; btr $TIF_FORK, TI_flags(%r8) |
444 | 374 | ||
445 | pushq $0x0002 | 375 | pushq $0x0002 |
@@ -447,28 +377,32 @@ ENTRY(ret_from_fork)
447 | 377 | ||
448 | call schedule_tail /* rdi: 'prev' task parameter */ | 378 | call schedule_tail /* rdi: 'prev' task parameter */ |
449 | 379 | ||
450 | RESTORE_EXTRA_REGS | ||
451 | |||
452 | testb $3, CS(%rsp) /* from kernel_thread? */ | 380 | testb $3, CS(%rsp) /* from kernel_thread? */ |
381 | jnz 1f | ||
453 | 382 | ||
454 | /* | 383 | /* |
455 | * By the time we get here, we have no idea whether our pt_regs, | 384 | * We came from kernel_thread. This code path is quite twisted, and |
456 | * ti flags, and ti status came from the 64-bit SYSCALL fast path, | 385 | * someone should clean it up. |
457 | * the slow path, or one of the 32-bit compat paths. | 386 | * |
458 | * Use IRET code path to return, since it can safely handle | 387 | * copy_thread_tls stashes the function pointer in RBX and the |
459 | * all of the above. | 388 | * parameter to be passed in RBP. The called function is permitted |
389 | * to call do_execve and thereby jump to user mode. | ||
460 | */ | 390 | */ |
461 | jnz int_ret_from_sys_call | 391 | movq RBP(%rsp), %rdi |
392 | call *RBX(%rsp) | ||
393 | movl $0, RAX(%rsp) | ||
462 | 394 | ||
463 | /* | 395 | /* |
464 | * We came from kernel_thread | 396 | * Fall through as though we're exiting a syscall. This makes a |
465 | * nb: we depend on RESTORE_EXTRA_REGS above | 397 | * twisted sort of sense if we just called do_execve. |
466 | */ | 398 | */ |
467 | movq %rbp, %rdi | 399 | |
468 | call *%rbx | 400 | 1: |
469 | movl $0, RAX(%rsp) | 401 | movq %rsp, %rdi |
470 | RESTORE_EXTRA_REGS | 402 | call syscall_return_slowpath /* returns with IRQs disabled */ |
471 | jmp int_ret_from_sys_call | 403 | TRACE_IRQS_ON /* user mode is traced as IRQS on */ |
404 | SWAPGS | ||
405 | jmp restore_regs_and_iret | ||
472 | END(ret_from_fork) | 406 | END(ret_from_fork) |
473 | 407 | ||
474 | /* | 408 | /* |
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 3c990eeee40b..847f2f0c31e5 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -19,12 +19,21 @@
19 | .section .entry.text, "ax" | 19 | .section .entry.text, "ax" |
20 | 20 | ||
21 | /* | 21 | /* |
22 | * 32-bit SYSENTER instruction entry. | 22 | * 32-bit SYSENTER entry. |
23 | * | 23 | * |
24 | * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. | 24 | * 32-bit system calls through the vDSO's __kernel_vsyscall enter here |
25 | * IF and VM in rflags are cleared (IOW: interrupts are off). | 25 | * on 64-bit kernels running on Intel CPUs. |
26 | * | ||
27 | * The SYSENTER instruction, in principle, should *only* occur in the | ||
28 | * vDSO. In practice, a small number of Android devices were shipped | ||
29 | * with a copy of Bionic that inlined a SYSENTER instruction. This | ||
30 | * never happened in any of Google's Bionic versions -- it only happened | ||
31 | * in a narrow range of Intel-provided versions. | ||
32 | * | ||
33 | * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. | ||
34 | * IF and VM in RFLAGS are cleared (IOW: interrupts are off). | ||
26 | * SYSENTER does not save anything on the stack, | 35 | * SYSENTER does not save anything on the stack, |
27 | * and does not save old rip (!!!) and rflags. | 36 | * and does not save old RIP (!!!), RSP, or RFLAGS. |
28 | * | 37 | * |
29 | * Arguments: | 38 | * Arguments: |
30 | * eax system call number | 39 | * eax system call number |
@@ -35,10 +44,6 @@
35 | * edi arg5 | 44 | * edi arg5 |
36 | * ebp user stack | 45 | * ebp user stack |
37 | * 0(%ebp) arg6 | 46 | * 0(%ebp) arg6 |
38 | * | ||
39 | * This is purely a fast path. For anything complicated we use the int 0x80 | ||
40 | * path below. We set up a complete hardware stack frame to share code | ||
41 | * with the int 0x80 path. | ||
42 | */ | 47 | */ |
43 | ENTRY(entry_SYSENTER_compat) | 48 | ENTRY(entry_SYSENTER_compat) |
44 | /* Interrupts are off on entry. */ | 49 | /* Interrupts are off on entry. */ |
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
66 | */ | 71 | */ |
67 | pushfq /* pt_regs->flags (except IF = 0) */ | 72 | pushfq /* pt_regs->flags (except IF = 0) */ |
68 | orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ | 73 | orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ |
69 | ASM_CLAC /* Clear AC after saving FLAGS */ | ||
70 | |||
71 | pushq $__USER32_CS /* pt_regs->cs */ | 74 | pushq $__USER32_CS /* pt_regs->cs */ |
72 | xorq %r8,%r8 | 75 | xorq %r8,%r8 |
73 | pushq %r8 /* pt_regs->ip = 0 (placeholder) */ | 76 | pushq %r8 /* pt_regs->ip = 0 (placeholder) */ |
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
90 | cld | 93 | cld |
91 | 94 | ||
92 | /* | 95 | /* |
93 | * Sysenter doesn't filter flags, so we need to clear NT | 96 | * SYSENTER doesn't filter flags, so we need to clear NT and AC |
94 | * ourselves. To save a few cycles, we can check whether | 97 | * ourselves. To save a few cycles, we can check whether |
95 | * NT was set instead of doing an unconditional popfq. | 98 | * either was set instead of doing an unconditional popfq. |
96 | * This needs to happen before enabling interrupts so that | 99 | * This needs to happen before enabling interrupts so that |
97 | * we don't get preempted with NT set. | 100 | * we don't get preempted with NT set. |
98 | * | 101 | * |
102 | * If TF is set, we will single-step all the way to here -- do_debug | ||
103 | * will ignore all the traps. (Yes, this is slow, but so is | ||
104 | * single-stepping in general. This allows us to avoid having | ||
105 | * a more complicated code to handle the case where a user program | ||
106 | * forces us to single-step through the SYSENTER entry code.) | ||
107 | * | ||
99 | * NB.: .Lsysenter_fix_flags is a label with the code under it moved | 108 | * NB.: .Lsysenter_fix_flags is a label with the code under it moved |
100 | * out-of-line as an optimization: NT is unlikely to be set in the | 109 | * out-of-line as an optimization: NT is unlikely to be set in the |
101 | * majority of the cases and instead of polluting the I$ unnecessarily, | 110 | * majority of the cases and instead of polluting the I$ unnecessarily, |
102 | * we're keeping that code behind a branch which will predict as | 111 | * we're keeping that code behind a branch which will predict as |
103 | * not-taken and therefore its instructions won't be fetched. | 112 | * not-taken and therefore its instructions won't be fetched. |
104 | */ | 113 | */ |
105 | testl $X86_EFLAGS_NT, EFLAGS(%rsp) | 114 | testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) |
106 | jnz .Lsysenter_fix_flags | 115 | jnz .Lsysenter_fix_flags |
107 | .Lsysenter_flags_fixed: | 116 | .Lsysenter_flags_fixed: |
108 | 117 | ||
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
123 | pushq $X86_EFLAGS_FIXED | 132 | pushq $X86_EFLAGS_FIXED |
124 | popfq | 133 | popfq |
125 | jmp .Lsysenter_flags_fixed | 134 | jmp .Lsysenter_flags_fixed |
135 | GLOBAL(__end_entry_SYSENTER_compat) | ||
126 | ENDPROC(entry_SYSENTER_compat) | 136 | ENDPROC(entry_SYSENTER_compat) |
127 | 137 | ||
128 | /* | 138 | /* |
129 | * 32-bit SYSCALL instruction entry. | 139 | * 32-bit SYSCALL entry. |
140 | * | ||
141 | * 32-bit system calls through the vDSO's __kernel_vsyscall enter here | ||
142 | * on 64-bit kernels running on AMD CPUs. | ||
143 | * | ||
144 | * The SYSCALL instruction, in principle, should *only* occur in the | ||
145 | * vDSO. In practice, it appears that this really is the case. | ||
146 | * As evidence: | ||
147 | * | ||
148 | * - The calling convention for SYSCALL has changed several times without | ||
149 | * anyone noticing. | ||
130 | * | 150 | * |
131 | * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, | 151 | * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything |
132 | * then loads new ss, cs, and rip from previously programmed MSRs. | 152 | * user task that did SYSCALL without immediately reloading SS |
133 | * rflags gets masked by a value from another MSR (so CLD and CLAC | 153 | * would randomly crash. |
134 | * are not needed). SYSCALL does not save anything on the stack | ||
135 | * and does not change rsp. | ||
136 | * | 154 | * |
137 | * Note: rflags saving+masking-with-MSR happens only in Long mode | 155 | * - Most programmers do not directly target AMD CPUs, and the 32-bit |
156 | * SYSCALL instruction does not exist on Intel CPUs. Even on AMD | ||
157 | * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels | ||
158 | * because the SYSCALL instruction in legacy/native 32-bit mode (as | ||
159 | * opposed to compat mode) is sufficiently poorly designed as to be | ||
160 | * essentially unusable. | ||
161 | * | ||
162 | * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves | ||
163 | * RFLAGS to R11, then loads new SS, CS, and RIP from previously | ||
164 | * programmed MSRs. RFLAGS gets masked by a value from another MSR | ||
165 | * (so CLD and CLAC are not needed). SYSCALL does not save anything on | ||
166 | * the stack and does not change RSP. | ||
167 | * | ||
168 | * Note: RFLAGS saving+masking-with-MSR happens only in Long mode | ||
138 | * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). | 169 | * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). |
139 | * Don't get confused: rflags saving+masking depends on Long Mode Active bit | 170 | * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit |
140 | * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes | 171 | * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes |
141 | * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). | 172 | * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). |
142 | * | 173 | * |
@@ -236,7 +267,21 @@ sysret32_from_system_call:
236 | END(entry_SYSCALL_compat) | 267 | END(entry_SYSCALL_compat) |
237 | 268 | ||
238 | /* | 269 | /* |
239 | * Emulated IA32 system calls via int 0x80. | 270 | * 32-bit legacy system call entry. |
271 | * | ||
272 | * 32-bit x86 Linux system calls traditionally used the INT $0x80 | ||
273 | * instruction. INT $0x80 lands here. | ||
274 | * | ||
275 | * This entry point can be used by 32-bit and 64-bit programs to perform | ||
276 | * 32-bit system calls. Instances of INT $0x80 can be found inline in | ||
277 | * various programs and libraries. It is also used by the vDSO's | ||
278 | * __kernel_vsyscall fallback for hardware that doesn't support a faster | ||
279 | * entry method. Restarted 32-bit system calls also fall back to INT | ||
280 | * $0x80 regardless of what instruction was originally used to do the | ||
281 | * system call. | ||
282 | * | ||
283 | * This is considered a slow path. It is not used by most libc | ||
284 | * implementations on modern hardware except during process startup. | ||
240 | * | 285 | * |
241 | * Arguments: | 286 | * Arguments: |
242 | * eax system call number | 287 | * eax system call number |
@@ -245,17 +290,8 @@ END(entry_SYSCALL_compat) | |||
245 | * edx arg3 | 290 | * edx arg3 |
246 | * esi arg4 | 291 | * esi arg4 |
247 | * edi arg5 | 292 | * edi arg5 |
248 | * ebp arg6 (note: not saved in the stack frame, should not be touched) | 293 | * ebp arg6 |
249 | * | ||
250 | * Notes: | ||
251 | * Uses the same stack frame as the x86-64 version. | ||
252 | * All registers except eax must be saved (but ptrace may violate that). | ||
253 | * Arguments are zero extended. For system calls that want sign extension and | ||
254 | * take long arguments a wrapper is needed. Most calls can just be called | ||
255 | * directly. | ||
256 | * Assumes it is only called from user space and entered with interrupts off. | ||
257 | */ | 294 | */ |
258 | |||
259 | ENTRY(entry_INT80_compat) | 295 | ENTRY(entry_INT80_compat) |
260 | /* | 296 | /* |
261 | * Interrupts are off on entry. | 297 | * Interrupts are off on entry. |
@@ -300,7 +336,7 @@ ENTRY(entry_INT80_compat) | |||
300 | TRACE_IRQS_OFF | 336 | TRACE_IRQS_OFF |
301 | 337 | ||
302 | movq %rsp, %rdi | 338 | movq %rsp, %rdi |
303 | call do_syscall_32_irqs_off | 339 | call do_int80_syscall_32 |
304 | .Lsyscall_32_done: | 340 | .Lsyscall_32_done: |
305 | 341 | ||
306 | /* Go back to user mode. */ | 342 | /* Go back to user mode. */ |
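The argument registers documented above are straightforward to exercise from userspace. A minimal illustration (not part of the patch), built as a 32-bit program with gcc -m32; it uses the i386 write syscall, number 4, and only the first three argument registers:

    /* eax = syscall number, ebx/ecx/edx = args 1-3, result comes back in eax */
    static long int80_write(int fd, const void *buf, unsigned long len)
    {
            long ret;
            asm volatile ("int $0x80"
                          : "=a" (ret)
                          : "a" (4),                   /* __NR_write on i386 */
                            "b" (fd), "c" (buf), "d" (len)
                          : "memory");
            return ret;
    }

    int main(void)
    {
            int80_write(1, "hello via int $0x80\n", 20);
            return 0;
    }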
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 9a6649857106..8f895ee13a1c 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c | |||
@@ -6,17 +6,11 @@ | |||
6 | #include <asm/asm-offsets.h> | 6 | #include <asm/asm-offsets.h> |
7 | #include <asm/syscall.h> | 7 | #include <asm/syscall.h> |
8 | 8 | ||
9 | #ifdef CONFIG_IA32_EMULATION | 9 | #define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; |
10 | #define SYM(sym, compat) compat | ||
11 | #else | ||
12 | #define SYM(sym, compat) sym | ||
13 | #endif | ||
14 | |||
15 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; | ||
16 | #include <asm/syscalls_32.h> | 10 | #include <asm/syscalls_32.h> |
17 | #undef __SYSCALL_I386 | 11 | #undef __SYSCALL_I386 |
18 | 12 | ||
19 | #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), | 13 | #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, |
20 | 14 | ||
21 | extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); | 15 | extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); |
22 | 16 | ||
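The header is included twice on purpose: the first __SYSCALL_I386 definition turns each generated line into an extern prototype, and the second (the [nr] = sym form above) turns the same lines into designated initializers for the syscall table. A self-contained toy that models the same double-expansion idea, using made-up names rather than kernel symbols:

    #include <stdio.h>

    /* stand-in for the generated asm/syscalls_32.h contents */
    #define SYSCALL_LIST(X) \
            X(0, call_zero) \
            X(1, call_one)

    /* first expansion: prototypes */
    #define DECLARE(nr, sym) static long sym(void);
    SYSCALL_LIST(DECLARE)
    #undef DECLARE

    /* second expansion: the dispatch table */
    #define TABLE_ENTRY(nr, sym) [nr] = sym,
    static long (*const table[])(void) = {
            SYSCALL_LIST(TABLE_ENTRY)
    };
    #undef TABLE_ENTRY

    static long call_zero(void) { return 100; }
    static long call_one(void)  { return 101; }

    int main(void)
    {
            printf("%ld %ld\n", table[0](), table[1]());
            return 0;
    }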
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 41283d22be7a..9dbc5abb6162 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c | |||
@@ -6,19 +6,14 @@ | |||
6 | #include <asm/asm-offsets.h> | 6 | #include <asm/asm-offsets.h> |
7 | #include <asm/syscall.h> | 7 | #include <asm/syscall.h> |
8 | 8 | ||
9 | #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) | 9 | #define __SYSCALL_64_QUAL_(sym) sym |
10 | #define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym | ||
10 | 11 | ||
11 | #ifdef CONFIG_X86_X32_ABI | 12 | #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); |
12 | # define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) | ||
13 | #else | ||
14 | # define __SYSCALL_X32(nr, sym, compat) /* nothing */ | ||
15 | #endif | ||
16 | |||
17 | #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; | ||
18 | #include <asm/syscalls_64.h> | 13 | #include <asm/syscalls_64.h> |
19 | #undef __SYSCALL_64 | 14 | #undef __SYSCALL_64 |
20 | 15 | ||
21 | #define __SYSCALL_64(nr, sym, compat) [nr] = sym, | 16 | #define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), |
22 | 17 | ||
23 | extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); | 18 | extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); |
24 | 19 | ||
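The new qual argument is token-pasted into __SYSCALL_64_QUAL_##qual, so an entry with an empty qualifier resolves to the plain sym, while a ptregs entry resolves to a ptregs_-prefixed symbol (stubs that guarantee a full pt_regs, provided by the 64-bit entry code). A small standalone demonstration of the pasting, using strings instead of kernel symbols:

    #include <stdio.h>

    #define QUAL_(sym)       #sym              /* empty qualifier: name unchanged */
    #define QUAL_ptregs(sym) "ptregs_" #sym    /* ptregs qualifier: add a prefix  */

    #define ENTRY_NAME(sym, qual) QUAL_##qual(sym)

    int main(void)
    {
            puts(ENTRY_NAME(sys_read, ));         /* prints "sys_read"          */
            puts(ENTRY_NAME(sys_execve, ptregs)); /* prints "ptregs_sys_execve" */
            return 0;
    }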
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a50bdc..2e5b565adacc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl | |||
@@ -21,7 +21,7 @@ | |||
21 | 12 common brk sys_brk | 21 | 12 common brk sys_brk |
22 | 13 64 rt_sigaction sys_rt_sigaction | 22 | 13 64 rt_sigaction sys_rt_sigaction |
23 | 14 common rt_sigprocmask sys_rt_sigprocmask | 23 | 14 common rt_sigprocmask sys_rt_sigprocmask |
24 | 15 64 rt_sigreturn stub_rt_sigreturn | 24 | 15 64 rt_sigreturn sys_rt_sigreturn/ptregs |
25 | 16 64 ioctl sys_ioctl | 25 | 16 64 ioctl sys_ioctl |
26 | 17 common pread64 sys_pread64 | 26 | 17 common pread64 sys_pread64 |
27 | 18 common pwrite64 sys_pwrite64 | 27 | 18 common pwrite64 sys_pwrite64 |
@@ -62,10 +62,10 @@ | |||
62 | 53 common socketpair sys_socketpair | 62 | 53 common socketpair sys_socketpair |
63 | 54 64 setsockopt sys_setsockopt | 63 | 54 64 setsockopt sys_setsockopt |
64 | 55 64 getsockopt sys_getsockopt | 64 | 55 64 getsockopt sys_getsockopt |
65 | 56 common clone stub_clone | 65 | 56 common clone sys_clone/ptregs |
66 | 57 common fork stub_fork | 66 | 57 common fork sys_fork/ptregs |
67 | 58 common vfork stub_vfork | 67 | 58 common vfork sys_vfork/ptregs |
68 | 59 64 execve stub_execve | 68 | 59 64 execve sys_execve/ptregs |
69 | 60 common exit sys_exit | 69 | 60 common exit sys_exit |
70 | 61 common wait4 sys_wait4 | 70 | 61 common wait4 sys_wait4 |
71 | 62 common kill sys_kill | 71 | 62 common kill sys_kill |
@@ -178,7 +178,7 @@ | |||
178 | 169 common reboot sys_reboot | 178 | 169 common reboot sys_reboot |
179 | 170 common sethostname sys_sethostname | 179 | 170 common sethostname sys_sethostname |
180 | 171 common setdomainname sys_setdomainname | 180 | 171 common setdomainname sys_setdomainname |
181 | 172 common iopl sys_iopl | 181 | 172 common iopl sys_iopl/ptregs |
182 | 173 common ioperm sys_ioperm | 182 | 173 common ioperm sys_ioperm |
183 | 174 64 create_module | 183 | 174 64 create_module |
184 | 175 common init_module sys_init_module | 184 | 175 common init_module sys_init_module |
@@ -328,7 +328,7 @@ | |||
328 | 319 common memfd_create sys_memfd_create | 328 | 319 common memfd_create sys_memfd_create |
329 | 320 common kexec_file_load sys_kexec_file_load | 329 | 320 common kexec_file_load sys_kexec_file_load |
330 | 321 common bpf sys_bpf | 330 | 321 common bpf sys_bpf |
331 | 322 64 execveat stub_execveat | 331 | 322 64 execveat sys_execveat/ptregs |
332 | 323 common userfaultfd sys_userfaultfd | 332 | 323 common userfaultfd sys_userfaultfd |
333 | 324 common membarrier sys_membarrier | 333 | 324 common membarrier sys_membarrier |
334 | 325 common mlock2 sys_mlock2 | 334 | 325 common mlock2 sys_mlock2 |
@@ -339,14 +339,14 @@ | |||
339 | # for native 64-bit operation. | 339 | # for native 64-bit operation. |
340 | # | 340 | # |
341 | 512 x32 rt_sigaction compat_sys_rt_sigaction | 341 | 512 x32 rt_sigaction compat_sys_rt_sigaction |
342 | 513 x32 rt_sigreturn stub_x32_rt_sigreturn | 342 | 513 x32 rt_sigreturn sys32_x32_rt_sigreturn |
343 | 514 x32 ioctl compat_sys_ioctl | 343 | 514 x32 ioctl compat_sys_ioctl |
344 | 515 x32 readv compat_sys_readv | 344 | 515 x32 readv compat_sys_readv |
345 | 516 x32 writev compat_sys_writev | 345 | 516 x32 writev compat_sys_writev |
346 | 517 x32 recvfrom compat_sys_recvfrom | 346 | 517 x32 recvfrom compat_sys_recvfrom |
347 | 518 x32 sendmsg compat_sys_sendmsg | 347 | 518 x32 sendmsg compat_sys_sendmsg |
348 | 519 x32 recvmsg compat_sys_recvmsg | 348 | 519 x32 recvmsg compat_sys_recvmsg |
349 | 520 x32 execve stub_x32_execve | 349 | 520 x32 execve compat_sys_execve/ptregs |
350 | 521 x32 ptrace compat_sys_ptrace | 350 | 521 x32 ptrace compat_sys_ptrace |
351 | 522 x32 rt_sigpending compat_sys_rt_sigpending | 351 | 522 x32 rt_sigpending compat_sys_rt_sigpending |
352 | 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait | 352 | 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait |
@@ -371,4 +371,4 @@ | |||
371 | 542 x32 getsockopt compat_sys_getsockopt | 371 | 542 x32 getsockopt compat_sys_getsockopt |
372 | 543 x32 io_setup compat_sys_io_setup | 372 | 543 x32 io_setup compat_sys_io_setup |
373 | 544 x32 io_submit compat_sys_io_submit | 373 | 544 x32 io_submit compat_sys_io_submit |
374 | 545 x32 execveat stub_x32_execveat | 374 | 545 x32 execveat compat_sys_execveat/ptregs |
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec071e7..cd3d3015d7df 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh | |||
@@ -3,13 +3,63 @@ | |||
3 | in="$1" | 3 | in="$1" |
4 | out="$2" | 4 | out="$2" |
5 | 5 | ||
6 | syscall_macro() { | ||
7 | abi="$1" | ||
8 | nr="$2" | ||
9 | entry="$3" | ||
10 | |||
11 | # Entry can be either just a function name or "function/qualifier" | ||
12 | real_entry="${entry%%/*}" | ||
13 | qualifier="${entry:${#real_entry}}" # Strip the function name | ||
14 | qualifier="${qualifier:1}" # Strip the slash, if any | ||
15 | |||
16 | echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" | ||
17 | } | ||
18 | |||
19 | emit() { | ||
20 | abi="$1" | ||
21 | nr="$2" | ||
22 | entry="$3" | ||
23 | compat="$4" | ||
24 | |||
25 | if [ "$abi" == "64" -a -n "$compat" ]; then | ||
26 | echo "a compat entry for a 64-bit syscall makes no sense" >&2 | ||
27 | exit 1 | ||
28 | fi | ||
29 | |||
30 | if [ -z "$compat" ]; then | ||
31 | if [ -n "$entry" ]; then | ||
32 | syscall_macro "$abi" "$nr" "$entry" | ||
33 | fi | ||
34 | else | ||
35 | echo "#ifdef CONFIG_X86_32" | ||
36 | if [ -n "$entry" ]; then | ||
37 | syscall_macro "$abi" "$nr" "$entry" | ||
38 | fi | ||
39 | echo "#else" | ||
40 | syscall_macro "$abi" "$nr" "$compat" | ||
41 | echo "#endif" | ||
42 | fi | ||
43 | } | ||
44 | |||
6 | grep '^[0-9]' "$in" | sort -n | ( | 45 | grep '^[0-9]' "$in" | sort -n | ( |
7 | while read nr abi name entry compat; do | 46 | while read nr abi name entry compat; do |
8 | abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` | 47 | abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` |
9 | if [ -n "$compat" ]; then | 48 | if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then |
10 | echo "__SYSCALL_${abi}($nr, $entry, $compat)" | 49 | # COMMON is the same as 64, except that we don't expect X32 |
11 | elif [ -n "$entry" ]; then | 50 | # programs to use it. Our expectation has nothing to do with |
12 | echo "__SYSCALL_${abi}($nr, $entry, $entry)" | 51 | # any generated code, so treat them the same. |
52 | emit 64 "$nr" "$entry" "$compat" | ||
53 | elif [ "$abi" == "X32" ]; then | ||
54 | # X32 is equivalent to 64 on an X32-compatible kernel. | ||
55 | echo "#ifdef CONFIG_X86_X32_ABI" | ||
56 | emit 64 "$nr" "$entry" "$compat" | ||
57 | echo "#endif" | ||
58 | elif [ "$abi" == "I386" ]; then | ||
59 | emit "$abi" "$nr" "$entry" "$compat" | ||
60 | else | ||
61 | echo "Unknown abi $abi" >&2 | ||
62 | exit 1 | ||
13 | fi | 63 | fi |
14 | done | 64 | done |
15 | ) > "$out" | 65 | ) > "$out" |
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 3f69326ed545..63a03bb91497 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h | |||
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, | |||
150 | } | 150 | } |
151 | fprintf(outfile, "\n};\n\n"); | 151 | fprintf(outfile, "\n};\n\n"); |
152 | 152 | ||
153 | fprintf(outfile, "static struct page *pages[%lu];\n\n", | ||
154 | mapping_size / 4096); | ||
155 | |||
156 | fprintf(outfile, "const struct vdso_image %s = {\n", name); | 153 | fprintf(outfile, "const struct vdso_image %s = {\n", name); |
157 | fprintf(outfile, "\t.data = raw_data,\n"); | 154 | fprintf(outfile, "\t.data = raw_data,\n"); |
158 | fprintf(outfile, "\t.size = %lu,\n", mapping_size); | 155 | fprintf(outfile, "\t.size = %lu,\n", mapping_size); |
159 | fprintf(outfile, "\t.text_mapping = {\n"); | ||
160 | fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); | ||
161 | fprintf(outfile, "\t\t.pages = pages,\n"); | ||
162 | fprintf(outfile, "\t},\n"); | ||
163 | if (alt_sec) { | 156 | if (alt_sec) { |
164 | fprintf(outfile, "\t.alt = %lu,\n", | 157 | fprintf(outfile, "\t.alt = %lu,\n", |
165 | (unsigned long)GET_LE(&alt_sec->sh_offset)); | 158 | (unsigned long)GET_LE(&alt_sec->sh_offset)); |
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 08a317a9ae4b..7853b53959cd 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c | |||
@@ -11,7 +11,6 @@ | |||
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/mm_types.h> | 12 | #include <linux/mm_types.h> |
13 | 13 | ||
14 | #include <asm/cpufeature.h> | ||
15 | #include <asm/processor.h> | 14 | #include <asm/processor.h> |
16 | #include <asm/vdso.h> | 15 | #include <asm/vdso.h> |
17 | 16 | ||
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 3a1d9297074b..0109ac6cb79c 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S | |||
@@ -3,7 +3,7 @@ | |||
3 | */ | 3 | */ |
4 | 4 | ||
5 | #include <asm/dwarf2.h> | 5 | #include <asm/dwarf2.h> |
6 | #include <asm/cpufeature.h> | 6 | #include <asm/cpufeatures.h> |
7 | #include <asm/alternative-asm.h> | 7 | #include <asm/alternative-asm.h> |
8 | 8 | ||
9 | /* | 9 | /* |
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f69e264ac4..10f704584922 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <asm/page.h> | 20 | #include <asm/page.h> |
21 | #include <asm/hpet.h> | 21 | #include <asm/hpet.h> |
22 | #include <asm/desc.h> | 22 | #include <asm/desc.h> |
23 | #include <asm/cpufeature.h> | ||
23 | 24 | ||
24 | #if defined(CONFIG_X86_64) | 25 | #if defined(CONFIG_X86_64) |
25 | unsigned int __read_mostly vdso64_enabled = 1; | 26 | unsigned int __read_mostly vdso64_enabled = 1; |
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1; | |||
27 | 28 | ||
28 | void __init init_vdso_image(const struct vdso_image *image) | 29 | void __init init_vdso_image(const struct vdso_image *image) |
29 | { | 30 | { |
30 | int i; | ||
31 | int npages = (image->size) / PAGE_SIZE; | ||
32 | |||
33 | BUG_ON(image->size % PAGE_SIZE != 0); | 31 | BUG_ON(image->size % PAGE_SIZE != 0); |
34 | for (i = 0; i < npages; i++) | ||
35 | image->text_mapping.pages[i] = | ||
36 | virt_to_page(image->data + i*PAGE_SIZE); | ||
37 | 32 | ||
38 | apply_alternatives((struct alt_instr *)(image->data + image->alt), | 33 | apply_alternatives((struct alt_instr *)(image->data + image->alt), |
39 | (struct alt_instr *)(image->data + image->alt + | 34 | (struct alt_instr *)(image->data + image->alt + |
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) | |||
90 | #endif | 85 | #endif |
91 | } | 86 | } |
92 | 87 | ||
88 | static int vdso_fault(const struct vm_special_mapping *sm, | ||
89 | struct vm_area_struct *vma, struct vm_fault *vmf) | ||
90 | { | ||
91 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | ||
92 | |||
93 | if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) | ||
94 | return VM_FAULT_SIGBUS; | ||
95 | |||
96 | vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); | ||
97 | get_page(vmf->page); | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | static const struct vm_special_mapping text_mapping = { | ||
102 | .name = "[vdso]", | ||
103 | .fault = vdso_fault, | ||
104 | }; | ||
105 | |||
106 | static int vvar_fault(const struct vm_special_mapping *sm, | ||
107 | struct vm_area_struct *vma, struct vm_fault *vmf) | ||
108 | { | ||
109 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | ||
110 | long sym_offset; | ||
111 | int ret = -EFAULT; | ||
112 | |||
113 | if (!image) | ||
114 | return VM_FAULT_SIGBUS; | ||
115 | |||
116 | sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + | ||
117 | image->sym_vvar_start; | ||
118 | |||
119 | /* | ||
120 | * Sanity check: a symbol offset of zero means that the page | ||
121 | * does not exist for this vdso image, not that the page is at | ||
122 | * offset zero relative to the text mapping. This should be | ||
123 | * impossible here, because sym_offset should only be zero for | ||
124 | * the page past the end of the vvar mapping. | ||
125 | */ | ||
126 | if (sym_offset == 0) | ||
127 | return VM_FAULT_SIGBUS; | ||
128 | |||
129 | if (sym_offset == image->sym_vvar_page) { | ||
130 | ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, | ||
131 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); | ||
132 | } else if (sym_offset == image->sym_hpet_page) { | ||
133 | #ifdef CONFIG_HPET_TIMER | ||
134 | if (hpet_address && vclock_was_used(VCLOCK_HPET)) { | ||
135 | ret = vm_insert_pfn_prot( | ||
136 | vma, | ||
137 | (unsigned long)vmf->virtual_address, | ||
138 | hpet_address >> PAGE_SHIFT, | ||
139 | pgprot_noncached(PAGE_READONLY)); | ||
140 | } | ||
141 | #endif | ||
142 | } else if (sym_offset == image->sym_pvclock_page) { | ||
143 | struct pvclock_vsyscall_time_info *pvti = | ||
144 | pvclock_pvti_cpu0_va(); | ||
145 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { | ||
146 | ret = vm_insert_pfn( | ||
147 | vma, | ||
148 | (unsigned long)vmf->virtual_address, | ||
149 | __pa(pvti) >> PAGE_SHIFT); | ||
150 | } | ||
151 | } | ||
152 | |||
153 | if (ret == 0 || ret == -EBUSY) | ||
154 | return VM_FAULT_NOPAGE; | ||
155 | |||
156 | return VM_FAULT_SIGBUS; | ||
157 | } | ||
158 | |||
93 | static int map_vdso(const struct vdso_image *image, bool calculate_addr) | 159 | static int map_vdso(const struct vdso_image *image, bool calculate_addr) |
94 | { | 160 | { |
95 | struct mm_struct *mm = current->mm; | 161 | struct mm_struct *mm = current->mm; |
96 | struct vm_area_struct *vma; | 162 | struct vm_area_struct *vma; |
97 | unsigned long addr, text_start; | 163 | unsigned long addr, text_start; |
98 | int ret = 0; | 164 | int ret = 0; |
99 | static struct page *no_pages[] = {NULL}; | 165 | static const struct vm_special_mapping vvar_mapping = { |
100 | static struct vm_special_mapping vvar_mapping = { | ||
101 | .name = "[vvar]", | 166 | .name = "[vvar]", |
102 | .pages = no_pages, | 167 | .fault = vvar_fault, |
103 | }; | 168 | }; |
104 | struct pvclock_vsyscall_time_info *pvti; | ||
105 | 169 | ||
106 | if (calculate_addr) { | 170 | if (calculate_addr) { |
107 | addr = vdso_addr(current->mm->start_stack, | 171 | addr = vdso_addr(current->mm->start_stack, |
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) | |||
121 | 185 | ||
122 | text_start = addr - image->sym_vvar_start; | 186 | text_start = addr - image->sym_vvar_start; |
123 | current->mm->context.vdso = (void __user *)text_start; | 187 | current->mm->context.vdso = (void __user *)text_start; |
188 | current->mm->context.vdso_image = image; | ||
124 | 189 | ||
125 | /* | 190 | /* |
126 | * MAYWRITE to allow gdb to COW and set breakpoints | 191 | * MAYWRITE to allow gdb to COW and set breakpoints |
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) | |||
130 | image->size, | 195 | image->size, |
131 | VM_READ|VM_EXEC| | 196 | VM_READ|VM_EXEC| |
132 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, | 197 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, |
133 | &image->text_mapping); | 198 | &text_mapping); |
134 | 199 | ||
135 | if (IS_ERR(vma)) { | 200 | if (IS_ERR(vma)) { |
136 | ret = PTR_ERR(vma); | 201 | ret = PTR_ERR(vma); |
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) | |||
140 | vma = _install_special_mapping(mm, | 205 | vma = _install_special_mapping(mm, |
141 | addr, | 206 | addr, |
142 | -image->sym_vvar_start, | 207 | -image->sym_vvar_start, |
143 | VM_READ|VM_MAYREAD, | 208 | VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| |
209 | VM_PFNMAP, | ||
144 | &vvar_mapping); | 210 | &vvar_mapping); |
145 | 211 | ||
146 | if (IS_ERR(vma)) { | 212 | if (IS_ERR(vma)) { |
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) | |||
148 | goto up_fail; | 214 | goto up_fail; |
149 | } | 215 | } |
150 | 216 | ||
151 | if (image->sym_vvar_page) | ||
152 | ret = remap_pfn_range(vma, | ||
153 | text_start + image->sym_vvar_page, | ||
154 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT, | ||
155 | PAGE_SIZE, | ||
156 | PAGE_READONLY); | ||
157 | |||
158 | if (ret) | ||
159 | goto up_fail; | ||
160 | |||
161 | #ifdef CONFIG_HPET_TIMER | ||
162 | if (hpet_address && image->sym_hpet_page) { | ||
163 | ret = io_remap_pfn_range(vma, | ||
164 | text_start + image->sym_hpet_page, | ||
165 | hpet_address >> PAGE_SHIFT, | ||
166 | PAGE_SIZE, | ||
167 | pgprot_noncached(PAGE_READONLY)); | ||
168 | |||
169 | if (ret) | ||
170 | goto up_fail; | ||
171 | } | ||
172 | #endif | ||
173 | |||
174 | pvti = pvclock_pvti_cpu0_va(); | ||
175 | if (pvti && image->sym_pvclock_page) { | ||
176 | ret = remap_pfn_range(vma, | ||
177 | text_start + image->sym_pvclock_page, | ||
178 | __pa(pvti) >> PAGE_SHIFT, | ||
179 | PAGE_SIZE, | ||
180 | PAGE_READONLY); | ||
181 | |||
182 | if (ret) | ||
183 | goto up_fail; | ||
184 | } | ||
185 | |||
186 | up_fail: | 217 | up_fail: |
187 | if (ret) | 218 | if (ret) |
188 | current->mm->context.vdso = NULL; | 219 | current->mm->context.vdso = NULL; |
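The remap_pfn_range()/io_remap_pfn_range() calls deleted above are exactly what the new vdso_fault()/vvar_fault() handlers replace: vvar, HPET and pvclock pages are now inserted lazily at fault time. To make the offset arithmetic in vvar_fault() concrete, here is a standalone check using a hypothetical layout in which three vvar pages sit immediately below the vDSO text (the real offsets come from the image's sym_* fields):

    #include <assert.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1L << PAGE_SHIFT)

    int main(void)
    {
            /* hypothetical image layout, text-relative, vvar area in front */
            long sym_vvar_start = -3 * PAGE_SIZE;  /* start of the [vvar] VMA */
            long sym_vvar_page  = -3 * PAGE_SIZE;  /* first page: vvar data   */
            long sym_hpet_page  = -2 * PAGE_SIZE;  /* second page: HPET       */

            /* a fault at page offset 1 inside the [vvar] VMA ... */
            unsigned long pgoff = 1;
            long sym_offset = (long)(pgoff << PAGE_SHIFT) + sym_vvar_start;

            /* ... lands on the HPET page in this layout */
            assert(sym_offset == sym_hpet_page);
            assert(sym_offset != sym_vvar_page);
            return 0;
    }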
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg) | |||
254 | #ifdef CONFIG_NUMA | 285 | #ifdef CONFIG_NUMA |
255 | node = cpu_to_node(cpu); | 286 | node = cpu_to_node(cpu); |
256 | #endif | 287 | #endif |
257 | if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) | 288 | if (static_cpu_has(X86_FEATURE_RDTSCP)) |
258 | write_rdtscp_aux((node << 12) | cpu); | 289 | write_rdtscp_aux((node << 12) | cpu); |
259 | 290 | ||
260 | /* | 291 | /* |
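vvar_fault() above only maps the HPET and pvclock pages when their vclock was ever enabled, via vclock_was_used(). That helper is not part of this diff; given the bit-per-vclock encoding implied by the BUILD_BUG_ON(VCLOCK_MAX >= 32) below, it is presumably a simple bit test against the vclocks_used mask, along these lines (sketch of a kernel-internal helper, using the kernel's READ_ONCE()):

    /* sketch only -- the real definition lives in a header outside this diff */
    static inline bool vclock_was_used(int vclock)
    {
            return READ_ONCE(vclocks_used) & (1 << vclock);
    }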
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e330416995..0fb3a104ac62 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <asm/vgtod.h> | 16 | #include <asm/vgtod.h> |
17 | #include <asm/vvar.h> | 17 | #include <asm/vvar.h> |
18 | 18 | ||
19 | int vclocks_used __read_mostly; | ||
20 | |||
19 | DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); | 21 | DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); |
20 | 22 | ||
21 | void update_vsyscall_tz(void) | 23 | void update_vsyscall_tz(void) |
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void) | |||
26 | 28 | ||
27 | void update_vsyscall(struct timekeeper *tk) | 29 | void update_vsyscall(struct timekeeper *tk) |
28 | { | 30 | { |
31 | int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; | ||
29 | struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; | 32 | struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; |
30 | 33 | ||
34 | /* Mark the new vclock used. */ | ||
35 | BUILD_BUG_ON(VCLOCK_MAX >= 32); | ||
36 | WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); | ||
37 | |||
31 | gtod_write_begin(vdata); | 38 | gtod_write_begin(vdata); |
32 | 39 | ||
33 | /* copy vsyscall data */ | 40 | /* copy vsyscall data */ |
34 | vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; | 41 | vdata->vclock_mode = vclock_mode; |
35 | vdata->cycle_last = tk->tkr_mono.cycle_last; | 42 | vdata->cycle_last = tk->tkr_mono.cycle_last; |
36 | vdata->mask = tk->tkr_mono.mask; | 43 | vdata->mask = tk->tkr_mono.mask; |
37 | vdata->mult = tk->tkr_mono.mult; | 44 | vdata->mult = tk->tkr_mono.mult; |