Diffstat (limited to 'arch/x86/kernel')
37 files changed, 899 insertions, 827 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..c887cd944f0c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o | |||
32 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | 32 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o |
33 | obj-$(CONFIG_X86_64) += mcount_64.o | 33 | obj-$(CONFIG_X86_64) += mcount_64.o |
34 | obj-y += syscall_$(BITS).o vsyscall_gtod.o | 34 | obj-y += syscall_$(BITS).o vsyscall_gtod.o |
35 | obj-$(CONFIG_IA32_EMULATION) += syscall_32.o | ||
35 | obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o | 36 | obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o |
36 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o | 37 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o |
37 | obj-$(CONFIG_SYSFS) += ksysfs.o | 38 | obj-$(CONFIG_SYSFS) += ksysfs.o |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..aef653193160 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str) | |||
52 | __setup("noreplace-paravirt", setup_noreplace_paravirt); | 52 | __setup("noreplace-paravirt", setup_noreplace_paravirt); |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | #define DPRINTK(fmt, ...) \ | 55 | #define DPRINTK(fmt, args...) \ |
56 | do { \ | 56 | do { \ |
57 | if (debug_alternative) \ | 57 | if (debug_alternative) \ |
58 | printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ | 58 | printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ |
59 | } while (0) | ||
60 | |||
61 | #define DUMP_BYTES(buf, len, fmt, args...) \ | ||
62 | do { \ | ||
63 | if (unlikely(debug_alternative)) { \ | ||
64 | int j; \ | ||
65 | \ | ||
66 | if (!(len)) \ | ||
67 | break; \ | ||
68 | \ | ||
69 | printk(KERN_DEBUG fmt, ##args); \ | ||
70 | for (j = 0; j < (len) - 1; j++) \ | ||
71 | printk(KERN_CONT "%02hhx ", buf[j]); \ | ||
72 | printk(KERN_CONT "%02hhx\n", buf[j]); \ | ||
73 | } \ | ||
59 | } while (0) | 74 | } while (0) |
60 | 75 | ||
61 | /* | 76 | /* |
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | |||
243 | extern s32 __smp_locks[], __smp_locks_end[]; | 258 | extern s32 __smp_locks[], __smp_locks_end[]; |
244 | void *text_poke_early(void *addr, const void *opcode, size_t len); | 259 | void *text_poke_early(void *addr, const void *opcode, size_t len); |
245 | 260 | ||
246 | /* Replace instructions with better alternatives for this CPU type. | 261 | /* |
247 | This runs before SMP is initialized to avoid SMP problems with | 262 | * Are we looking at a near JMP with a 1 or 4-byte displacement. |
248 | self modifying code. This implies that asymmetric systems where | 263 | */ |
249 | APs have less capabilities than the boot processor are not handled. | 264 | static inline bool is_jmp(const u8 opcode) |
250 | Tough. Make sure you disable such features by hand. */ | 265 | { |
266 | return opcode == 0xeb || opcode == 0xe9; | ||
267 | } | ||
268 | |||
269 | static void __init_or_module | ||
270 | recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) | ||
271 | { | ||
272 | u8 *next_rip, *tgt_rip; | ||
273 | s32 n_dspl, o_dspl; | ||
274 | int repl_len; | ||
275 | |||
276 | if (a->replacementlen != 5) | ||
277 | return; | ||
278 | |||
279 | o_dspl = *(s32 *)(insnbuf + 1); | ||
280 | |||
281 | /* next_rip of the replacement JMP */ | ||
282 | next_rip = repl_insn + a->replacementlen; | ||
283 | /* target rip of the replacement JMP */ | ||
284 | tgt_rip = next_rip + o_dspl; | ||
285 | n_dspl = tgt_rip - orig_insn; | ||
286 | |||
287 | DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); | ||
288 | |||
289 | if (tgt_rip - orig_insn >= 0) { | ||
290 | if (n_dspl - 2 <= 127) | ||
291 | goto two_byte_jmp; | ||
292 | else | ||
293 | goto five_byte_jmp; | ||
294 | /* negative offset */ | ||
295 | } else { | ||
296 | if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) | ||
297 | goto two_byte_jmp; | ||
298 | else | ||
299 | goto five_byte_jmp; | ||
300 | } | ||
301 | |||
302 | two_byte_jmp: | ||
303 | n_dspl -= 2; | ||
304 | |||
305 | insnbuf[0] = 0xeb; | ||
306 | insnbuf[1] = (s8)n_dspl; | ||
307 | add_nops(insnbuf + 2, 3); | ||
308 | |||
309 | repl_len = 2; | ||
310 | goto done; | ||
311 | |||
312 | five_byte_jmp: | ||
313 | n_dspl -= 5; | ||
314 | |||
315 | insnbuf[0] = 0xe9; | ||
316 | *(s32 *)&insnbuf[1] = n_dspl; | ||
251 | 317 | ||
318 | repl_len = 5; | ||
319 | |||
320 | done: | ||
321 | |||
322 | DPRINTK("final displ: 0x%08x, JMP 0x%lx", | ||
323 | n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); | ||
324 | } | ||
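For illustration, a minimal userspace sketch of the displacement arithmetic recompute_jump() performs above; the addresses and the rel32 value are invented, and the short/near decision is reduced to a plain signed-byte range check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* invented addresses, for illustration only */
        unsigned long orig_insn = 0x1000;       /* patch site in .text */
        unsigned long repl_insn = 0x2000;       /* 5-byte JMP in .altinstr_replacement */
        int32_t o_dspl = 0x80;                  /* rel32 encoded in that JMP */

        /* where the replacement JMP really points */
        unsigned long next_rip = repl_insn + 5;
        unsigned long tgt_rip  = next_rip + o_dspl;

        /* the same target, expressed relative to the patch site */
        int32_t n_dspl = (int32_t)(tgt_rip - orig_insn);

        if (n_dspl - 2 >= -128 && n_dspl - 2 <= 127)
                printf("2-byte JMP: eb %02x\n", (uint8_t)(n_dspl - 2));
        else
                printf("5-byte JMP: e9 %08x\n", (uint32_t)(n_dspl - 5));

        return 0;
}

The 2-byte form subtracts 2 and the 5-byte form subtracts 5 because x86 JMP displacements are relative to the end of the JMP instruction itself.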
325 | |||
326 | static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) | ||
327 | { | ||
328 | if (instr[0] != 0x90) | ||
329 | return; | ||
330 | |||
331 | add_nops(instr + (a->instrlen - a->padlen), a->padlen); | ||
332 | |||
333 | DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", | ||
334 | instr, a->instrlen - a->padlen, a->padlen); | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Replace instructions with better alternatives for this CPU type. This runs | ||
339 | * before SMP is initialized to avoid SMP problems with self modifying code. | ||
340 | * This implies that asymmetric systems where APs have less capabilities than | ||
341 | * the boot processor are not handled. Tough. Make sure you disable such | ||
342 | * features by hand. | ||
343 | */ | ||
252 | void __init_or_module apply_alternatives(struct alt_instr *start, | 344 | void __init_or_module apply_alternatives(struct alt_instr *start, |
253 | struct alt_instr *end) | 345 | struct alt_instr *end) |
254 | { | 346 | { |
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
256 | u8 *instr, *replacement; | 348 | u8 *instr, *replacement; |
257 | u8 insnbuf[MAX_PATCH_LEN]; | 349 | u8 insnbuf[MAX_PATCH_LEN]; |
258 | 350 | ||
259 | DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); | 351 | DPRINTK("alt table %p -> %p", start, end); |
260 | /* | 352 | /* |
261 | * The scan order should be from start to end. A later scanned | 353 | * The scan order should be from start to end. A later scanned |
262 | * alternative code can overwrite a previous scanned alternative code. | 354 | * alternative code can overwrite previously scanned alternative code. |
263 | * Some kernel functions (e.g. memcpy, memset, etc) use this order to | 355 | * Some kernel functions (e.g. memcpy, memset, etc) use this order to |
264 | * patch code. | 356 | * patch code. |
265 | * | 357 | * |
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
267 | * order. | 359 | * order. |
268 | */ | 360 | */ |
269 | for (a = start; a < end; a++) { | 361 | for (a = start; a < end; a++) { |
362 | int insnbuf_sz = 0; | ||
363 | |||
270 | instr = (u8 *)&a->instr_offset + a->instr_offset; | 364 | instr = (u8 *)&a->instr_offset + a->instr_offset; |
271 | replacement = (u8 *)&a->repl_offset + a->repl_offset; | 365 | replacement = (u8 *)&a->repl_offset + a->repl_offset; |
272 | BUG_ON(a->replacementlen > a->instrlen); | ||
273 | BUG_ON(a->instrlen > sizeof(insnbuf)); | 366 | BUG_ON(a->instrlen > sizeof(insnbuf)); |
274 | BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); | 367 | BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); |
275 | if (!boot_cpu_has(a->cpuid)) | 368 | if (!boot_cpu_has(a->cpuid)) { |
369 | if (a->padlen > 1) | ||
370 | optimize_nops(a, instr); | ||
371 | |||
276 | continue; | 372 | continue; |
373 | } | ||
374 | |||
375 | DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", | ||
376 | a->cpuid >> 5, | ||
377 | a->cpuid & 0x1f, | ||
378 | instr, a->instrlen, | ||
379 | replacement, a->replacementlen, a->padlen); | ||
380 | |||
381 | DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); | ||
382 | DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); | ||
277 | 383 | ||
278 | memcpy(insnbuf, replacement, a->replacementlen); | 384 | memcpy(insnbuf, replacement, a->replacementlen); |
385 | insnbuf_sz = a->replacementlen; | ||
279 | 386 | ||
280 | /* 0xe8 is a relative jump; fix the offset. */ | 387 | /* 0xe8 is a relative jump; fix the offset. */ |
281 | if (*insnbuf == 0xe8 && a->replacementlen == 5) | 388 | if (*insnbuf == 0xe8 && a->replacementlen == 5) { |
282 | *(s32 *)(insnbuf + 1) += replacement - instr; | 389 | *(s32 *)(insnbuf + 1) += replacement - instr; |
390 | DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", | ||
391 | *(s32 *)(insnbuf + 1), | ||
392 | (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); | ||
393 | } | ||
394 | |||
395 | if (a->replacementlen && is_jmp(replacement[0])) | ||
396 | recompute_jump(a, instr, replacement, insnbuf); | ||
283 | 397 | ||
284 | add_nops(insnbuf + a->replacementlen, | 398 | if (a->instrlen > a->replacementlen) { |
285 | a->instrlen - a->replacementlen); | 399 | add_nops(insnbuf + a->replacementlen, |
400 | a->instrlen - a->replacementlen); | ||
401 | insnbuf_sz += a->instrlen - a->replacementlen; | ||
402 | } | ||
403 | DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); | ||
286 | 404 | ||
287 | text_poke_early(instr, insnbuf, a->instrlen); | 405 | text_poke_early(instr, insnbuf, insnbuf_sz); |
288 | } | 406 | } |
289 | } | 407 | } |
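A stand-alone sketch of how the patch buffer is now assembled: copy the replacement, pad the rest of the original slot with NOPs, and patch only insnbuf_sz bytes. add_nops() here is a trivial single-byte 0x90 filler, not the kernel's optimized multi-byte NOP generator, and the example bytes are made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void add_nops(uint8_t *buf, size_t len)
{
        memset(buf, 0x90, len);         /* single-byte NOPs only, for the demo */
}

int main(void)
{
        const uint8_t replacement[] = { 0x0f, 0x18, 0x08 };     /* a 3-byte instruction */
        size_t instrlen = 5;                                    /* size of the original site */
        size_t replacementlen = sizeof(replacement);
        uint8_t insnbuf[16];
        size_t insnbuf_sz = replacementlen;

        memcpy(insnbuf, replacement, replacementlen);
        if (instrlen > replacementlen) {
                add_nops(insnbuf + replacementlen, instrlen - replacementlen);
                insnbuf_sz += instrlen - replacementlen;
        }

        for (size_t i = 0; i < insnbuf_sz; i++)
                printf("%02x ", insnbuf[i]);    /* prints: 0f 18 08 90 90 */
        printf("\n");
        return 0;
}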
290 | 408 | ||
291 | #ifdef CONFIG_SMP | 409 | #ifdef CONFIG_SMP |
292 | |||
293 | static void alternatives_smp_lock(const s32 *start, const s32 *end, | 410 | static void alternatives_smp_lock(const s32 *start, const s32 *end, |
294 | u8 *text, u8 *text_end) | 411 | u8 *text, u8 *text_end) |
295 | { | 412 | { |
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, | |||
371 | smp->locks_end = locks_end; | 488 | smp->locks_end = locks_end; |
372 | smp->text = text; | 489 | smp->text = text; |
373 | smp->text_end = text_end; | 490 | smp->text_end = text_end; |
374 | DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", | 491 | DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", |
375 | __func__, smp->locks, smp->locks_end, | 492 | smp->locks, smp->locks_end, |
376 | smp->text, smp->text_end, smp->name); | 493 | smp->text, smp->text_end, smp->name); |
377 | 494 | ||
378 | list_add_tail(&smp->next, &smp_alt_modules); | 495 | list_add_tail(&smp->next, &smp_alt_modules); |
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end) | |||
440 | 557 | ||
441 | return 0; | 558 | return 0; |
442 | } | 559 | } |
443 | #endif | 560 | #endif /* CONFIG_SMP */ |
444 | 561 | ||
445 | #ifdef CONFIG_PARAVIRT | 562 | #ifdef CONFIG_PARAVIRT |
446 | void __init_or_module apply_paravirt(struct paravirt_patch_site *start, | 563 | void __init_or_module apply_paravirt(struct paravirt_patch_site *start, |
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs) | |||
601 | if (likely(!bp_patching_in_progress)) | 718 | if (likely(!bp_patching_in_progress)) |
602 | return 0; | 719 | return 0; |
603 | 720 | ||
604 | if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) | 721 | if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) |
605 | return 0; | 722 | return 0; |
606 | 723 | ||
607 | /* set up the specified breakpoint handler */ | 724 | /* set up the specified breakpoint handler */ |
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void) | |||
68 | 68 | ||
69 | /* Offset from the sysenter stack to tss.sp0 */ | 69 | /* Offset from the sysenter stack to tss.sp0 */ |
70 | DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - | 70 | DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - |
71 | sizeof(struct tss_struct)); | 71 | offsetofend(struct tss_struct, SYSENTER_stack)); |
72 | 72 | ||
73 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) | 73 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) |
74 | BLANK(); | 74 | BLANK(); |
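The asm-offsets change above relies on offsetofend(): the constant now points at the end of the SYSENTER_stack member rather than at the end of the whole tss_struct. A stand-alone sketch (the struct is a stand-in, not the real tss_struct; offsetofend() is written out roughly as the kernel defines it):

#include <stddef.h>
#include <stdio.h>

#define offsetofend(TYPE, MEMBER) \
        (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct demo_tss {
        unsigned long x86_tss_sp0;
        unsigned long SYSENTER_stack[64];
        char io_bitmap[8192];           /* trailing members no longer affect the offset */
};

int main(void)
{
        printf("offsetofend(SYSENTER_stack) = %zu, sizeof(struct) = %zu\n",
               offsetofend(struct demo_tss, SYSENTER_stack),
               sizeof(struct demo_tss));
        return 0;
}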
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void) | |||
81 | #undef ENTRY | 81 | #undef ENTRY |
82 | 82 | ||
83 | OFFSET(TSS_ist, tss_struct, x86_tss.ist); | 83 | OFFSET(TSS_ist, tss_struct, x86_tss.ist); |
84 | OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); | ||
84 | BLANK(); | 85 | BLANK(); |
85 | 86 | ||
86 | DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); | 87 | DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c) | |||
711 | set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); | 711 | set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); |
712 | 712 | ||
713 | rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); | 713 | rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); |
714 | |||
715 | /* 3DNow or LM implies PREFETCHW */ | ||
716 | if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) | ||
717 | if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) | ||
718 | set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); | ||
714 | } | 719 | } |
715 | 720 | ||
716 | #ifdef CONFIG_X86_32 | 721 | #ifdef CONFIG_X86_32 |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..3f70538012e2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -959,38 +959,37 @@ static void identify_cpu(struct cpuinfo_x86 *c) | |||
959 | #endif | 959 | #endif |
960 | } | 960 | } |
961 | 961 | ||
962 | #ifdef CONFIG_X86_64 | 962 | /* |
963 | #ifdef CONFIG_IA32_EMULATION | 963 | * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions |
964 | /* May not be __init: called during resume */ | 964 | * on 32-bit kernels: |
965 | static void syscall32_cpu_init(void) | 965 | */ |
966 | { | ||
967 | /* Load these always in case some future AMD CPU supports | ||
968 | SYSENTER from compat mode too. */ | ||
969 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); | ||
970 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
971 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); | ||
972 | |||
973 | wrmsrl(MSR_CSTAR, ia32_cstar_target); | ||
974 | } | ||
975 | #endif /* CONFIG_IA32_EMULATION */ | ||
976 | #endif /* CONFIG_X86_64 */ | ||
977 | |||
978 | #ifdef CONFIG_X86_32 | 966 | #ifdef CONFIG_X86_32 |
979 | void enable_sep_cpu(void) | 967 | void enable_sep_cpu(void) |
980 | { | 968 | { |
981 | int cpu = get_cpu(); | 969 | struct tss_struct *tss; |
982 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 970 | int cpu; |
983 | 971 | ||
984 | if (!boot_cpu_has(X86_FEATURE_SEP)) { | 972 | cpu = get_cpu(); |
985 | put_cpu(); | 973 | tss = &per_cpu(cpu_tss, cpu); |
986 | return; | 974 | |
987 | } | 975 | if (!boot_cpu_has(X86_FEATURE_SEP)) |
976 | goto out; | ||
977 | |||
978 | /* | ||
979 | * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- | ||
980 | * see the big comment in struct x86_hw_tss's definition. | ||
981 | */ | ||
988 | 982 | ||
989 | tss->x86_tss.ss1 = __KERNEL_CS; | 983 | tss->x86_tss.ss1 = __KERNEL_CS; |
990 | tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; | 984 | wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); |
991 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | 985 | |
992 | wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); | 986 | wrmsr(MSR_IA32_SYSENTER_ESP, |
993 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); | 987 | (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), |
988 | 0); | ||
989 | |||
990 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); | ||
991 | |||
992 | out: | ||
994 | put_cpu(); | 993 | put_cpu(); |
995 | } | 994 | } |
996 | #endif | 995 | #endif |
@@ -1118,7 +1117,7 @@ static __init int setup_disablecpuid(char *arg) | |||
1118 | __setup("clearcpuid=", setup_disablecpuid); | 1117 | __setup("clearcpuid=", setup_disablecpuid); |
1119 | 1118 | ||
1120 | DEFINE_PER_CPU(unsigned long, kernel_stack) = | 1119 | DEFINE_PER_CPU(unsigned long, kernel_stack) = |
1121 | (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; | 1120 | (unsigned long)&init_thread_union + THREAD_SIZE; |
1122 | EXPORT_PER_CPU_SYMBOL(kernel_stack); | 1121 | EXPORT_PER_CPU_SYMBOL(kernel_stack); |
1123 | 1122 | ||
1124 | #ifdef CONFIG_X86_64 | 1123 | #ifdef CONFIG_X86_64 |
@@ -1130,8 +1129,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, | |||
1130 | irq_stack_union) __aligned(PAGE_SIZE) __visible; | 1129 | irq_stack_union) __aligned(PAGE_SIZE) __visible; |
1131 | 1130 | ||
1132 | /* | 1131 | /* |
1133 | * The following four percpu variables are hot. Align current_task to | 1132 | * The following percpu variables are hot. Align current_task to |
1134 | * cacheline size such that all four fall in the same cacheline. | 1133 | * cacheline size such that they fall in the same cacheline. |
1135 | */ | 1134 | */ |
1136 | DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = | 1135 | DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = |
1137 | &init_task; | 1136 | &init_task; |
@@ -1171,10 +1170,23 @@ void syscall_init(void) | |||
1171 | */ | 1170 | */ |
1172 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | 1171 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); |
1173 | wrmsrl(MSR_LSTAR, system_call); | 1172 | wrmsrl(MSR_LSTAR, system_call); |
1174 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
1175 | 1173 | ||
1176 | #ifdef CONFIG_IA32_EMULATION | 1174 | #ifdef CONFIG_IA32_EMULATION |
1177 | syscall32_cpu_init(); | 1175 | wrmsrl(MSR_CSTAR, ia32_cstar_target); |
1176 | /* | ||
1177 | * This only works on Intel CPUs. | ||
1178 | * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. | ||
1179 | * This does not cause SYSENTER to jump to the wrong location, because | ||
1180 | * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). | ||
1181 | */ | ||
1182 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); | ||
1183 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
1184 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); | ||
1185 | #else | ||
1186 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
1187 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); | ||
1188 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
1189 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); | ||
1178 | #endif | 1190 | #endif |
1179 | 1191 | ||
1180 | /* Flags to clear on syscall */ | 1192 | /* Flags to clear on syscall */ |
@@ -1226,6 +1238,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; | |||
1226 | EXPORT_PER_CPU_SYMBOL(__preempt_count); | 1238 | EXPORT_PER_CPU_SYMBOL(__preempt_count); |
1227 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | 1239 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); |
1228 | 1240 | ||
1241 | /* | ||
1242 | * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find | ||
1243 | * the top of the kernel stack. Use an extra percpu variable to track the | ||
1244 | * top of the kernel stack directly. | ||
1245 | */ | ||
1246 | DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = | ||
1247 | (unsigned long)&init_thread_union + THREAD_SIZE; | ||
1248 | EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); | ||
1249 | |||
1229 | #ifdef CONFIG_CC_STACKPROTECTOR | 1250 | #ifdef CONFIG_CC_STACKPROTECTOR |
1230 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); | 1251 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); |
1231 | #endif | 1252 | #endif |
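A hedged sketch of how such a per-CPU variable is meant to be consumed; whether the series adds exactly this helper is an assumption, the point is only that 32-bit code reads the dedicated variable instead of tss.sp0:

/*
 * Plausible accessor for the variable above (assumption, not necessarily
 * the exact helper added elsewhere in this series).
 */
static inline unsigned long current_top_of_stack(void)
{
#ifdef CONFIG_X86_64
        /* on 64-bit, sp0 in the per-CPU TSS is always the true stack top */
        return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
#else
        /* on 32-bit, vm86 can fudge tss.sp0, so use the dedicated variable */
        return this_cpu_read_stable(cpu_current_top_of_stack);
#endif
}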
@@ -1307,7 +1328,7 @@ void cpu_init(void) | |||
1307 | */ | 1328 | */ |
1308 | load_ucode_ap(); | 1329 | load_ucode_ap(); |
1309 | 1330 | ||
1310 | t = &per_cpu(init_tss, cpu); | 1331 | t = &per_cpu(cpu_tss, cpu); |
1311 | oist = &per_cpu(orig_ist, cpu); | 1332 | oist = &per_cpu(orig_ist, cpu); |
1312 | 1333 | ||
1313 | #ifdef CONFIG_NUMA | 1334 | #ifdef CONFIG_NUMA |
@@ -1391,7 +1412,7 @@ void cpu_init(void) | |||
1391 | { | 1412 | { |
1392 | int cpu = smp_processor_id(); | 1413 | int cpu = smp_processor_id(); |
1393 | struct task_struct *curr = current; | 1414 | struct task_struct *curr = current; |
1394 | struct tss_struct *t = &per_cpu(init_tss, cpu); | 1415 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); |
1395 | struct thread_struct *thread = &curr->thread; | 1416 | struct thread_struct *thread = &curr->thread; |
1396 | 1417 | ||
1397 | wait_for_master_cpu(cpu); | 1418 | wait_for_master_cpu(cpu); |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..e2888a3ad1e3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2147,24 +2147,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) | |||
2147 | static unsigned long code_segment_base(struct pt_regs *regs) | 2147 | static unsigned long code_segment_base(struct pt_regs *regs) |
2148 | { | 2148 | { |
2149 | /* | 2149 | /* |
2150 | * For IA32 we look at the GDT/LDT segment base to convert the | ||
2151 | * effective IP to a linear address. | ||
2152 | */ | ||
2153 | |||
2154 | #ifdef CONFIG_X86_32 | ||
2155 | /* | ||
2150 | * If we are in VM86 mode, add the segment offset to convert to a | 2156 | * If we are in VM86 mode, add the segment offset to convert to a |
2151 | * linear address. | 2157 | * linear address. |
2152 | */ | 2158 | */ |
2153 | if (regs->flags & X86_VM_MASK) | 2159 | if (regs->flags & X86_VM_MASK) |
2154 | return 0x10 * regs->cs; | 2160 | return 0x10 * regs->cs; |
2155 | 2161 | ||
2156 | /* | ||
2157 | * For IA32 we look at the GDT/LDT segment base to convert the | ||
2158 | * effective IP to a linear address. | ||
2159 | */ | ||
2160 | #ifdef CONFIG_X86_32 | ||
2161 | if (user_mode(regs) && regs->cs != __USER_CS) | 2162 | if (user_mode(regs) && regs->cs != __USER_CS) |
2162 | return get_segment_base(regs->cs); | 2163 | return get_segment_base(regs->cs); |
2163 | #else | 2164 | #else |
2164 | if (test_thread_flag(TIF_IA32)) { | 2165 | if (user_mode(regs) && !user_64bit_mode(regs) && |
2165 | if (user_mode(regs) && regs->cs != __USER32_CS) | 2166 | regs->cs != __USER32_CS) |
2166 | return get_segment_base(regs->cs); | 2167 | return get_segment_base(regs->cs); |
2167 | } | ||
2168 | #endif | 2168 | #endif |
2169 | return 0; | 2169 | return 0; |
2170 | } | 2170 | } |
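A tiny illustration of the vm86 branch above: in vm86/real mode the segment base is simply 16 * selector, so the linear code address is cs * 0x10 + ip (the values below are made up):

#include <stdio.h>

int main(void)
{
        unsigned long cs = 0xb800, ip = 0x0123;         /* made-up vm86 cs:ip */
        unsigned long base = 0x10 * cs;                 /* what code_segment_base() returns */
        unsigned long linear = base + ip;

        printf("cs:ip %04lx:%04lx -> base %#lx, linear %#lx\n", cs, ip, base, linear);
        return 0;
}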
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f90c716..c76d3e37c6e1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) | |||
105 | #ifdef CONFIG_X86_32 | 105 | #ifdef CONFIG_X86_32 |
106 | struct pt_regs fixed_regs; | 106 | struct pt_regs fixed_regs; |
107 | 107 | ||
108 | if (!user_mode_vm(regs)) { | 108 | if (!user_mode(regs)) { |
109 | crash_fixup_ss_esp(&fixed_regs, regs); | 109 | crash_fixup_ss_esp(&fixed_regs, regs); |
110 | regs = &fixed_regs; | 110 | regs = &fixed_regs; |
111 | } | 111 | } |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..ab3b65639a3e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -278,7 +278,7 @@ int __die(const char *str, struct pt_regs *regs, long err) | |||
278 | print_modules(); | 278 | print_modules(); |
279 | show_regs(regs); | 279 | show_regs(regs); |
280 | #ifdef CONFIG_X86_32 | 280 | #ifdef CONFIG_X86_32 |
281 | if (user_mode_vm(regs)) { | 281 | if (user_mode(regs)) { |
282 | sp = regs->sp; | 282 | sp = regs->sp; |
283 | ss = regs->ss & 0xffff; | 283 | ss = regs->ss & 0xffff; |
284 | } else { | 284 | } else { |
@@ -307,7 +307,7 @@ void die(const char *str, struct pt_regs *regs, long err) | |||
307 | unsigned long flags = oops_begin(); | 307 | unsigned long flags = oops_begin(); |
308 | int sig = SIGSEGV; | 308 | int sig = SIGSEGV; |
309 | 309 | ||
310 | if (!user_mode_vm(regs)) | 310 | if (!user_mode(regs)) |
311 | report_bug(regs->ip, regs); | 311 | report_bug(regs->ip, regs); |
312 | 312 | ||
313 | if (__die(str, regs, err)) | 313 | if (__die(str, regs, err)) |
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..39891ff50d03 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -123,13 +123,13 @@ void show_regs(struct pt_regs *regs) | |||
123 | int i; | 123 | int i; |
124 | 124 | ||
125 | show_regs_print_info(KERN_EMERG); | 125 | show_regs_print_info(KERN_EMERG); |
126 | __show_regs(regs, !user_mode_vm(regs)); | 126 | __show_regs(regs, !user_mode(regs)); |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * When in-kernel, we also print out the stack and code at the | 129 | * When in-kernel, we also print out the stack and code at the |
130 | * time of the fault.. | 130 | * time of the fault.. |
131 | */ | 131 | */ |
132 | if (!user_mode_vm(regs)) { | 132 | if (!user_mode(regs)) { |
133 | unsigned int code_prologue = code_bytes * 43 / 64; | 133 | unsigned int code_prologue = code_bytes * 43 / 64; |
134 | unsigned int code_len = code_bytes; | 134 | unsigned int code_len = code_bytes; |
135 | unsigned char c; | 135 | unsigned char c; |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..1c309763e321 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp: | |||
395 | /*CFI_REL_OFFSET cs, 0*/ | 395 | /*CFI_REL_OFFSET cs, 0*/ |
396 | /* | 396 | /* |
397 | * Push current_thread_info()->sysenter_return to the stack. | 397 | * Push current_thread_info()->sysenter_return to the stack. |
398 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words | 398 | * A tiny bit of offset fixup is necessary: TI_sysenter_return |
399 | * pushed above; +8 corresponds to copy_thread's esp0 setting. | 399 | * is relative to thread_info, which is at the bottom of the |
400 | * kernel stack page. 4*4 means the 4 words pushed above; | ||
401 | * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; | ||
402 | * and THREAD_SIZE takes us to the bottom. | ||
400 | */ | 403 | */ |
401 | pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) | 404 | pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) |
402 | CFI_REL_OFFSET eip, 0 | 405 | CFI_REL_OFFSET eip, 0 |
403 | 406 | ||
404 | pushl_cfi %eax | 407 | pushl_cfi %eax |
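A numeric check of that operand, with assumed example values (THREAD_SIZE = 8192 and TOP_OF_KERNEL_STACK_PADDING = 8, as on a 32-bit kernel without vm86; the TI_sysenter_return offset inside thread_info is left out for simplicity):

#include <stdio.h>

int main(void)
{
        unsigned long stack_top   = 0xc0402000;  /* hypothetical top of the kernel stack */
        unsigned long THREAD_SIZE = 8192;
        unsigned long PADDING     = 8;           /* TOP_OF_KERNEL_STACK_PADDING */

        /* esp after the four 4-byte pushes at the start of sysenter_past_esp */
        unsigned long esp = stack_top - PADDING - 4 * 4;

        /* thread_info lives at the bottom of the stack allocation */
        unsigned long thread_info = stack_top - THREAD_SIZE;

        /* the operand of the pushl above */
        unsigned long computed = esp - THREAD_SIZE + PADDING + 4 * 4;

        printf("thread_info = %#lx, esp + offset = %#lx\n", thread_info, computed);
        return 0;       /* the two values are equal */
}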
@@ -432,7 +435,7 @@ sysenter_after_call: | |||
432 | TRACE_IRQS_OFF | 435 | TRACE_IRQS_OFF |
433 | movl TI_flags(%ebp), %ecx | 436 | movl TI_flags(%ebp), %ecx |
434 | testl $_TIF_ALLWORK_MASK, %ecx | 437 | testl $_TIF_ALLWORK_MASK, %ecx |
435 | jne sysexit_audit | 438 | jnz sysexit_audit |
436 | sysenter_exit: | 439 | sysenter_exit: |
437 | /* if something modifies registers it must also disable sysexit */ | 440 | /* if something modifies registers it must also disable sysexit */ |
438 | movl PT_EIP(%esp), %edx | 441 | movl PT_EIP(%esp), %edx |
@@ -460,7 +463,7 @@ sysenter_audit: | |||
460 | 463 | ||
461 | sysexit_audit: | 464 | sysexit_audit: |
462 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx | 465 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx |
463 | jne syscall_exit_work | 466 | jnz syscall_exit_work |
464 | TRACE_IRQS_ON | 467 | TRACE_IRQS_ON |
465 | ENABLE_INTERRUPTS(CLBR_ANY) | 468 | ENABLE_INTERRUPTS(CLBR_ANY) |
466 | movl %eax,%edx /* second arg, syscall return value */ | 469 | movl %eax,%edx /* second arg, syscall return value */ |
@@ -472,7 +475,7 @@ sysexit_audit: | |||
472 | TRACE_IRQS_OFF | 475 | TRACE_IRQS_OFF |
473 | movl TI_flags(%ebp), %ecx | 476 | movl TI_flags(%ebp), %ecx |
474 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx | 477 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx |
475 | jne syscall_exit_work | 478 | jnz syscall_exit_work |
476 | movl PT_EAX(%esp),%eax /* reload syscall return value */ | 479 | movl PT_EAX(%esp),%eax /* reload syscall return value */ |
477 | jmp sysenter_exit | 480 | jmp sysenter_exit |
478 | #endif | 481 | #endif |
@@ -510,7 +513,7 @@ syscall_exit: | |||
510 | TRACE_IRQS_OFF | 513 | TRACE_IRQS_OFF |
511 | movl TI_flags(%ebp), %ecx | 514 | movl TI_flags(%ebp), %ecx |
512 | testl $_TIF_ALLWORK_MASK, %ecx # current->work | 515 | testl $_TIF_ALLWORK_MASK, %ecx # current->work |
513 | jne syscall_exit_work | 516 | jnz syscall_exit_work |
514 | 517 | ||
515 | restore_all: | 518 | restore_all: |
516 | TRACE_IRQS_IRET | 519 | TRACE_IRQS_IRET |
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and | |||
612 | #ifdef CONFIG_VM86 | 615 | #ifdef CONFIG_VM86 |
613 | testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) | 616 | testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) |
614 | movl %esp, %eax | 617 | movl %esp, %eax |
615 | jne work_notifysig_v86 # returning to kernel-space or | 618 | jnz work_notifysig_v86 # returning to kernel-space or |
616 | # vm86-space | 619 | # vm86-space |
617 | 1: | 620 | 1: |
618 | #else | 621 | #else |
@@ -720,43 +723,22 @@ END(sysenter_badsys) | |||
720 | .endm | 723 | .endm |
721 | 724 | ||
722 | /* | 725 | /* |
723 | * Build the entry stubs and pointer table with some assembler magic. | 726 | * Build the entry stubs with some assembler magic. |
724 | * We pack 7 stubs into a single 32-byte chunk, which will fit in a | 727 | * We pack 1 stub into every 8-byte block. |
725 | * single cache line on all modern x86 implementations. | ||
726 | */ | 728 | */ |
727 | .section .init.rodata,"a" | 729 | .align 8 |
728 | ENTRY(interrupt) | ||
729 | .section .entry.text, "ax" | ||
730 | .p2align 5 | ||
731 | .p2align CONFIG_X86_L1_CACHE_SHIFT | ||
732 | ENTRY(irq_entries_start) | 730 | ENTRY(irq_entries_start) |
733 | RING0_INT_FRAME | 731 | RING0_INT_FRAME |
734 | vector=FIRST_EXTERNAL_VECTOR | 732 | vector=FIRST_EXTERNAL_VECTOR |
735 | .rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 | 733 | .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) |
736 | .balign 32 | 734 | pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
737 | .rept 7 | 735 | vector=vector+1 |
738 | .if vector < FIRST_SYSTEM_VECTOR | 736 | jmp common_interrupt |
739 | .if vector <> FIRST_EXTERNAL_VECTOR | ||
740 | CFI_ADJUST_CFA_OFFSET -4 | 737 | CFI_ADJUST_CFA_OFFSET -4 |
741 | .endif | 738 | .align 8 |
742 | 1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ | 739 | .endr |
743 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | ||
744 | jmp 2f | ||
745 | .endif | ||
746 | .previous | ||
747 | .long 1b | ||
748 | .section .entry.text, "ax" | ||
749 | vector=vector+1 | ||
750 | .endif | ||
751 | .endr | ||
752 | 2: jmp common_interrupt | ||
753 | .endr | ||
754 | END(irq_entries_start) | 740 | END(irq_entries_start) |
755 | 741 | ||
756 | .previous | ||
757 | END(interrupt) | ||
758 | .previous | ||
759 | |||
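The stubs still push $(~vector + 0x80); a quick stand-alone check that this value fits in a signed byte for every external vector and that the vector round-trips after a -0x80 adjustment (the adjustment step models what the common interrupt code does, which is an assumption about code outside this hunk):

#include <stdio.h>

int main(void)
{
        for (int vector = 0x20; vector <= 0xff; vector++) {
                int pushed    = ~vector + 0x80;         /* what the stub pushes */
                int orig_ax   = pushed - 0x80;          /* after the -0x80 adjustment */
                int recovered = ~orig_ax & 0xff;

                if (pushed < -128 || pushed > 127 || recovered != vector) {
                        printf("vector %#x does not round-trip\n", vector);
                        return 1;
                }
        }
        printf("vectors 0x20..0xff all fit in a signed byte and round-trip\n");
        return 0;
}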
760 | /* | 742 | /* |
761 | * the CPU automatically disables interrupts when executing an IRQ vector, | 743 | * the CPU automatically disables interrupts when executing an IRQ vector, |
762 | * so IRQ-flags tracing has to follow that: | 744 | * so IRQ-flags tracing has to follow that: |
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error) | |||
816 | pushl_cfi $0 | 798 | pushl_cfi $0 |
817 | #ifdef CONFIG_X86_INVD_BUG | 799 | #ifdef CONFIG_X86_INVD_BUG |
818 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ | 800 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ |
819 | 661: pushl_cfi $do_general_protection | 801 | ALTERNATIVE "pushl_cfi $do_general_protection", \ |
820 | 662: | 802 | "pushl $do_simd_coprocessor_error", \ |
821 | .section .altinstructions,"a" | 803 | X86_FEATURE_XMM |
822 | altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f | ||
823 | .previous | ||
824 | .section .altinstr_replacement,"ax" | ||
825 | 663: pushl $do_simd_coprocessor_error | ||
826 | 664: | ||
827 | .previous | ||
828 | #else | 804 | #else |
829 | pushl_cfi $do_simd_coprocessor_error | 805 | pushl_cfi $do_simd_coprocessor_error |
830 | #endif | 806 | #endif |
@@ -1240,20 +1216,13 @@ error_code: | |||
1240 | /*CFI_REL_OFFSET es, 0*/ | 1216 | /*CFI_REL_OFFSET es, 0*/ |
1241 | pushl_cfi %ds | 1217 | pushl_cfi %ds |
1242 | /*CFI_REL_OFFSET ds, 0*/ | 1218 | /*CFI_REL_OFFSET ds, 0*/ |
1243 | pushl_cfi %eax | 1219 | pushl_cfi_reg eax |
1244 | CFI_REL_OFFSET eax, 0 | 1220 | pushl_cfi_reg ebp |
1245 | pushl_cfi %ebp | 1221 | pushl_cfi_reg edi |
1246 | CFI_REL_OFFSET ebp, 0 | 1222 | pushl_cfi_reg esi |
1247 | pushl_cfi %edi | 1223 | pushl_cfi_reg edx |
1248 | CFI_REL_OFFSET edi, 0 | 1224 | pushl_cfi_reg ecx |
1249 | pushl_cfi %esi | 1225 | pushl_cfi_reg ebx |
1250 | CFI_REL_OFFSET esi, 0 | ||
1251 | pushl_cfi %edx | ||
1252 | CFI_REL_OFFSET edx, 0 | ||
1253 | pushl_cfi %ecx | ||
1254 | CFI_REL_OFFSET ecx, 0 | ||
1255 | pushl_cfi %ebx | ||
1256 | CFI_REL_OFFSET ebx, 0 | ||
1257 | cld | 1226 | cld |
1258 | movl $(__KERNEL_PERCPU), %ecx | 1227 | movl $(__KERNEL_PERCPU), %ecx |
1259 | movl %ecx, %fs | 1228 | movl %ecx, %fs |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f0095a76c182..c7b238494b31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@ | |||
14 | * NOTE: This code handles signal-recognition, which happens every time | 14 | * NOTE: This code handles signal-recognition, which happens every time |
15 | * after an interrupt and after each system call. | 15 | * after an interrupt and after each system call. |
16 | * | 16 | * |
17 | * Normal syscalls and interrupts don't save a full stack frame, this is | ||
18 | * only done for syscall tracing, signals or fork/exec et.al. | ||
19 | * | ||
20 | * A note on terminology: | 17 | * A note on terminology: |
21 | * - top of stack: Architecture defined interrupt frame from SS to RIP | 18 | * - iret frame: Architecture defined interrupt frame from SS to RIP |
22 | * at the top of the kernel process stack. | 19 | * at the top of the kernel process stack. |
23 | * - partial stack frame: partially saved registers up to R11. | ||
24 | * - full stack frame: Like partial stack frame, but all register saved. | ||
25 | * | 20 | * |
26 | * Some macro usage: | 21 | * Some macro usage: |
27 | * - CFI macros are used to generate dwarf2 unwind information for better | 22 | * - CFI macros are used to generate dwarf2 unwind information for better |
28 | * backtraces. They don't change any code. | 23 | * backtraces. They don't change any code. |
29 | * - SAVE_ALL/RESTORE_ALL - Save/restore all registers | ||
30 | * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. | ||
31 | * There are unfortunately lots of special cases where some registers | ||
32 | * not touched. The macro is a big mess that should be cleaned up. | ||
33 | * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. | ||
34 | * Gives a full stack frame. | ||
35 | * - ENTRY/END Define functions in the symbol table. | 24 | * - ENTRY/END Define functions in the symbol table. |
36 | * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack | ||
37 | * frame that is otherwise undefined after a SYSCALL | ||
38 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. | 25 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. |
39 | * - idtentry - Define exception entry points. | 26 | * - idtentry - Define exception entry points. |
40 | */ | 27 | */ |
@@ -70,10 +57,6 @@ | |||
70 | .section .entry.text, "ax" | 57 | .section .entry.text, "ax" |
71 | 58 | ||
72 | 59 | ||
73 | #ifndef CONFIG_PREEMPT | ||
74 | #define retint_kernel retint_restore_args | ||
75 | #endif | ||
76 | |||
77 | #ifdef CONFIG_PARAVIRT | 60 | #ifdef CONFIG_PARAVIRT |
78 | ENTRY(native_usergs_sysret64) | 61 | ENTRY(native_usergs_sysret64) |
79 | swapgs | 62 | swapgs |
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64) | |||
82 | #endif /* CONFIG_PARAVIRT */ | 65 | #endif /* CONFIG_PARAVIRT */ |
83 | 66 | ||
84 | 67 | ||
85 | .macro TRACE_IRQS_IRETQ offset=ARGOFFSET | 68 | .macro TRACE_IRQS_IRETQ |
86 | #ifdef CONFIG_TRACE_IRQFLAGS | 69 | #ifdef CONFIG_TRACE_IRQFLAGS |
87 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | 70 | bt $9,EFLAGS(%rsp) /* interrupts off? */ |
88 | jnc 1f | 71 | jnc 1f |
89 | TRACE_IRQS_ON | 72 | TRACE_IRQS_ON |
90 | 1: | 73 | 1: |
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64) | |||
116 | call debug_stack_reset | 99 | call debug_stack_reset |
117 | .endm | 100 | .endm |
118 | 101 | ||
119 | .macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET | 102 | .macro TRACE_IRQS_IRETQ_DEBUG |
120 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | 103 | bt $9,EFLAGS(%rsp) /* interrupts off? */ |
121 | jnc 1f | 104 | jnc 1f |
122 | TRACE_IRQS_ON_DEBUG | 105 | TRACE_IRQS_ON_DEBUG |
123 | 1: | 106 | 1: |
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64) | |||
130 | #endif | 113 | #endif |
131 | 114 | ||
132 | /* | 115 | /* |
133 | * C code is not supposed to know about undefined top of stack. Every time | 116 | * empty frame |
134 | * a C function with an pt_regs argument is called from the SYSCALL based | ||
135 | * fast path FIXUP_TOP_OF_STACK is needed. | ||
136 | * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | ||
137 | * manipulation. | ||
138 | */ | ||
139 | |||
140 | /* %rsp:at FRAMEEND */ | ||
141 | .macro FIXUP_TOP_OF_STACK tmp offset=0 | ||
142 | movq PER_CPU_VAR(old_rsp),\tmp | ||
143 | movq \tmp,RSP+\offset(%rsp) | ||
144 | movq $__USER_DS,SS+\offset(%rsp) | ||
145 | movq $__USER_CS,CS+\offset(%rsp) | ||
146 | movq RIP+\offset(%rsp),\tmp /* get rip */ | ||
147 | movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ | ||
148 | movq R11+\offset(%rsp),\tmp /* get eflags */ | ||
149 | movq \tmp,EFLAGS+\offset(%rsp) | ||
150 | .endm | ||
151 | |||
152 | .macro RESTORE_TOP_OF_STACK tmp offset=0 | ||
153 | movq RSP+\offset(%rsp),\tmp | ||
154 | movq \tmp,PER_CPU_VAR(old_rsp) | ||
155 | movq EFLAGS+\offset(%rsp),\tmp | ||
156 | movq \tmp,R11+\offset(%rsp) | ||
157 | .endm | ||
158 | |||
159 | /* | ||
160 | * initial frame state for interrupts (and exceptions without error code) | ||
161 | */ | 117 | */ |
162 | .macro EMPTY_FRAME start=1 offset=0 | 118 | .macro EMPTY_FRAME start=1 offset=0 |
163 | .if \start | 119 | .if \start |
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64) | |||
173 | * initial frame state for interrupts (and exceptions without error code) | 129 | * initial frame state for interrupts (and exceptions without error code) |
174 | */ | 130 | */ |
175 | .macro INTR_FRAME start=1 offset=0 | 131 | .macro INTR_FRAME start=1 offset=0 |
176 | EMPTY_FRAME \start, SS+8+\offset-RIP | 132 | EMPTY_FRAME \start, 5*8+\offset |
177 | /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ | 133 | /*CFI_REL_OFFSET ss, 4*8+\offset*/ |
178 | CFI_REL_OFFSET rsp, RSP+\offset-RIP | 134 | CFI_REL_OFFSET rsp, 3*8+\offset |
179 | /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ | 135 | /*CFI_REL_OFFSET rflags, 2*8+\offset*/ |
180 | /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ | 136 | /*CFI_REL_OFFSET cs, 1*8+\offset*/ |
181 | CFI_REL_OFFSET rip, RIP+\offset-RIP | 137 | CFI_REL_OFFSET rip, 0*8+\offset |
182 | .endm | 138 | .endm |
183 | 139 | ||
184 | /* | 140 | /* |
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64) | |||
186 | * with vector already pushed) | 142 | * with vector already pushed) |
187 | */ | 143 | */ |
188 | .macro XCPT_FRAME start=1 offset=0 | 144 | .macro XCPT_FRAME start=1 offset=0 |
189 | INTR_FRAME \start, RIP+\offset-ORIG_RAX | 145 | INTR_FRAME \start, 1*8+\offset |
190 | .endm | ||
191 | |||
192 | /* | ||
193 | * frame that enables calling into C. | ||
194 | */ | ||
195 | .macro PARTIAL_FRAME start=1 offset=0 | ||
196 | XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET | ||
197 | CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET | ||
198 | CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET | ||
199 | CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET | ||
200 | CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET | ||
201 | CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET | ||
202 | CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET | ||
203 | CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET | ||
204 | CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET | ||
205 | CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET | ||
206 | .endm | 146 | .endm |
207 | 147 | ||
208 | /* | 148 | /* |
209 | * frame that enables passing a complete pt_regs to a C function. | 149 | * frame that enables passing a complete pt_regs to a C function. |
210 | */ | 150 | */ |
211 | .macro DEFAULT_FRAME start=1 offset=0 | 151 | .macro DEFAULT_FRAME start=1 offset=0 |
212 | PARTIAL_FRAME \start, R11+\offset-R15 | 152 | XCPT_FRAME \start, ORIG_RAX+\offset |
153 | CFI_REL_OFFSET rdi, RDI+\offset | ||
154 | CFI_REL_OFFSET rsi, RSI+\offset | ||
155 | CFI_REL_OFFSET rdx, RDX+\offset | ||
156 | CFI_REL_OFFSET rcx, RCX+\offset | ||
157 | CFI_REL_OFFSET rax, RAX+\offset | ||
158 | CFI_REL_OFFSET r8, R8+\offset | ||
159 | CFI_REL_OFFSET r9, R9+\offset | ||
160 | CFI_REL_OFFSET r10, R10+\offset | ||
161 | CFI_REL_OFFSET r11, R11+\offset | ||
213 | CFI_REL_OFFSET rbx, RBX+\offset | 162 | CFI_REL_OFFSET rbx, RBX+\offset |
214 | CFI_REL_OFFSET rbp, RBP+\offset | 163 | CFI_REL_OFFSET rbp, RBP+\offset |
215 | CFI_REL_OFFSET r12, R12+\offset | 164 | CFI_REL_OFFSET r12, R12+\offset |
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64) | |||
218 | CFI_REL_OFFSET r15, R15+\offset | 167 | CFI_REL_OFFSET r15, R15+\offset |
219 | .endm | 168 | .endm |
220 | 169 | ||
221 | ENTRY(save_paranoid) | ||
222 | XCPT_FRAME 1 RDI+8 | ||
223 | cld | ||
224 | movq %rdi, RDI+8(%rsp) | ||
225 | movq %rsi, RSI+8(%rsp) | ||
226 | movq_cfi rdx, RDX+8 | ||
227 | movq_cfi rcx, RCX+8 | ||
228 | movq_cfi rax, RAX+8 | ||
229 | movq %r8, R8+8(%rsp) | ||
230 | movq %r9, R9+8(%rsp) | ||
231 | movq %r10, R10+8(%rsp) | ||
232 | movq %r11, R11+8(%rsp) | ||
233 | movq_cfi rbx, RBX+8 | ||
234 | movq %rbp, RBP+8(%rsp) | ||
235 | movq %r12, R12+8(%rsp) | ||
236 | movq %r13, R13+8(%rsp) | ||
237 | movq %r14, R14+8(%rsp) | ||
238 | movq %r15, R15+8(%rsp) | ||
239 | movl $1,%ebx | ||
240 | movl $MSR_GS_BASE,%ecx | ||
241 | rdmsr | ||
242 | testl %edx,%edx | ||
243 | js 1f /* negative -> in kernel */ | ||
244 | SWAPGS | ||
245 | xorl %ebx,%ebx | ||
246 | 1: ret | ||
247 | CFI_ENDPROC | ||
248 | END(save_paranoid) | ||
249 | |||
250 | /* | 170 | /* |
251 | * A newly forked process directly context switches into this address. | 171 | * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. |
252 | * | 172 | * |
253 | * rdi: prev task we switched from | 173 | * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
254 | */ | 174 | * then loads new ss, cs, and rip from previously programmed MSRs. |
255 | ENTRY(ret_from_fork) | 175 | * rflags gets masked by a value from another MSR (so CLD and CLAC |
256 | DEFAULT_FRAME | 176 | * are not needed). SYSCALL does not save anything on the stack |
257 | 177 | * and does not change rsp. | |
258 | LOCK ; btr $TIF_FORK,TI_flags(%r8) | ||
259 | |||
260 | pushq_cfi $0x0002 | ||
261 | popfq_cfi # reset kernel eflags | ||
262 | |||
263 | call schedule_tail # rdi: 'prev' task parameter | ||
264 | |||
265 | GET_THREAD_INFO(%rcx) | ||
266 | |||
267 | RESTORE_REST | ||
268 | |||
269 | testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? | ||
270 | jz 1f | ||
271 | |||
272 | /* | ||
273 | * By the time we get here, we have no idea whether our pt_regs, | ||
274 | * ti flags, and ti status came from the 64-bit SYSCALL fast path, | ||
275 | * the slow path, or one of the ia32entry paths. | ||
276 | * Use int_ret_from_sys_call to return, since it can safely handle | ||
277 | * all of the above. | ||
278 | */ | ||
279 | jmp int_ret_from_sys_call | ||
280 | |||
281 | 1: | ||
282 | subq $REST_SKIP, %rsp # leave space for volatiles | ||
283 | CFI_ADJUST_CFA_OFFSET REST_SKIP | ||
284 | movq %rbp, %rdi | ||
285 | call *%rbx | ||
286 | movl $0, RAX(%rsp) | ||
287 | RESTORE_REST | ||
288 | jmp int_ret_from_sys_call | ||
289 | CFI_ENDPROC | ||
290 | END(ret_from_fork) | ||
291 | |||
292 | /* | ||
293 | * System call entry. Up to 6 arguments in registers are supported. | ||
294 | * | 178 | * |
295 | * SYSCALL does not save anything on the stack and does not change the | 179 | * Registers on entry: |
296 | * stack pointer. However, it does mask the flags register for us, so | ||
297 | * CLD and CLAC are not needed. | ||
298 | */ | ||
299 | |||
300 | /* | ||
301 | * Register setup: | ||
302 | * rax system call number | 180 | * rax system call number |
181 | * rcx return address | ||
182 | * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) | ||
303 | * rdi arg0 | 183 | * rdi arg0 |
304 | * rcx return address for syscall/sysret, C arg3 | ||
305 | * rsi arg1 | 184 | * rsi arg1 |
306 | * rdx arg2 | 185 | * rdx arg2 |
307 | * r10 arg3 (--> moved to rcx for C) | 186 | * r10 arg3 (needs to be moved to rcx to conform to C ABI) |
308 | * r8 arg4 | 187 | * r8 arg4 |
309 | * r9 arg5 | 188 | * r9 arg5 |
310 | * r11 eflags for syscall/sysret, temporary for C | 189 | * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) |
311 | * r12-r15,rbp,rbx saved by C code, not touched. | ||
312 | * | 190 | * |
313 | * Interrupts are off on entry. | ||
314 | * Only called from user space. | 191 | * Only called from user space. |
315 | * | 192 | * |
316 | * XXX if we had a free scratch register we could save the RSP into the stack frame | 193 | * When user can change pt_regs->foo always force IRET. That is because |
317 | * and report it properly in ps. Unfortunately we haven't. | ||
318 | * | ||
319 | * When user can change the frames always force IRET. That is because | ||
320 | * it deals with uncanonical addresses better. SYSRET has trouble | 194 | * it deals with uncanonical addresses better. SYSRET has trouble |
321 | * with them due to bugs in both AMD and Intel CPUs. | 195 | * with them due to bugs in both AMD and Intel CPUs. |
322 | */ | 196 | */ |
@@ -324,9 +198,15 @@ END(ret_from_fork) | |||
324 | ENTRY(system_call) | 198 | ENTRY(system_call) |
325 | CFI_STARTPROC simple | 199 | CFI_STARTPROC simple |
326 | CFI_SIGNAL_FRAME | 200 | CFI_SIGNAL_FRAME |
327 | CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET | 201 | CFI_DEF_CFA rsp,0 |
328 | CFI_REGISTER rip,rcx | 202 | CFI_REGISTER rip,rcx |
329 | /*CFI_REGISTER rflags,r11*/ | 203 | /*CFI_REGISTER rflags,r11*/ |
204 | |||
205 | /* | ||
206 | * Interrupts are off on entry. | ||
207 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
208 | * it is too small to ever cause noticeable irq latency. | ||
209 | */ | ||
330 | SWAPGS_UNSAFE_STACK | 210 | SWAPGS_UNSAFE_STACK |
331 | /* | 211 | /* |
332 | * A hypervisor implementation might want to use a label | 212 | * A hypervisor implementation might want to use a label |
@@ -335,18 +215,38 @@ ENTRY(system_call) | |||
335 | */ | 215 | */ |
336 | GLOBAL(system_call_after_swapgs) | 216 | GLOBAL(system_call_after_swapgs) |
337 | 217 | ||
338 | movq %rsp,PER_CPU_VAR(old_rsp) | 218 | movq %rsp,PER_CPU_VAR(rsp_scratch) |
339 | movq PER_CPU_VAR(kernel_stack),%rsp | 219 | movq PER_CPU_VAR(kernel_stack),%rsp |
220 | |||
221 | /* Construct struct pt_regs on stack */ | ||
222 | pushq_cfi $__USER_DS /* pt_regs->ss */ | ||
223 | pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ | ||
340 | /* | 224 | /* |
341 | * No need to follow this irqs off/on section - it's straight | 225 | * Re-enable interrupts. |
342 | * and short: | 226 | * We use 'rsp_scratch' as a scratch space, hence irq-off block above |
227 | * must execute atomically in the face of possible interrupt-driven | ||
228 | * task preemption. We must enable interrupts only after we're done | ||
229 | * with using rsp_scratch: | ||
343 | */ | 230 | */ |
344 | ENABLE_INTERRUPTS(CLBR_NONE) | 231 | ENABLE_INTERRUPTS(CLBR_NONE) |
345 | SAVE_ARGS 8, 0, rax_enosys=1 | 232 | pushq_cfi %r11 /* pt_regs->flags */ |
346 | movq_cfi rax,(ORIG_RAX-ARGOFFSET) | 233 | pushq_cfi $__USER_CS /* pt_regs->cs */ |
347 | movq %rcx,RIP-ARGOFFSET(%rsp) | 234 | pushq_cfi %rcx /* pt_regs->ip */ |
348 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | 235 | CFI_REL_OFFSET rip,0 |
349 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 236 | pushq_cfi_reg rax /* pt_regs->orig_ax */ |
237 | pushq_cfi_reg rdi /* pt_regs->di */ | ||
238 | pushq_cfi_reg rsi /* pt_regs->si */ | ||
239 | pushq_cfi_reg rdx /* pt_regs->dx */ | ||
240 | pushq_cfi_reg rcx /* pt_regs->cx */ | ||
241 | pushq_cfi $-ENOSYS /* pt_regs->ax */ | ||
242 | pushq_cfi_reg r8 /* pt_regs->r8 */ | ||
243 | pushq_cfi_reg r9 /* pt_regs->r9 */ | ||
244 | pushq_cfi_reg r10 /* pt_regs->r10 */ | ||
245 | pushq_cfi_reg r11 /* pt_regs->r11 */ | ||
246 | sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ | ||
247 | CFI_ADJUST_CFA_OFFSET 6*8 | ||
248 | |||
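The push sequence above builds a struct pt_regs from its highest field (ss) downward; a stand-alone copy of the x86_64 layout for reference (field names follow arch/x86/include/asm/ptrace.h, the struct itself is a simplified stand-in):

#include <stddef.h>
#include <stdio.h>

struct pt_regs_sketch {
        /* "extra" regs r15..rbx: not saved on the fast path (the 6*8 gap) */
        unsigned long r15, r14, r13, r12, bp, bx;
        /* regs pushed explicitly above, r11 last (lowest address of this group) */
        unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;
        unsigned long orig_ax;                  /* syscall number */
        /* state saved by hand from SYSCALL: ip, cs, flags, user sp, ss */
        unsigned long ip, cs, flags, sp, ss;
};

int main(void)
{
        printf("di %zu, orig_ax %zu, ip %zu, ss %zu, SIZEOF_PTREGS %zu\n",
               offsetof(struct pt_regs_sketch, di),
               offsetof(struct pt_regs_sketch, orig_ax),
               offsetof(struct pt_regs_sketch, ip),
               offsetof(struct pt_regs_sketch, ss),
               sizeof(struct pt_regs_sketch));
        return 0;
}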
249 | testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) | ||
350 | jnz tracesys | 250 | jnz tracesys |
351 | system_call_fastpath: | 251 | system_call_fastpath: |
352 | #if __SYSCALL_MASK == ~0 | 252 | #if __SYSCALL_MASK == ~0 |
@@ -355,18 +255,21 @@ system_call_fastpath: | |||
355 | andl $__SYSCALL_MASK,%eax | 255 | andl $__SYSCALL_MASK,%eax |
356 | cmpl $__NR_syscall_max,%eax | 256 | cmpl $__NR_syscall_max,%eax |
357 | #endif | 257 | #endif |
358 | ja ret_from_sys_call /* and return regs->ax */ | 258 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ |
359 | movq %r10,%rcx | 259 | movq %r10,%rcx |
360 | call *sys_call_table(,%rax,8) # XXX: rip relative | 260 | call *sys_call_table(,%rax,8) |
361 | movq %rax,RAX-ARGOFFSET(%rsp) | 261 | movq %rax,RAX(%rsp) |
262 | 1: | ||
362 | /* | 263 | /* |
363 | * Syscall return path ending with SYSRET (fast path) | 264 | * Syscall return path ending with SYSRET (fast path). |
364 | * Has incomplete stack frame and undefined top of stack. | 265 | * Has incompletely filled pt_regs. |
365 | */ | 266 | */ |
366 | ret_from_sys_call: | ||
367 | LOCKDEP_SYS_EXIT | 267 | LOCKDEP_SYS_EXIT |
268 | /* | ||
269 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
270 | * it is too small to ever cause noticeable irq latency. | ||
271 | */ | ||
368 | DISABLE_INTERRUPTS(CLBR_NONE) | 272 | DISABLE_INTERRUPTS(CLBR_NONE) |
369 | TRACE_IRQS_OFF | ||
370 | 273 | ||
371 | /* | 274 | /* |
372 | * We must check ti flags with interrupts (or at least preemption) | 275 | * We must check ti flags with interrupts (or at least preemption) |
@@ -376,72 +279,73 @@ ret_from_sys_call: | |||
376 | * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is | 279 | * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is |
377 | * very bad. | 280 | * very bad. |
378 | */ | 281 | */ |
379 | testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 282 | testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
380 | jnz int_ret_from_sys_call_fixup /* Go the the slow path */ | 283 | jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ |
381 | 284 | ||
382 | CFI_REMEMBER_STATE | 285 | CFI_REMEMBER_STATE |
383 | /* | 286 | |
384 | * sysretq will re-enable interrupts: | 287 | RESTORE_C_REGS_EXCEPT_RCX_R11 |
385 | */ | 288 | movq RIP(%rsp),%rcx |
386 | TRACE_IRQS_ON | ||
387 | movq RIP-ARGOFFSET(%rsp),%rcx | ||
388 | CFI_REGISTER rip,rcx | 289 | CFI_REGISTER rip,rcx |
389 | RESTORE_ARGS 1,-ARG_SKIP,0 | 290 | movq EFLAGS(%rsp),%r11 |
390 | /*CFI_REGISTER rflags,r11*/ | 291 | /*CFI_REGISTER rflags,r11*/ |
391 | movq PER_CPU_VAR(old_rsp), %rsp | 292 | movq RSP(%rsp),%rsp |
293 | /* | ||
294 | * 64bit SYSRET restores rip from rcx, | ||
295 | * rflags from r11 (but RF and VM bits are forced to 0), | ||
296 | * cs and ss are loaded from MSRs. | ||
297 | * Restoration of rflags re-enables interrupts. | ||
298 | */ | ||
392 | USERGS_SYSRET64 | 299 | USERGS_SYSRET64 |
393 | 300 | ||
394 | CFI_RESTORE_STATE | 301 | CFI_RESTORE_STATE |
395 | 302 | ||
396 | int_ret_from_sys_call_fixup: | 303 | /* Do syscall entry tracing */ |
397 | FIXUP_TOP_OF_STACK %r11, -ARGOFFSET | ||
398 | jmp int_ret_from_sys_call_irqs_off | ||
399 | |||
400 | /* Do syscall tracing */ | ||
401 | tracesys: | 304 | tracesys: |
402 | leaq -REST_SKIP(%rsp), %rdi | 305 | movq %rsp, %rdi |
403 | movq $AUDIT_ARCH_X86_64, %rsi | 306 | movl $AUDIT_ARCH_X86_64, %esi |
404 | call syscall_trace_enter_phase1 | 307 | call syscall_trace_enter_phase1 |
405 | test %rax, %rax | 308 | test %rax, %rax |
406 | jnz tracesys_phase2 /* if needed, run the slow path */ | 309 | jnz tracesys_phase2 /* if needed, run the slow path */ |
407 | LOAD_ARGS 0 /* else restore clobbered regs */ | 310 | RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ |
311 | movq ORIG_RAX(%rsp), %rax | ||
408 | jmp system_call_fastpath /* and return to the fast path */ | 312 | jmp system_call_fastpath /* and return to the fast path */ |
409 | 313 | ||
410 | tracesys_phase2: | 314 | tracesys_phase2: |
411 | SAVE_REST | 315 | SAVE_EXTRA_REGS |
412 | FIXUP_TOP_OF_STACK %rdi | ||
413 | movq %rsp, %rdi | 316 | movq %rsp, %rdi |
414 | movq $AUDIT_ARCH_X86_64, %rsi | 317 | movl $AUDIT_ARCH_X86_64, %esi |
415 | movq %rax,%rdx | 318 | movq %rax,%rdx |
416 | call syscall_trace_enter_phase2 | 319 | call syscall_trace_enter_phase2 |
417 | 320 | ||
418 | /* | 321 | /* |
419 | * Reload arg registers from stack in case ptrace changed them. | 322 | * Reload registers from stack in case ptrace changed them. |
420 | * We don't reload %rax because syscall_trace_entry_phase2() returned | 323 | * We don't reload %rax because syscall_trace_entry_phase2() returned |
421 | * the value it wants us to use in the table lookup. | 324 | * the value it wants us to use in the table lookup. |
422 | */ | 325 | */ |
423 | LOAD_ARGS ARGOFFSET, 1 | 326 | RESTORE_C_REGS_EXCEPT_RAX |
424 | RESTORE_REST | 327 | RESTORE_EXTRA_REGS |
425 | #if __SYSCALL_MASK == ~0 | 328 | #if __SYSCALL_MASK == ~0 |
426 | cmpq $__NR_syscall_max,%rax | 329 | cmpq $__NR_syscall_max,%rax |
427 | #else | 330 | #else |
428 | andl $__SYSCALL_MASK,%eax | 331 | andl $__SYSCALL_MASK,%eax |
429 | cmpl $__NR_syscall_max,%eax | 332 | cmpl $__NR_syscall_max,%eax |
430 | #endif | 333 | #endif |
431 | ja int_ret_from_sys_call /* RAX(%rsp) is already set */ | 334 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ |
432 | movq %r10,%rcx /* fixup for C */ | 335 | movq %r10,%rcx /* fixup for C */ |
433 | call *sys_call_table(,%rax,8) | 336 | call *sys_call_table(,%rax,8) |
434 | movq %rax,RAX-ARGOFFSET(%rsp) | 337 | movq %rax,RAX(%rsp) |
435 | /* Use IRET because user could have changed frame */ | 338 | 1: |
339 | /* Use IRET because user could have changed pt_regs->foo */ | ||
436 | 340 | ||
437 | /* | 341 | /* |
438 | * Syscall return path ending with IRET. | 342 | * Syscall return path ending with IRET. |
439 | * Has correct top of stack, but partial stack frame. | 343 | * Has correct iret frame. |
440 | */ | 344 | */ |
441 | GLOBAL(int_ret_from_sys_call) | 345 | GLOBAL(int_ret_from_sys_call) |
442 | DISABLE_INTERRUPTS(CLBR_NONE) | 346 | DISABLE_INTERRUPTS(CLBR_NONE) |
347 | int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ | ||
443 | TRACE_IRQS_OFF | 348 | TRACE_IRQS_OFF |
444 | int_ret_from_sys_call_irqs_off: | ||
445 | movl $_TIF_ALLWORK_MASK,%edi | 349 | movl $_TIF_ALLWORK_MASK,%edi |
446 | /* edi: mask to check */ | 350 | /* edi: mask to check */ |
447 | GLOBAL(int_with_check) | 351 | GLOBAL(int_with_check) |
@@ -450,8 +354,8 @@ GLOBAL(int_with_check) | |||
450 | movl TI_flags(%rcx),%edx | 354 | movl TI_flags(%rcx),%edx |
451 | andl %edi,%edx | 355 | andl %edi,%edx |
452 | jnz int_careful | 356 | jnz int_careful |
453 | andl $~TS_COMPAT,TI_status(%rcx) | 357 | andl $~TS_COMPAT,TI_status(%rcx) |
454 | jmp retint_swapgs | 358 | jmp syscall_return |
455 | 359 | ||
456 | /* Either reschedule or signal or syscall exit tracking needed. */ | 360 | /* Either reschedule or signal or syscall exit tracking needed. */ |
457 | /* First do a reschedule test. */ | 361 | /* First do a reschedule test. */ |
@@ -468,12 +372,11 @@ int_careful: | |||
468 | TRACE_IRQS_OFF | 372 | TRACE_IRQS_OFF |
469 | jmp int_with_check | 373 | jmp int_with_check |
470 | 374 | ||
471 | /* handle signals and tracing -- both require a full stack frame */ | 375 | /* handle signals and tracing -- both require a full pt_regs */ |
472 | int_very_careful: | 376 | int_very_careful: |
473 | TRACE_IRQS_ON | 377 | TRACE_IRQS_ON |
474 | ENABLE_INTERRUPTS(CLBR_NONE) | 378 | ENABLE_INTERRUPTS(CLBR_NONE) |
475 | int_check_syscall_exit_work: | 379 | SAVE_EXTRA_REGS |
476 | SAVE_REST | ||
477 | /* Check for syscall exit trace */ | 380 | /* Check for syscall exit trace */ |
478 | testl $_TIF_WORK_SYSCALL_EXIT,%edx | 381 | testl $_TIF_WORK_SYSCALL_EXIT,%edx |
479 | jz int_signal | 382 | jz int_signal |
@@ -492,86 +395,192 @@ int_signal: | |||
492 | call do_notify_resume | 395 | call do_notify_resume |
493 | 1: movl $_TIF_WORK_MASK,%edi | 396 | 1: movl $_TIF_WORK_MASK,%edi |
494 | int_restore_rest: | 397 | int_restore_rest: |
495 | RESTORE_REST | 398 | RESTORE_EXTRA_REGS |
496 | DISABLE_INTERRUPTS(CLBR_NONE) | 399 | DISABLE_INTERRUPTS(CLBR_NONE) |
497 | TRACE_IRQS_OFF | 400 | TRACE_IRQS_OFF |
498 | jmp int_with_check | 401 | jmp int_with_check |
402 | |||
403 | syscall_return: | ||
404 | /* The IRETQ could re-enable interrupts: */ | ||
405 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
406 | TRACE_IRQS_IRETQ | ||
407 | |||
408 | /* | ||
409 | * Try to use SYSRET instead of IRET if we're returning to | ||
410 | * a completely clean 64-bit userspace context. | ||
411 | */ | ||
412 | movq RCX(%rsp),%rcx | ||
413 | cmpq %rcx,RIP(%rsp) /* RCX == RIP */ | ||
414 | jne opportunistic_sysret_failed | ||
415 | |||
416 | /* | ||
417 | * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP | ||
418 | * in kernel space. This essentially lets the user take over | ||
419 | * the kernel, since userspace controls RSP. It's not worth | ||
420 | * testing for canonicalness exactly -- this check detects any | ||
421 | * of the 17 high bits set, which is true for non-canonical | ||
422 | * or kernel addresses. (This will pessimize vsyscall=native. | ||
423 | * Big deal.) | ||
424 | * | ||
425 | * If virtual addresses ever become wider, this will need | ||
426 | * to be updated to remain correct on both old and new CPUs. | ||
427 | */ | ||
428 | .ifne __VIRTUAL_MASK_SHIFT - 47 | ||
429 | .error "virtual address width changed -- SYSRET checks need update" | ||
430 | .endif | ||
431 | shr $__VIRTUAL_MASK_SHIFT, %rcx | ||
432 | jnz opportunistic_sysret_failed | ||
433 | |||
434 | cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ | ||
435 | jne opportunistic_sysret_failed | ||
436 | |||
437 | movq R11(%rsp),%r11 | ||
438 | cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ | ||
439 | jne opportunistic_sysret_failed | ||
440 | |||
441 | /* | ||
442 | * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, | ||
443 | * restoring TF results in a trap from userspace immediately after | ||
444 | * SYSRET. This would cause an infinite loop whenever #DB happens | ||
445 | * with register state that satisfies the opportunistic SYSRET | ||
446 | * conditions. For example, single-stepping this user code: | ||
447 | * | ||
448 | * movq $stuck_here,%rcx | ||
449 | * pushfq | ||
450 | * popq %r11 | ||
451 | * stuck_here: | ||
452 | * | ||
453 | * would never get past 'stuck_here'. | ||
454 | */ | ||
455 | testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 | ||
456 | jnz opportunistic_sysret_failed | ||
457 | |||
458 | /* nothing to check for RSP */ | ||
459 | |||
460 | cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ | ||
461 | jne opportunistic_sysret_failed | ||
462 | |||
463 | /* | ||
464 | * We win! This label is here just for ease of understanding | ||
465 | * perf profiles. Nothing jumps here. | ||
466 | */ | ||
467 | syscall_return_via_sysret: | ||
468 | CFI_REMEMBER_STATE | ||
469 | /* r11 is already restored (see code above) */ | ||
470 | RESTORE_C_REGS_EXCEPT_R11 | ||
471 | movq RSP(%rsp),%rsp | ||
472 | USERGS_SYSRET64 | ||
473 | CFI_RESTORE_STATE | ||
474 | |||
475 | opportunistic_sysret_failed: | ||
476 | SWAPGS | ||
477 | jmp restore_c_regs_and_iret | ||
499 | CFI_ENDPROC | 478 | CFI_ENDPROC |
500 | END(system_call) | 479 | END(system_call) |
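The opportunistic-SYSRET path added above is a chain of six cheap tests on the saved frame. A minimal C restatement of the predicate, for illustration only: the selector and flag constants are assumptions standing in for __USER_CS, __USER_DS, X86_EFLAGS_RF/TF and __VIRTUAL_MASK_SHIFT, and struct sysret_frame is a made-up container for the pt_regs fields being checked.

#include <stdbool.h>
#include <stdint.h>

#define USER_CS              0x33       /* assumed __USER_CS */
#define USER_DS              0x2b       /* assumed __USER_DS (checked against saved SS) */
#define X86_EFLAGS_TF        0x0100
#define X86_EFLAGS_RF        0x10000
#define VIRTUAL_MASK_SHIFT   47         /* assumed __VIRTUAL_MASK_SHIFT */

struct sysret_frame {                   /* hypothetical: only the fields the asm looks at */
        uint64_t cx, r11, ip, cs, flags, ss;
};

static bool can_return_via_sysret(const struct sysret_frame *f)
{
        if (f->cx != f->ip)                             /* SYSRET reloads RIP from RCX */
                return false;
        if (f->cx >> VIRTUAL_MASK_SHIFT)                /* any of the 17 high bits set */
                return false;
        if (f->cs != USER_CS || f->ss != USER_DS)       /* SYSRET forces these selectors */
                return false;
        if (f->r11 != f->flags)                         /* SYSRET reloads RFLAGS from R11 */
                return false;
        if (f->r11 & (X86_EFLAGS_RF | X86_EFLAGS_TF))   /* RF can't be restored, TF would re-trap */
                return false;
        return true;                                    /* nothing to check for RSP */
}

Any failed test falls through to opportunistic_sysret_failed, which does SWAPGS and takes the IRET return path exactly as before.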
501 | 480 | ||
481 | |||
502 | .macro FORK_LIKE func | 482 | .macro FORK_LIKE func |
503 | ENTRY(stub_\func) | 483 | ENTRY(stub_\func) |
504 | CFI_STARTPROC | 484 | CFI_STARTPROC |
505 | popq %r11 /* save return address */ | 485 | DEFAULT_FRAME 0, 8 /* offset 8: return address */ |
506 | PARTIAL_FRAME 0 | 486 | SAVE_EXTRA_REGS 8 |
507 | SAVE_REST | 487 | jmp sys_\func |
508 | pushq %r11 /* put it back on stack */ | ||
509 | FIXUP_TOP_OF_STACK %r11, 8 | ||
510 | DEFAULT_FRAME 0 8 /* offset 8: return address */ | ||
511 | call sys_\func | ||
512 | RESTORE_TOP_OF_STACK %r11, 8 | ||
513 | ret $REST_SKIP /* pop extended registers */ | ||
514 | CFI_ENDPROC | 488 | CFI_ENDPROC |
515 | END(stub_\func) | 489 | END(stub_\func) |
516 | .endm | 490 | .endm |
517 | 491 | ||
518 | .macro FIXED_FRAME label,func | ||
519 | ENTRY(\label) | ||
520 | CFI_STARTPROC | ||
521 | PARTIAL_FRAME 0 8 /* offset 8: return address */ | ||
522 | FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET | ||
523 | call \func | ||
524 | RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET | ||
525 | ret | ||
526 | CFI_ENDPROC | ||
527 | END(\label) | ||
528 | .endm | ||
529 | |||
530 | FORK_LIKE clone | 492 | FORK_LIKE clone |
531 | FORK_LIKE fork | 493 | FORK_LIKE fork |
532 | FORK_LIKE vfork | 494 | FORK_LIKE vfork |
533 | FIXED_FRAME stub_iopl, sys_iopl | ||
534 | 495 | ||
535 | ENTRY(stub_execve) | 496 | ENTRY(stub_execve) |
536 | CFI_STARTPROC | 497 | CFI_STARTPROC |
537 | addq $8, %rsp | 498 | DEFAULT_FRAME 0, 8 |
538 | PARTIAL_FRAME 0 | 499 | call sys_execve |
539 | SAVE_REST | 500 | return_from_execve: |
540 | FIXUP_TOP_OF_STACK %r11 | 501 | testl %eax, %eax |
541 | call sys_execve | 502 | jz 1f |
542 | movq %rax,RAX(%rsp) | 503 | /* exec failed, can use fast SYSRET code path in this case */ |
543 | RESTORE_REST | 504 | ret |
544 | jmp int_ret_from_sys_call | 505 | 1: |
506 | /* must use IRET code path (pt_regs->cs may have changed) */ | ||
507 | addq $8, %rsp | ||
508 | CFI_ADJUST_CFA_OFFSET -8 | ||
509 | ZERO_EXTRA_REGS | ||
510 | movq %rax,RAX(%rsp) | ||
511 | jmp int_ret_from_sys_call | ||
545 | CFI_ENDPROC | 512 | CFI_ENDPROC |
546 | END(stub_execve) | 513 | END(stub_execve) |
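The shared return_from_execve tail encodes a single decision: only a successful execve can have rewritten the whole frame (including pt_regs->cs), so only that case is forced down the IRET path, while a failure keeps the fast SYSRET return. A trivial sketch of the test, purely for illustration:

#include <stdbool.h>

static bool execve_needs_iret_path(long sys_execve_ret)
{
        /* non-zero: exec failed, registers untouched, fast SYSRET return is fine */
        /* zero: exec succeeded, CS/SS and the rest of pt_regs may differ, use IRET */
        return sys_execve_ret == 0;
}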
547 | 514 | /* | |
548 | ENTRY(stub_execveat) | 515 | * Remaining execve stubs are only 7 bytes long. |
516 | * ENTRY() often aligns to 16 bytes, which in this case has no benefits. | ||
517 | */ | ||
518 | .align 8 | ||
519 | GLOBAL(stub_execveat) | ||
549 | CFI_STARTPROC | 520 | CFI_STARTPROC |
550 | addq $8, %rsp | 521 | DEFAULT_FRAME 0, 8 |
551 | PARTIAL_FRAME 0 | 522 | call sys_execveat |
552 | SAVE_REST | 523 | jmp return_from_execve |
553 | FIXUP_TOP_OF_STACK %r11 | ||
554 | call sys_execveat | ||
555 | RESTORE_TOP_OF_STACK %r11 | ||
556 | movq %rax,RAX(%rsp) | ||
557 | RESTORE_REST | ||
558 | jmp int_ret_from_sys_call | ||
559 | CFI_ENDPROC | 524 | CFI_ENDPROC |
560 | END(stub_execveat) | 525 | END(stub_execveat) |
561 | 526 | ||
527 | #ifdef CONFIG_X86_X32_ABI | ||
528 | .align 8 | ||
529 | GLOBAL(stub_x32_execve) | ||
530 | CFI_STARTPROC | ||
531 | DEFAULT_FRAME 0, 8 | ||
532 | call compat_sys_execve | ||
533 | jmp return_from_execve | ||
534 | CFI_ENDPROC | ||
535 | END(stub_x32_execve) | ||
536 | .align 8 | ||
537 | GLOBAL(stub_x32_execveat) | ||
538 | CFI_STARTPROC | ||
539 | DEFAULT_FRAME 0, 8 | ||
540 | call compat_sys_execveat | ||
541 | jmp return_from_execve | ||
542 | CFI_ENDPROC | ||
543 | END(stub_x32_execveat) | ||
544 | #endif | ||
545 | |||
546 | #ifdef CONFIG_IA32_EMULATION | ||
547 | .align 8 | ||
548 | GLOBAL(stub32_execve) | ||
549 | CFI_STARTPROC | ||
550 | call compat_sys_execve | ||
551 | jmp return_from_execve | ||
552 | CFI_ENDPROC | ||
553 | END(stub32_execve) | ||
554 | .align 8 | ||
555 | GLOBAL(stub32_execveat) | ||
556 | CFI_STARTPROC | ||
557 | call compat_sys_execveat | ||
558 | jmp return_from_execve | ||
559 | CFI_ENDPROC | ||
560 | END(stub32_execveat) | ||
561 | #endif | ||
562 | |||
562 | /* | 563 | /* |
563 | * sigreturn is special because it needs to restore all registers on return. | 564 | * sigreturn is special because it needs to restore all registers on return. |
564 | * This cannot be done with SYSRET, so use the IRET return path instead. | 565 | * This cannot be done with SYSRET, so use the IRET return path instead. |
565 | */ | 566 | */ |
566 | ENTRY(stub_rt_sigreturn) | 567 | ENTRY(stub_rt_sigreturn) |
567 | CFI_STARTPROC | 568 | CFI_STARTPROC |
568 | addq $8, %rsp | 569 | DEFAULT_FRAME 0, 8 |
569 | PARTIAL_FRAME 0 | 570 | /* |
570 | SAVE_REST | 571 | * SAVE_EXTRA_REGS result is not normally needed: |
571 | FIXUP_TOP_OF_STACK %r11 | 572 | * sigreturn overwrites all pt_regs->GPREGS. |
573 | * But sigreturn can fail (!), and there is no easy way to detect that. | ||
574 | * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, | ||
575 | * we SAVE_EXTRA_REGS here. | ||
576 | */ | ||
577 | SAVE_EXTRA_REGS 8 | ||
572 | call sys_rt_sigreturn | 578 | call sys_rt_sigreturn |
573 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | 579 | return_from_stub: |
574 | RESTORE_REST | 580 | addq $8, %rsp |
581 | CFI_ADJUST_CFA_OFFSET -8 | ||
582 | RESTORE_EXTRA_REGS | ||
583 | movq %rax,RAX(%rsp) | ||
575 | jmp int_ret_from_sys_call | 584 | jmp int_ret_from_sys_call |
576 | CFI_ENDPROC | 585 | CFI_ENDPROC |
577 | END(stub_rt_sigreturn) | 586 | END(stub_rt_sigreturn) |
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn) | |||
579 | #ifdef CONFIG_X86_X32_ABI | 588 | #ifdef CONFIG_X86_X32_ABI |
580 | ENTRY(stub_x32_rt_sigreturn) | 589 | ENTRY(stub_x32_rt_sigreturn) |
581 | CFI_STARTPROC | 590 | CFI_STARTPROC |
582 | addq $8, %rsp | 591 | DEFAULT_FRAME 0, 8 |
583 | PARTIAL_FRAME 0 | 592 | SAVE_EXTRA_REGS 8 |
584 | SAVE_REST | ||
585 | FIXUP_TOP_OF_STACK %r11 | ||
586 | call sys32_x32_rt_sigreturn | 593 | call sys32_x32_rt_sigreturn |
587 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | 594 | jmp return_from_stub |
588 | RESTORE_REST | ||
589 | jmp int_ret_from_sys_call | ||
590 | CFI_ENDPROC | 595 | CFI_ENDPROC |
591 | END(stub_x32_rt_sigreturn) | 596 | END(stub_x32_rt_sigreturn) |
597 | #endif | ||
592 | 598 | ||
593 | ENTRY(stub_x32_execve) | 599 | /* |
594 | CFI_STARTPROC | 600 | * A newly forked process directly context switches into this address. |
595 | addq $8, %rsp | 601 | * |
596 | PARTIAL_FRAME 0 | 602 | * rdi: prev task we switched from |
597 | SAVE_REST | 603 | */ |
598 | FIXUP_TOP_OF_STACK %r11 | 604 | ENTRY(ret_from_fork) |
599 | call compat_sys_execve | 605 | DEFAULT_FRAME |
600 | RESTORE_TOP_OF_STACK %r11 | ||
601 | movq %rax,RAX(%rsp) | ||
602 | RESTORE_REST | ||
603 | jmp int_ret_from_sys_call | ||
604 | CFI_ENDPROC | ||
605 | END(stub_x32_execve) | ||
606 | 606 | ||
607 | ENTRY(stub_x32_execveat) | 607 | LOCK ; btr $TIF_FORK,TI_flags(%r8) |
608 | CFI_STARTPROC | 608 | |
609 | addq $8, %rsp | 609 | pushq_cfi $0x0002 |
610 | PARTIAL_FRAME 0 | 610 | popfq_cfi # reset kernel eflags |
611 | SAVE_REST | 611 | |
612 | FIXUP_TOP_OF_STACK %r11 | 612 | call schedule_tail # rdi: 'prev' task parameter |
613 | call compat_sys_execveat | 613 | |
614 | RESTORE_TOP_OF_STACK %r11 | 614 | RESTORE_EXTRA_REGS |
615 | movq %rax,RAX(%rsp) | 615 | |
616 | RESTORE_REST | 616 | testl $3,CS(%rsp) # from kernel_thread? |
617 | |||
618 | /* | ||
619 | * By the time we get here, we have no idea whether our pt_regs, | ||
620 | * ti flags, and ti status came from the 64-bit SYSCALL fast path, | ||
621 | * the slow path, or one of the ia32entry paths. | ||
622 | * Use IRET code path to return, since it can safely handle | ||
623 | * all of the above. | ||
624 | */ | ||
625 | jnz int_ret_from_sys_call | ||
626 | |||
627 | /* We came from kernel_thread */ | ||
628 | /* nb: we depend on RESTORE_EXTRA_REGS above */ | ||
629 | movq %rbp, %rdi | ||
630 | call *%rbx | ||
631 | movl $0, RAX(%rsp) | ||
632 | RESTORE_EXTRA_REGS | ||
617 | jmp int_ret_from_sys_call | 633 | jmp int_ret_from_sys_call |
618 | CFI_ENDPROC | 634 | CFI_ENDPROC |
619 | END(stub_x32_execveat) | 635 | END(ret_from_fork) |
620 | |||
621 | #endif | ||
622 | 636 | ||
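The rewritten ret_from_fork tells a kernel_thread() child apart from a forked user task by testing the low bits of the saved CS. A compilable C sketch of that decision; struct fork_frame is a hypothetical stand-in for pt_regs, and the function pointer and argument model the %rbx/%rbp values restored by RESTORE_EXTRA_REGS.

#include <stdint.h>

struct fork_frame {                     /* hypothetical stand-in for pt_regs */
        uint64_t cs, ax;
};

static void ret_from_fork_sketch(struct fork_frame *regs,
                                 int (*fn)(void *), void *arg)
{
        if ((regs->cs & 3) == 0) {      /* testl $3,CS(%rsp): came from kernel_thread */
                fn(arg);                /* the "call *%rbx" with %rbp as its argument */
                regs->ax = 0;           /* if fn returns, enter user space with ax = 0 */
        }
        /* either way, leave through int_ret_from_sys_call (the IRET path) */
}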
623 | /* | 637 | /* |
624 | * Build the entry stubs and pointer table with some assembler magic. | 638 | * Build the entry stubs with some assembler magic. |
625 | * We pack 7 stubs into a single 32-byte chunk, which will fit in a | 639 | * We pack 1 stub into every 8-byte block. |
626 | * single cache line on all modern x86 implementations. | ||
627 | */ | 640 | */ |
628 | .section .init.rodata,"a" | 641 | .align 8 |
629 | ENTRY(interrupt) | ||
630 | .section .entry.text | ||
631 | .p2align 5 | ||
632 | .p2align CONFIG_X86_L1_CACHE_SHIFT | ||
633 | ENTRY(irq_entries_start) | 642 | ENTRY(irq_entries_start) |
634 | INTR_FRAME | 643 | INTR_FRAME |
635 | vector=FIRST_EXTERNAL_VECTOR | 644 | vector=FIRST_EXTERNAL_VECTOR |
636 | .rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 | 645 | .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) |
637 | .balign 32 | 646 | pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
638 | .rept 7 | 647 | vector=vector+1 |
639 | .if vector < FIRST_SYSTEM_VECTOR | 648 | jmp common_interrupt |
640 | .if vector <> FIRST_EXTERNAL_VECTOR | ||
641 | CFI_ADJUST_CFA_OFFSET -8 | 649 | CFI_ADJUST_CFA_OFFSET -8 |
642 | .endif | 650 | .align 8 |
643 | 1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ | 651 | .endr |
644 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | ||
645 | jmp 2f | ||
646 | .endif | ||
647 | .previous | ||
648 | .quad 1b | ||
649 | .section .entry.text | ||
650 | vector=vector+1 | ||
651 | .endif | ||
652 | .endr | ||
653 | 2: jmp common_interrupt | ||
654 | .endr | ||
655 | CFI_ENDPROC | 652 | CFI_ENDPROC |
656 | END(irq_entries_start) | 653 | END(irq_entries_start) |
657 | 654 | ||
658 | .previous | ||
659 | END(interrupt) | ||
660 | .previous | ||
661 | |||
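Each 8-byte stub generated above pushes $(~vector + 0x80), which always fits in a signed byte; common_interrupt later adds -0x80 so the handler sees ~vector in orig_ax, and the irqinit.c hunk further down in this diff finds stub i at irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR) instead of reading the old 'interrupt' pointer table. A small self-contained round-trip check of that encoding, for illustration (it relies on the usual two's-complement truncation when narrowing):

#include <assert.h>
#include <stdint.h>

static int64_t stub_push_value(unsigned vector)
{
        /* what "pushq $(~vector + 0x80)" leaves on the stack, sign-extended */
        return (int8_t)(~vector + 0x80);
}

static unsigned recover_vector(int64_t pushed)
{
        /* common_interrupt: addq $-0x80,(%rsp); the handler then inverts orig_ax */
        return (uint8_t)~(pushed - 0x80);
}

int main(void)
{
        for (unsigned v = 0x20; v < 0x100; v++)   /* external vector range */
                assert(recover_vector(stub_push_value(v)) == v);
        return 0;
}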
662 | /* | 655 | /* |
663 | * Interrupt entry/exit. | 656 | * Interrupt entry/exit. |
664 | * | 657 | * |
@@ -669,47 +662,45 @@ END(interrupt) | |||
669 | 662 | ||
670 | /* 0(%rsp): ~(interrupt number) */ | 663 | /* 0(%rsp): ~(interrupt number) */ |
671 | .macro interrupt func | 664 | .macro interrupt func |
672 | /* reserve pt_regs for scratch regs and rbp */ | ||
673 | subq $ORIG_RAX-RBP, %rsp | ||
674 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP | ||
675 | cld | 665 | cld |
676 | /* start from rbp in pt_regs and jump over */ | 666 | /* |
677 | movq_cfi rdi, (RDI-RBP) | 667 | * Since nothing in interrupt handling code touches r12...r15 members |
678 | movq_cfi rsi, (RSI-RBP) | 668 | * of "struct pt_regs", and since interrupts can nest, we can save |
679 | movq_cfi rdx, (RDX-RBP) | 669 | * four stack slots and simultaneously provide |
680 | movq_cfi rcx, (RCX-RBP) | 670 | * an unwind-friendly stack layout by saving "truncated" pt_regs |
681 | movq_cfi rax, (RAX-RBP) | 671 | * exactly up to rbp slot, without these members. |
682 | movq_cfi r8, (R8-RBP) | 672 | */ |
683 | movq_cfi r9, (R9-RBP) | 673 | ALLOC_PT_GPREGS_ON_STACK -RBP |
684 | movq_cfi r10, (R10-RBP) | 674 | SAVE_C_REGS -RBP |
685 | movq_cfi r11, (R11-RBP) | 675 | /* this goes to 0(%rsp) for unwinder, not for saving the value: */ |
686 | 676 | SAVE_EXTRA_REGS_RBP -RBP | |
687 | /* Save rbp so that we can unwind from get_irq_regs() */ | ||
688 | movq_cfi rbp, 0 | ||
689 | |||
690 | /* Save previous stack value */ | ||
691 | movq %rsp, %rsi | ||
692 | 677 | ||
693 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | 678 | leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ |
694 | testl $3, CS-RBP(%rsi) | 679 | |
680 | testl $3, CS-RBP(%rsp) | ||
695 | je 1f | 681 | je 1f |
696 | SWAPGS | 682 | SWAPGS |
683 | 1: | ||
697 | /* | 684 | /* |
685 | * Save previous stack pointer, optionally switch to interrupt stack. | ||
698 | * irq_count is used to check if a CPU is already on an interrupt stack | 686 | * irq_count is used to check if a CPU is already on an interrupt stack |
699 | * or not. While this is essentially redundant with preempt_count it is | 687 | * or not. While this is essentially redundant with preempt_count it is |
700 | * a little cheaper to use a separate counter in the PDA (short of | 688 | * a little cheaper to use a separate counter in the PDA (short of |
701 | * moving irq_enter into assembly, which would be too much work) | 689 | * moving irq_enter into assembly, which would be too much work) |
702 | */ | 690 | */ |
703 | 1: incl PER_CPU_VAR(irq_count) | 691 | movq %rsp, %rsi |
692 | incl PER_CPU_VAR(irq_count) | ||
704 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp | 693 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp |
705 | CFI_DEF_CFA_REGISTER rsi | 694 | CFI_DEF_CFA_REGISTER rsi |
706 | |||
707 | /* Store previous stack value */ | ||
708 | pushq %rsi | 695 | pushq %rsi |
696 | /* | ||
697 | * For debugger: | ||
698 | * "CFA (Current Frame Address) is the value on stack + offset" | ||
699 | */ | ||
709 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | 700 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ |
710 | 0x77 /* DW_OP_breg7 */, 0, \ | 701 | 0x77 /* DW_OP_breg7 (rsp) */, 0, \ |
711 | 0x06 /* DW_OP_deref */, \ | 702 | 0x06 /* DW_OP_deref */, \ |
712 | 0x08 /* DW_OP_const1u */, SS+8-RBP, \ | 703 | 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ |
713 | 0x22 /* DW_OP_plus */ | 704 | 0x22 /* DW_OP_plus */ |
714 | /* We entered an interrupt context - irqs are off: */ | 705 | /* We entered an interrupt context - irqs are off: */ |
715 | TRACE_IRQS_OFF | 706 | TRACE_IRQS_OFF |
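The CFI_ESCAPE above hands the unwinder a DWARF expression instead of a fixed register/offset pair, because after the stack switch the previous stack pointer lives in memory at 0(%rsp). Spelled out in C, with the two offsets as assumed illustrative values (the real ones come from the pt_regs offset definitions):

#include <stdint.h>

#define SIZEOF_PTREGS   (21 * 8)        /* assumed size of struct pt_regs */
#define RBP_SLOT        (4 * 8)         /* assumed offset of the rbp slot */

static uint64_t cfa_from_saved_sp(const uint64_t *rsp)
{
        uint64_t old_sp = rsp[0];                       /* DW_OP_breg7 0 ; DW_OP_deref */
        return old_sp + (SIZEOF_PTREGS - RBP_SLOT);     /* DW_OP_const1u ; DW_OP_plus */
}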
@@ -727,7 +718,7 @@ common_interrupt: | |||
727 | ASM_CLAC | 718 | ASM_CLAC |
728 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ | 719 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ |
729 | interrupt do_IRQ | 720 | interrupt do_IRQ |
730 | /* 0(%rsp): old_rsp-ARGOFFSET */ | 721 | /* 0(%rsp): old RSP */ |
731 | ret_from_intr: | 722 | ret_from_intr: |
732 | DISABLE_INTERRUPTS(CLBR_NONE) | 723 | DISABLE_INTERRUPTS(CLBR_NONE) |
733 | TRACE_IRQS_OFF | 724 | TRACE_IRQS_OFF |
@@ -735,19 +726,18 @@ ret_from_intr: | |||
735 | 726 | ||
736 | /* Restore saved previous stack */ | 727 | /* Restore saved previous stack */ |
737 | popq %rsi | 728 | popq %rsi |
738 | CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ | 729 | CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ |
739 | leaq ARGOFFSET-RBP(%rsi), %rsp | 730 | /* return code expects complete pt_regs - adjust rsp accordingly: */ |
731 | leaq -RBP(%rsi),%rsp | ||
740 | CFI_DEF_CFA_REGISTER rsp | 732 | CFI_DEF_CFA_REGISTER rsp |
741 | CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET | 733 | CFI_ADJUST_CFA_OFFSET RBP |
742 | 734 | ||
743 | exit_intr: | 735 | testl $3,CS(%rsp) |
744 | GET_THREAD_INFO(%rcx) | ||
745 | testl $3,CS-ARGOFFSET(%rsp) | ||
746 | je retint_kernel | 736 | je retint_kernel |
747 | |||
748 | /* Interrupt came from user space */ | 737 | /* Interrupt came from user space */ |
738 | |||
739 | GET_THREAD_INFO(%rcx) | ||
749 | /* | 740 | /* |
750 | * Has a correct top of stack, but a partial stack frame | ||
751 | * %rcx: thread info. Interrupts off. | 741 | * %rcx: thread info. Interrupts off. |
752 | */ | 742 | */ |
753 | retint_with_reschedule: | 743 | retint_with_reschedule: |
@@ -766,84 +756,34 @@ retint_swapgs: /* return to user-space */ | |||
766 | DISABLE_INTERRUPTS(CLBR_ANY) | 756 | DISABLE_INTERRUPTS(CLBR_ANY) |
767 | TRACE_IRQS_IRETQ | 757 | TRACE_IRQS_IRETQ |
768 | 758 | ||
769 | /* | ||
770 | * Try to use SYSRET instead of IRET if we're returning to | ||
771 | * a completely clean 64-bit userspace context. | ||
772 | */ | ||
773 | movq (RCX-R11)(%rsp), %rcx | ||
774 | cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ | ||
775 | jne opportunistic_sysret_failed | ||
776 | |||
777 | /* | ||
778 | * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP | ||
779 | * in kernel space. This essentially lets the user take over | ||
780 | * the kernel, since userspace controls RSP. It's not worth | ||
781 | * testing for canonicalness exactly -- this check detects any | ||
782 | * of the 17 high bits set, which is true for non-canonical | ||
783 | * or kernel addresses. (This will pessimize vsyscall=native. | ||
784 | * Big deal.) | ||
785 | * | ||
786 | * If virtual addresses ever become wider, this will need | ||
787 | * to be updated to remain correct on both old and new CPUs. | ||
788 | */ | ||
789 | .ifne __VIRTUAL_MASK_SHIFT - 47 | ||
790 | .error "virtual address width changed -- sysret checks need update" | ||
791 | .endif | ||
792 | shr $__VIRTUAL_MASK_SHIFT, %rcx | ||
793 | jnz opportunistic_sysret_failed | ||
794 | |||
795 | cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ | ||
796 | jne opportunistic_sysret_failed | ||
797 | |||
798 | movq (R11-ARGOFFSET)(%rsp), %r11 | ||
799 | cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ | ||
800 | jne opportunistic_sysret_failed | ||
801 | |||
802 | /* | ||
803 | * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, | ||
804 | * restoring TF results in a trap from userspace immediately after | ||
805 | * SYSRET. This would cause an infinite loop whenever #DB happens | ||
806 | * with register state that satisfies the opportunistic SYSRET | ||
807 | * conditions. For example, single-stepping this user code: | ||
808 | * | ||
809 | * movq $stuck_here,%rcx | ||
810 | * pushfq | ||
811 | * popq %r11 | ||
812 | * stuck_here: | ||
813 | * | ||
814 | * would never get past 'stuck_here'. | ||
815 | */ | ||
816 | testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 | ||
817 | jnz opportunistic_sysret_failed | ||
818 | |||
819 | /* nothing to check for RSP */ | ||
820 | |||
821 | cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ | ||
822 | jne opportunistic_sysret_failed | ||
823 | |||
824 | /* | ||
825 | * We win! This label is here just for ease of understanding | ||
826 | * perf profiles. Nothing jumps here. | ||
827 | */ | ||
828 | irq_return_via_sysret: | ||
829 | CFI_REMEMBER_STATE | ||
830 | RESTORE_ARGS 1,8,1 | ||
831 | movq (RSP-RIP)(%rsp),%rsp | ||
832 | USERGS_SYSRET64 | ||
833 | CFI_RESTORE_STATE | ||
834 | |||
835 | opportunistic_sysret_failed: | ||
836 | SWAPGS | 759 | SWAPGS |
837 | jmp restore_args | 760 | jmp restore_c_regs_and_iret |
838 | 761 | ||
839 | retint_restore_args: /* return to kernel space */ | 762 | /* Returning to kernel space */ |
840 | DISABLE_INTERRUPTS(CLBR_ANY) | 763 | retint_kernel: |
764 | #ifdef CONFIG_PREEMPT | ||
765 | /* Interrupts are off */ | ||
766 | /* Check if we need preemption */ | ||
767 | bt $9,EFLAGS(%rsp) /* interrupts were off? */ | ||
768 | jnc 1f | ||
769 | 0: cmpl $0,PER_CPU_VAR(__preempt_count) | ||
770 | jnz 1f | ||
771 | call preempt_schedule_irq | ||
772 | jmp 0b | ||
773 | 1: | ||
774 | #endif | ||
841 | /* | 775 | /* |
842 | * The iretq could re-enable interrupts: | 776 | * The iretq could re-enable interrupts: |
843 | */ | 777 | */ |
844 | TRACE_IRQS_IRETQ | 778 | TRACE_IRQS_IRETQ |
845 | restore_args: | 779 | |
846 | RESTORE_ARGS 1,8,1 | 780 | /* |
781 | * At this label, code paths which return to kernel and to user, | ||
782 | * which come from interrupts/exception and from syscalls, merge. | ||
783 | */ | ||
784 | restore_c_regs_and_iret: | ||
785 | RESTORE_C_REGS | ||
786 | REMOVE_PT_GPREGS_FROM_STACK 8 | ||
847 | 787 | ||
848 | irq_return: | 788 | irq_return: |
849 | INTERRUPT_RETURN | 789 | INTERRUPT_RETURN |
@@ -914,28 +854,17 @@ retint_signal: | |||
914 | jz retint_swapgs | 854 | jz retint_swapgs |
915 | TRACE_IRQS_ON | 855 | TRACE_IRQS_ON |
916 | ENABLE_INTERRUPTS(CLBR_NONE) | 856 | ENABLE_INTERRUPTS(CLBR_NONE) |
917 | SAVE_REST | 857 | SAVE_EXTRA_REGS |
918 | movq $-1,ORIG_RAX(%rsp) | 858 | movq $-1,ORIG_RAX(%rsp) |
919 | xorl %esi,%esi # oldset | 859 | xorl %esi,%esi # oldset |
920 | movq %rsp,%rdi # &pt_regs | 860 | movq %rsp,%rdi # &pt_regs |
921 | call do_notify_resume | 861 | call do_notify_resume |
922 | RESTORE_REST | 862 | RESTORE_EXTRA_REGS |
923 | DISABLE_INTERRUPTS(CLBR_NONE) | 863 | DISABLE_INTERRUPTS(CLBR_NONE) |
924 | TRACE_IRQS_OFF | 864 | TRACE_IRQS_OFF |
925 | GET_THREAD_INFO(%rcx) | 865 | GET_THREAD_INFO(%rcx) |
926 | jmp retint_with_reschedule | 866 | jmp retint_with_reschedule |
927 | 867 | ||
928 | #ifdef CONFIG_PREEMPT | ||
929 | /* Returning to kernel space. Check if we need preemption */ | ||
930 | /* rcx: threadinfo. interrupts off. */ | ||
931 | ENTRY(retint_kernel) | ||
932 | cmpl $0,PER_CPU_VAR(__preempt_count) | ||
933 | jnz retint_restore_args | ||
934 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | ||
935 | jnc retint_restore_args | ||
936 | call preempt_schedule_irq | ||
937 | jmp exit_intr | ||
938 | #endif | ||
939 | CFI_ENDPROC | 868 | CFI_ENDPROC |
940 | END(common_interrupt) | 869 | END(common_interrupt) |
941 | 870 | ||
@@ -1024,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \ | |||
1024 | /* | 953 | /* |
1025 | * Exception entry points. | 954 | * Exception entry points. |
1026 | */ | 955 | */ |
1027 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) | 956 | #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) |
1028 | 957 | ||
1029 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 | 958 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 |
1030 | ENTRY(\sym) | 959 | ENTRY(\sym) |
@@ -1046,8 +975,7 @@ ENTRY(\sym) | |||
1046 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 975 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1047 | .endif | 976 | .endif |
1048 | 977 | ||
1049 | subq $ORIG_RAX-R15, %rsp | 978 | ALLOC_PT_GPREGS_ON_STACK |
1050 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | ||
1051 | 979 | ||
1052 | .if \paranoid | 980 | .if \paranoid |
1053 | .if \paranoid == 1 | 981 | .if \paranoid == 1 |
@@ -1055,10 +983,11 @@ ENTRY(\sym) | |||
1055 | testl $3, CS(%rsp) /* If coming from userspace, switch */ | 983 | testl $3, CS(%rsp) /* If coming from userspace, switch */ |
1056 | jnz 1f /* stacks. */ | 984 | jnz 1f /* stacks. */ |
1057 | .endif | 985 | .endif |
1058 | call save_paranoid | 986 | call paranoid_entry |
1059 | .else | 987 | .else |
1060 | call error_entry | 988 | call error_entry |
1061 | .endif | 989 | .endif |
990 | /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ | ||
1062 | 991 | ||
1063 | DEFAULT_FRAME 0 | 992 | DEFAULT_FRAME 0 |
1064 | 993 | ||
@@ -1080,19 +1009,20 @@ ENTRY(\sym) | |||
1080 | .endif | 1009 | .endif |
1081 | 1010 | ||
1082 | .if \shift_ist != -1 | 1011 | .if \shift_ist != -1 |
1083 | subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) | 1012 | subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) |
1084 | .endif | 1013 | .endif |
1085 | 1014 | ||
1086 | call \do_sym | 1015 | call \do_sym |
1087 | 1016 | ||
1088 | .if \shift_ist != -1 | 1017 | .if \shift_ist != -1 |
1089 | addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) | 1018 | addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) |
1090 | .endif | 1019 | .endif |
1091 | 1020 | ||
1021 | /* these procedures expect "no swapgs" flag in ebx */ | ||
1092 | .if \paranoid | 1022 | .if \paranoid |
1093 | jmp paranoid_exit /* %ebx: no swapgs flag */ | 1023 | jmp paranoid_exit |
1094 | .else | 1024 | .else |
1095 | jmp error_exit /* %ebx: no swapgs flag */ | 1025 | jmp error_exit |
1096 | .endif | 1026 | .endif |
1097 | 1027 | ||
1098 | .if \paranoid == 1 | 1028 | .if \paranoid == 1 |
@@ -1296,7 +1226,9 @@ ENTRY(xen_failsafe_callback) | |||
1296 | addq $0x30,%rsp | 1226 | addq $0x30,%rsp |
1297 | CFI_ADJUST_CFA_OFFSET -0x30 | 1227 | CFI_ADJUST_CFA_OFFSET -0x30 |
1298 | pushq_cfi $-1 /* orig_ax = -1 => not a system call */ | 1228 | pushq_cfi $-1 /* orig_ax = -1 => not a system call */ |
1299 | SAVE_ALL | 1229 | ALLOC_PT_GPREGS_ON_STACK |
1230 | SAVE_C_REGS | ||
1231 | SAVE_EXTRA_REGS | ||
1300 | jmp error_exit | 1232 | jmp error_exit |
1301 | CFI_ENDPROC | 1233 | CFI_ENDPROC |
1302 | END(xen_failsafe_callback) | 1234 | END(xen_failsafe_callback) |
@@ -1328,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 | |||
1328 | idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) | 1260 | idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) |
1329 | #endif | 1261 | #endif |
1330 | 1262 | ||
1331 | /* | 1263 | /* |
1332 | * "Paranoid" exit path from exception stack. This is invoked | 1264 | * Save all registers in pt_regs, and switch gs if needed. |
1333 | * only on return from non-NMI IST interrupts that came | 1265 | * Use slow, but surefire "are we in kernel?" check. |
1334 | * from kernel space. | 1266 | * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
1335 | * | 1267 | */ |
1336 | * We may be returning to very strange contexts (e.g. very early | 1268 | ENTRY(paranoid_entry) |
1337 | * in syscall entry), so checking for preemption here would | 1269 | XCPT_FRAME 1 15*8 |
1338 | * be complicated. Fortunately, there's no good reason | 1270 | cld |
1339 | * to try to handle preemption here. | 1271 | SAVE_C_REGS 8 |
1340 | */ | 1272 | SAVE_EXTRA_REGS 8 |
1273 | movl $1,%ebx | ||
1274 | movl $MSR_GS_BASE,%ecx | ||
1275 | rdmsr | ||
1276 | testl %edx,%edx | ||
1277 | js 1f /* negative -> in kernel */ | ||
1278 | SWAPGS | ||
1279 | xorl %ebx,%ebx | ||
1280 | 1: ret | ||
1281 | CFI_ENDPROC | ||
1282 | END(paranoid_entry) | ||
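paranoid_entry decides whether it needs SWAPGS by reading MSR_GS_BASE and testing its sign: once the kernel GS base is loaded it is a kernel address with the top bit set, hence the "slow, but surefire" wording in the comment above. A one-line C restatement of the test:

#include <stdbool.h>
#include <stdint.h>

static bool gs_base_is_kernel(uint64_t gs_base_msr)
{
        /* "testl %edx,%edx; js 1f": the sign of the high half is the sign of the MSR */
        return (int64_t)gs_base_msr < 0;        /* negative -> kernel GS already loaded, skip SWAPGS */
}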
1341 | 1283 | ||
1342 | /* ebx: no swapgs flag */ | 1284 | /* |
1285 | * "Paranoid" exit path from exception stack. This is invoked | ||
1286 | * only on return from non-NMI IST interrupts that came | ||
1287 | * from kernel space. | ||
1288 | * | ||
1289 | * We may be returning to very strange contexts (e.g. very early | ||
1290 | * in syscall entry), so checking for preemption here would | ||
1291 | * be complicated. Fortunately, we there's no good reason | ||
1292 | * to try to handle preemption here. | ||
1293 | */ | ||
1294 | /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ | ||
1343 | ENTRY(paranoid_exit) | 1295 | ENTRY(paranoid_exit) |
1344 | DEFAULT_FRAME | 1296 | DEFAULT_FRAME |
1345 | DISABLE_INTERRUPTS(CLBR_NONE) | 1297 | DISABLE_INTERRUPTS(CLBR_NONE) |
1346 | TRACE_IRQS_OFF_DEBUG | 1298 | TRACE_IRQS_OFF_DEBUG |
1347 | testl %ebx,%ebx /* swapgs needed? */ | 1299 | testl %ebx,%ebx /* swapgs needed? */ |
1348 | jnz paranoid_restore | 1300 | jnz paranoid_exit_no_swapgs |
1349 | TRACE_IRQS_IRETQ 0 | 1301 | TRACE_IRQS_IRETQ |
1350 | SWAPGS_UNSAFE_STACK | 1302 | SWAPGS_UNSAFE_STACK |
1351 | RESTORE_ALL 8 | 1303 | jmp paranoid_exit_restore |
1352 | INTERRUPT_RETURN | 1304 | paranoid_exit_no_swapgs: |
1353 | paranoid_restore: | 1305 | TRACE_IRQS_IRETQ_DEBUG |
1354 | TRACE_IRQS_IRETQ_DEBUG 0 | 1306 | paranoid_exit_restore: |
1355 | RESTORE_ALL 8 | 1307 | RESTORE_EXTRA_REGS |
1308 | RESTORE_C_REGS | ||
1309 | REMOVE_PT_GPREGS_FROM_STACK 8 | ||
1356 | INTERRUPT_RETURN | 1310 | INTERRUPT_RETURN |
1357 | CFI_ENDPROC | 1311 | CFI_ENDPROC |
1358 | END(paranoid_exit) | 1312 | END(paranoid_exit) |
1359 | 1313 | ||
1360 | /* | 1314 | /* |
1361 | * Exception entry point. This expects an error code/orig_rax on the stack. | 1315 | * Save all registers in pt_regs, and switch gs if needed. |
1362 | * returns in "no swapgs flag" in %ebx. | 1316 | * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
1363 | */ | 1317 | */ |
1364 | ENTRY(error_entry) | 1318 | ENTRY(error_entry) |
1365 | XCPT_FRAME | 1319 | XCPT_FRAME 1 15*8 |
1366 | CFI_ADJUST_CFA_OFFSET 15*8 | ||
1367 | /* oldrax contains error code */ | ||
1368 | cld | 1320 | cld |
1369 | movq %rdi, RDI+8(%rsp) | 1321 | SAVE_C_REGS 8 |
1370 | movq %rsi, RSI+8(%rsp) | 1322 | SAVE_EXTRA_REGS 8 |
1371 | movq %rdx, RDX+8(%rsp) | ||
1372 | movq %rcx, RCX+8(%rsp) | ||
1373 | movq %rax, RAX+8(%rsp) | ||
1374 | movq %r8, R8+8(%rsp) | ||
1375 | movq %r9, R9+8(%rsp) | ||
1376 | movq %r10, R10+8(%rsp) | ||
1377 | movq %r11, R11+8(%rsp) | ||
1378 | movq_cfi rbx, RBX+8 | ||
1379 | movq %rbp, RBP+8(%rsp) | ||
1380 | movq %r12, R12+8(%rsp) | ||
1381 | movq %r13, R13+8(%rsp) | ||
1382 | movq %r14, R14+8(%rsp) | ||
1383 | movq %r15, R15+8(%rsp) | ||
1384 | xorl %ebx,%ebx | 1323 | xorl %ebx,%ebx |
1385 | testl $3,CS+8(%rsp) | 1324 | testl $3,CS+8(%rsp) |
1386 | je error_kernelspace | 1325 | je error_kernelspace |
@@ -1390,12 +1329,12 @@ error_sti: | |||
1390 | TRACE_IRQS_OFF | 1329 | TRACE_IRQS_OFF |
1391 | ret | 1330 | ret |
1392 | 1331 | ||
1393 | /* | 1332 | /* |
1394 | * There are two places in the kernel that can potentially fault with | 1333 | * There are two places in the kernel that can potentially fault with |
1395 | * usergs. Handle them here. B stepping K8s sometimes report a | 1334 | * usergs. Handle them here. B stepping K8s sometimes report a |
1396 | * truncated RIP for IRET exceptions returning to compat mode. Check | 1335 | * truncated RIP for IRET exceptions returning to compat mode. Check |
1397 | * for these here too. | 1336 | * for these here too. |
1398 | */ | 1337 | */ |
1399 | error_kernelspace: | 1338 | error_kernelspace: |
1400 | CFI_REL_OFFSET rcx, RCX+8 | 1339 | CFI_REL_OFFSET rcx, RCX+8 |
1401 | incl %ebx | 1340 | incl %ebx |
@@ -1425,11 +1364,11 @@ error_bad_iret: | |||
1425 | END(error_entry) | 1364 | END(error_entry) |
1426 | 1365 | ||
1427 | 1366 | ||
1428 | /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ | 1367 | /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ |
1429 | ENTRY(error_exit) | 1368 | ENTRY(error_exit) |
1430 | DEFAULT_FRAME | 1369 | DEFAULT_FRAME |
1431 | movl %ebx,%eax | 1370 | movl %ebx,%eax |
1432 | RESTORE_REST | 1371 | RESTORE_EXTRA_REGS |
1433 | DISABLE_INTERRUPTS(CLBR_NONE) | 1372 | DISABLE_INTERRUPTS(CLBR_NONE) |
1434 | TRACE_IRQS_OFF | 1373 | TRACE_IRQS_OFF |
1435 | GET_THREAD_INFO(%rcx) | 1374 | GET_THREAD_INFO(%rcx) |
@@ -1444,19 +1383,7 @@ ENTRY(error_exit) | |||
1444 | CFI_ENDPROC | 1383 | CFI_ENDPROC |
1445 | END(error_exit) | 1384 | END(error_exit) |
1446 | 1385 | ||
1447 | /* | 1386 | /* Runs on exception stack */ |
1448 | * Test if a given stack is an NMI stack or not. | ||
1449 | */ | ||
1450 | .macro test_in_nmi reg stack nmi_ret normal_ret | ||
1451 | cmpq %\reg, \stack | ||
1452 | ja \normal_ret | ||
1453 | subq $EXCEPTION_STKSZ, %\reg | ||
1454 | cmpq %\reg, \stack | ||
1455 | jb \normal_ret | ||
1456 | jmp \nmi_ret | ||
1457 | .endm | ||
1458 | |||
1459 | /* runs on exception stack */ | ||
1460 | ENTRY(nmi) | 1387 | ENTRY(nmi) |
1461 | INTR_FRAME | 1388 | INTR_FRAME |
1462 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1389 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
@@ -1492,7 +1419,7 @@ ENTRY(nmi) | |||
1492 | * NMI. | 1419 | * NMI. |
1493 | */ | 1420 | */ |
1494 | 1421 | ||
1495 | /* Use %rdx as out temp variable throughout */ | 1422 | /* Use %rdx as our temp variable throughout */ |
1496 | pushq_cfi %rdx | 1423 | pushq_cfi %rdx |
1497 | CFI_REL_OFFSET rdx, 0 | 1424 | CFI_REL_OFFSET rdx, 0 |
1498 | 1425 | ||
@@ -1517,8 +1444,17 @@ ENTRY(nmi) | |||
1517 | * We check the variable because the first NMI could be in a | 1444 | * We check the variable because the first NMI could be in a |
1518 | * breakpoint routine using a breakpoint stack. | 1445 | * breakpoint routine using a breakpoint stack. |
1519 | */ | 1446 | */ |
1520 | lea 6*8(%rsp), %rdx | 1447 | lea 6*8(%rsp), %rdx |
1521 | test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi | 1448 | /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ |
1449 | cmpq %rdx, 4*8(%rsp) | ||
1450 | /* If the stack pointer is above the NMI stack, this is a normal NMI */ | ||
1451 | ja first_nmi | ||
1452 | subq $EXCEPTION_STKSZ, %rdx | ||
1453 | cmpq %rdx, 4*8(%rsp) | ||
1454 | /* If it is below the NMI stack, it is a normal NMI */ | ||
1455 | jb first_nmi | ||
1456 | /* Ah, it is within the NMI stack, treat it as nested */ | ||
1457 | |||
1522 | CFI_REMEMBER_STATE | 1458 | CFI_REMEMBER_STATE |
1523 | 1459 | ||
1524 | nested_nmi: | 1460 | nested_nmi: |
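The open-coded replacement for the old test_in_nmi macro classifies the interrupted stack pointer against the NMI stack window. A compilable sketch of the same range test; the EXCEPTION_STKSZ value is an assumption for illustration:

#include <stdbool.h>
#include <stdint.h>

#define EXCEPTION_STKSZ 4096    /* assumed exception-stack size */

static bool nmi_is_nested(uint64_t interrupted_sp, uint64_t nmi_stack_top)
{
        if (interrupted_sp > nmi_stack_top)                     /* "ja first_nmi": above the NMI stack */
                return false;
        if (interrupted_sp < nmi_stack_top - EXCEPTION_STKSZ)   /* "jb first_nmi": below it */
                return false;
        return true;                                            /* inside the NMI stack: nested NMI */
}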
@@ -1611,7 +1547,7 @@ first_nmi: | |||
1611 | .rept 5 | 1547 | .rept 5 |
1612 | pushq_cfi 11*8(%rsp) | 1548 | pushq_cfi 11*8(%rsp) |
1613 | .endr | 1549 | .endr |
1614 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1550 | CFI_DEF_CFA_OFFSET 5*8 |
1615 | 1551 | ||
1616 | /* Everything up to here is safe from nested NMIs */ | 1552 | /* Everything up to here is safe from nested NMIs */ |
1617 | 1553 | ||
@@ -1639,7 +1575,7 @@ repeat_nmi: | |||
1639 | pushq_cfi -6*8(%rsp) | 1575 | pushq_cfi -6*8(%rsp) |
1640 | .endr | 1576 | .endr |
1641 | subq $(5*8), %rsp | 1577 | subq $(5*8), %rsp |
1642 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1578 | CFI_DEF_CFA_OFFSET 5*8 |
1643 | end_repeat_nmi: | 1579 | end_repeat_nmi: |
1644 | 1580 | ||
1645 | /* | 1581 | /* |
@@ -1648,16 +1584,16 @@ end_repeat_nmi: | |||
1648 | * so that we repeat another NMI. | 1584 | * so that we repeat another NMI. |
1649 | */ | 1585 | */ |
1650 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1586 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1651 | subq $ORIG_RAX-R15, %rsp | 1587 | ALLOC_PT_GPREGS_ON_STACK |
1652 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1588 | |
1653 | /* | 1589 | /* |
1654 | * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit | 1590 | * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
1655 | * as we should not be calling schedule in NMI context. | 1591 | * as we should not be calling schedule in NMI context. |
1656 | * Even with normal interrupts enabled. An NMI should not be | 1592 | * Even with normal interrupts enabled. An NMI should not be |
1657 | * setting NEED_RESCHED or anything that normal interrupts and | 1593 | * setting NEED_RESCHED or anything that normal interrupts and |
1658 | * exceptions might do. | 1594 | * exceptions might do. |
1659 | */ | 1595 | */ |
1660 | call save_paranoid | 1596 | call paranoid_entry |
1661 | DEFAULT_FRAME 0 | 1597 | DEFAULT_FRAME 0 |
1662 | 1598 | ||
1663 | /* | 1599 | /* |
@@ -1688,8 +1624,10 @@ end_repeat_nmi: | |||
1688 | nmi_swapgs: | 1624 | nmi_swapgs: |
1689 | SWAPGS_UNSAFE_STACK | 1625 | SWAPGS_UNSAFE_STACK |
1690 | nmi_restore: | 1626 | nmi_restore: |
1627 | RESTORE_EXTRA_REGS | ||
1628 | RESTORE_C_REGS | ||
1691 | /* Pop the extra iret frame at once */ | 1629 | /* Pop the extra iret frame at once */ |
1692 | RESTORE_ALL 6*8 | 1630 | REMOVE_PT_GPREGS_FROM_STACK 6*8 |
1693 | 1631 | ||
1694 | /* Clear the NMI executing stack variable */ | 1632 | /* Clear the NMI executing stack variable */ |
1695 | movq $0, 5*8(%rsp) | 1633 | movq $0, 5*8(%rsp) |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42d6f0c..d031bad9e07e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <asm/cpufeature.h> | 22 | #include <asm/cpufeature.h> |
23 | #include <asm/percpu.h> | 23 | #include <asm/percpu.h> |
24 | #include <asm/nops.h> | 24 | #include <asm/nops.h> |
25 | #include <asm/bootparam.h> | ||
25 | 26 | ||
26 | /* Physical address */ | 27 | /* Physical address */ |
27 | #define pa(X) ((X) - __PAGE_OFFSET) | 28 | #define pa(X) ((X) - __PAGE_OFFSET) |
@@ -90,7 +91,7 @@ ENTRY(startup_32) | |||
90 | 91 | ||
91 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking | 92 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking |
92 | us to not reload segments */ | 93 | us to not reload segments */ |
93 | testb $(1<<6), BP_loadflags(%esi) | 94 | testb $KEEP_SEGMENTS, BP_loadflags(%esi) |
94 | jnz 2f | 95 | jnz 2f |
95 | 96 | ||
96 | /* | 97 | /* |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6fd514d9f69a..ae6588b301c2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | 2 | * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit |
3 | * | 3 | * |
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | 4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE |
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | 5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
@@ -56,7 +56,7 @@ startup_64: | |||
56 | * %rsi holds a physical pointer to real_mode_data. | 56 | * %rsi holds a physical pointer to real_mode_data. |
57 | * | 57 | * |
58 | * We come here either directly from a 64bit bootloader, or from | 58 | * We come here either directly from a 64bit bootloader, or from |
59 | * arch/x86_64/boot/compressed/head.S. | 59 | * arch/x86/boot/compressed/head_64.S. |
60 | * | 60 | * |
61 | * We only come here initially at boot nothing else comes here. | 61 | * We only come here initially at boot nothing else comes here. |
62 | * | 62 | * |
@@ -146,7 +146,7 @@ startup_64: | |||
146 | leaq level2_kernel_pgt(%rip), %rdi | 146 | leaq level2_kernel_pgt(%rip), %rdi |
147 | leaq 4096(%rdi), %r8 | 147 | leaq 4096(%rdi), %r8 |
148 | /* See if it is a valid page table entry */ | 148 | /* See if it is a valid page table entry */ |
149 | 1: testq $1, 0(%rdi) | 149 | 1: testb $1, 0(%rdi) |
150 | jz 2f | 150 | jz 2f |
151 | addq %rbp, 0(%rdi) | 151 | addq %rbp, 0(%rdi) |
152 | /* Go to the next page */ | 152 | /* Go to the next page */ |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5651fce0b71..29c740deafec 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void) | |||
68 | static inline bool interrupted_user_mode(void) | 68 | static inline bool interrupted_user_mode(void) |
69 | { | 69 | { |
70 | struct pt_regs *regs = get_irq_regs(); | 70 | struct pt_regs *regs = get_irq_regs(); |
71 | return regs && user_mode_vm(regs); | 71 | return regs && user_mode(regs); |
72 | } | 72 | } |
73 | 73 | ||
74 | /* | 74 | /* |
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66ea35f..37dae792dbbe 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c | |||
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |||
54 | * because the ->io_bitmap_max value must match the bitmap | 54 | * because the ->io_bitmap_max value must match the bitmap |
55 | * contents: | 55 | * contents: |
56 | */ | 56 | */ |
57 | tss = &per_cpu(init_tss, get_cpu()); | 57 | tss = &per_cpu(cpu_tss, get_cpu()); |
58 | 58 | ||
59 | if (turn_on) | 59 | if (turn_on) |
60 | bitmap_clear(t->io_bitmap_ptr, from, num); | 60 | bitmap_clear(t->io_bitmap_ptr, from, num); |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 28d28f5eb8f4..f9fd86a7fcc7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) | |||
165 | if (unlikely(!desc)) | 165 | if (unlikely(!desc)) |
166 | return false; | 166 | return false; |
167 | 167 | ||
168 | if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { | 168 | if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { |
169 | if (unlikely(overflow)) | 169 | if (unlikely(overflow)) |
170 | print_stack_overflow(); | 170 | print_stack_overflow(); |
171 | desc->handle_irq(irq, desc); | 171 | desc->handle_irq(irq, desc); |
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e4b503d5558c..394e643d7830 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c | |||
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) | |||
44 | u64 estack_top, estack_bottom; | 44 | u64 estack_top, estack_bottom; |
45 | u64 curbase = (u64)task_stack_page(current); | 45 | u64 curbase = (u64)task_stack_page(current); |
46 | 46 | ||
47 | if (user_mode_vm(regs)) | 47 | if (user_mode(regs)) |
48 | return; | 48 | return; |
49 | 49 | ||
50 | if (regs->sp >= curbase + sizeof(struct thread_info) + | 50 | if (regs->sp >= curbase + sizeof(struct thread_info) + |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 70e181ea1eac..cd10a6437264 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void) | |||
178 | #endif | 178 | #endif |
179 | for_each_clear_bit_from(i, used_vectors, first_system_vector) { | 179 | for_each_clear_bit_from(i, used_vectors, first_system_vector) { |
180 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ | 180 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ |
181 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 181 | set_intr_gate(i, irq_entries_start + |
182 | 8 * (i - FIRST_EXTERNAL_VECTOR)); | ||
182 | } | 183 | } |
183 | #ifdef CONFIG_X86_LOCAL_APIC | 184 | #ifdef CONFIG_X86_LOCAL_APIC |
184 | for_each_clear_bit_from(i, used_vectors, NR_VECTORS) | 185 | for_each_clear_bit_from(i, used_vectors, NR_VECTORS) |
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 25ecd56cefa8..d6178d9791db 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) | |||
126 | #ifdef CONFIG_X86_32 | 126 | #ifdef CONFIG_X86_32 |
127 | switch (regno) { | 127 | switch (regno) { |
128 | case GDB_SS: | 128 | case GDB_SS: |
129 | if (!user_mode_vm(regs)) | 129 | if (!user_mode(regs)) |
130 | *(unsigned long *)mem = __KERNEL_DS; | 130 | *(unsigned long *)mem = __KERNEL_DS; |
131 | break; | 131 | break; |
132 | case GDB_SP: | 132 | case GDB_SP: |
133 | if (!user_mode_vm(regs)) | 133 | if (!user_mode(regs)) |
134 | *(unsigned long *)mem = kernel_stack_pointer(regs); | 134 | *(unsigned long *)mem = kernel_stack_pointer(regs); |
135 | break; | 135 | break; |
136 | case GDB_GS: | 136 | case GDB_GS: |
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4e3d5a9621fe..24d079604fd5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c | |||
@@ -602,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs) | |||
602 | struct kprobe *p; | 602 | struct kprobe *p; |
603 | struct kprobe_ctlblk *kcb; | 603 | struct kprobe_ctlblk *kcb; |
604 | 604 | ||
605 | if (user_mode_vm(regs)) | 605 | if (user_mode(regs)) |
606 | return 0; | 606 | return 0; |
607 | 607 | ||
608 | addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); | 608 | addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); |
@@ -1007,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, | |||
1007 | struct die_args *args = data; | 1007 | struct die_args *args = data; |
1008 | int ret = NOTIFY_DONE; | 1008 | int ret = NOTIFY_DONE; |
1009 | 1009 | ||
1010 | if (args->regs && user_mode_vm(args->regs)) | 1010 | if (args->regs && user_mode(args->regs)) |
1011 | return ret; | 1011 | return ret; |
1012 | 1012 | ||
1013 | if (val == DIE_GPF) { | 1013 | if (val == DIE_GPF) { |
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d1ac80b72c72..005c03e93fc5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c | |||
@@ -33,6 +33,7 @@ | |||
33 | 33 | ||
34 | #include <asm/page.h> | 34 | #include <asm/page.h> |
35 | #include <asm/pgtable.h> | 35 | #include <asm/pgtable.h> |
36 | #include <asm/setup.h> | ||
36 | 37 | ||
37 | #if 0 | 38 | #if 0 |
38 | #define DEBUGP(fmt, ...) \ | 39 | #define DEBUGP(fmt, ...) \ |
@@ -47,21 +48,13 @@ do { \ | |||
47 | 48 | ||
48 | #ifdef CONFIG_RANDOMIZE_BASE | 49 | #ifdef CONFIG_RANDOMIZE_BASE |
49 | static unsigned long module_load_offset; | 50 | static unsigned long module_load_offset; |
50 | static int randomize_modules = 1; | ||
51 | 51 | ||
52 | /* Mutex protects the module_load_offset. */ | 52 | /* Mutex protects the module_load_offset. */ |
53 | static DEFINE_MUTEX(module_kaslr_mutex); | 53 | static DEFINE_MUTEX(module_kaslr_mutex); |
54 | 54 | ||
55 | static int __init parse_nokaslr(char *p) | ||
56 | { | ||
57 | randomize_modules = 0; | ||
58 | return 0; | ||
59 | } | ||
60 | early_param("nokaslr", parse_nokaslr); | ||
61 | |||
62 | static unsigned long int get_module_load_offset(void) | 55 | static unsigned long int get_module_load_offset(void) |
63 | { | 56 | { |
64 | if (randomize_modules) { | 57 | if (kaslr_enabled()) { |
65 | mutex_lock(&module_kaslr_mutex); | 58 | mutex_lock(&module_kaslr_mutex); |
66 | /* | 59 | /* |
67 | * Calculate the module_load_offset the first time this | 60 | * Calculate the module_load_offset the first time this |
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 781861cc5ee8..da8cb987b973 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c | |||
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
131 | } | 131 | } |
132 | 132 | ||
133 | /* | 133 | /* |
134 | * RIP, flags, and the argument registers are usually saved. | 134 | * These registers are always saved on 64-bit syscall entry. |
135 | * orig_ax is probably okay, too. | 135 | * On 32-bit entry points, they are saved too except r8..r11. |
136 | */ | 136 | */ |
137 | regs_user_copy->ip = user_regs->ip; | 137 | regs_user_copy->ip = user_regs->ip; |
138 | regs_user_copy->ax = user_regs->ax; | ||
138 | regs_user_copy->cx = user_regs->cx; | 139 | regs_user_copy->cx = user_regs->cx; |
139 | regs_user_copy->dx = user_regs->dx; | 140 | regs_user_copy->dx = user_regs->dx; |
140 | regs_user_copy->si = user_regs->si; | 141 | regs_user_copy->si = user_regs->si; |
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
145 | regs_user_copy->r11 = user_regs->r11; | 146 | regs_user_copy->r11 = user_regs->r11; |
146 | regs_user_copy->orig_ax = user_regs->orig_ax; | 147 | regs_user_copy->orig_ax = user_regs->orig_ax; |
147 | regs_user_copy->flags = user_regs->flags; | 148 | regs_user_copy->flags = user_regs->flags; |
149 | regs_user_copy->sp = user_regs->sp; | ||
150 | regs_user_copy->cs = user_regs->cs; | ||
151 | regs_user_copy->ss = user_regs->ss; | ||
148 | 152 | ||
149 | /* | 153 | /* |
150 | * Don't even try to report the "rest" regs. | 154 | * Most system calls don't save these registers, don't report them. |
151 | */ | 155 | */ |
152 | regs_user_copy->bx = -1; | 156 | regs_user_copy->bx = -1; |
153 | regs_user_copy->bp = -1; | 157 | regs_user_copy->bp = -1; |
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
158 | 162 | ||
159 | /* | 163 | /* |
160 | * For this to be at all useful, we need a reasonable guess for | 164 | * For this to be at all useful, we need a reasonable guess for |
161 | * sp and the ABI. Be careful: we're in NMI context, and we're | 165 | * the ABI. Be careful: we're in NMI context, and we're |
162 | * considering current to be the current task, so we should | 166 | * considering current to be the current task, so we should |
163 | * be careful not to look at any other percpu variables that might | 167 | * be careful not to look at any other percpu variables that might |
164 | * change during context switches. | 168 | * change during context switches. |
165 | */ | 169 | */ |
166 | if (IS_ENABLED(CONFIG_IA32_EMULATION) && | 170 | regs_user->abi = user_64bit_mode(user_regs) ? |
167 | task_thread_info(current)->status & TS_COMPAT) { | 171 | PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; |
168 | /* Easy case: we're in a compat syscall. */ | ||
169 | regs_user->abi = PERF_SAMPLE_REGS_ABI_32; | ||
170 | regs_user_copy->sp = user_regs->sp; | ||
171 | regs_user_copy->cs = user_regs->cs; | ||
172 | regs_user_copy->ss = user_regs->ss; | ||
173 | } else if (user_regs->orig_ax != -1) { | ||
174 | /* | ||
175 | * We're probably in a 64-bit syscall. | ||
176 | * Warning: this code is severely racy. At least it's better | ||
177 | * than just blindly copying user_regs. | ||
178 | */ | ||
179 | regs_user->abi = PERF_SAMPLE_REGS_ABI_64; | ||
180 | regs_user_copy->sp = this_cpu_read(old_rsp); | ||
181 | regs_user_copy->cs = __USER_CS; | ||
182 | regs_user_copy->ss = __USER_DS; | ||
183 | regs_user_copy->cx = -1; /* usually contains garbage */ | ||
184 | } else { | ||
185 | /* We're probably in an interrupt or exception. */ | ||
186 | regs_user->abi = user_64bit_mode(user_regs) ? | ||
187 | PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; | ||
188 | regs_user_copy->sp = user_regs->sp; | ||
189 | regs_user_copy->cs = user_regs->cs; | ||
190 | regs_user_copy->ss = user_regs->ss; | ||
191 | } | ||
192 | 172 | ||
193 | regs_user->regs = regs_user_copy; | 173 | regs_user->regs = regs_user_copy; |
194 | } | 174 | } |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7af7b6478637..0c8992dbead5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -38,7 +38,26 @@ | |||
38 | * section. Since TSS's are completely CPU-local, we want them | 38 | * section. Since TSS's are completely CPU-local, we want them |
39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | 39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
40 | */ | 40 | */ |
41 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | 41 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { |
42 | .x86_tss = { | ||
43 | .sp0 = TOP_OF_INIT_STACK, | ||
44 | #ifdef CONFIG_X86_32 | ||
45 | .ss0 = __KERNEL_DS, | ||
46 | .ss1 = __KERNEL_CS, | ||
47 | .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, | ||
48 | #endif | ||
49 | }, | ||
50 | #ifdef CONFIG_X86_32 | ||
51 | /* | ||
52 | * Note that the .io_bitmap member must be extra-big. This is because | ||
53 | * the CPU will access an additional byte beyond the end of the IO | ||
54 | * permission bitmap. The extra byte must be all 1 bits, and must | ||
55 | * be within the limit. | ||
56 | */ | ||
57 | .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, | ||
58 | #endif | ||
59 | }; | ||
60 | EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); | ||
42 | 61 | ||
43 | #ifdef CONFIG_X86_64 | 62 | #ifdef CONFIG_X86_64 |
44 | static DEFINE_PER_CPU(unsigned char, is_idle); | 63 | static DEFINE_PER_CPU(unsigned char, is_idle); |
@@ -110,7 +129,7 @@ void exit_thread(void) | |||
110 | unsigned long *bp = t->io_bitmap_ptr; | 129 | unsigned long *bp = t->io_bitmap_ptr; |
111 | 130 | ||
112 | if (bp) { | 131 | if (bp) { |
113 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | 132 | struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); |
114 | 133 | ||
115 | t->io_bitmap_ptr = NULL; | 134 | t->io_bitmap_ptr = NULL; |
116 | clear_thread_flag(TIF_IO_BITMAP); | 135 | clear_thread_flag(TIF_IO_BITMAP); |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f99cb5a..8ed2106b06da 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all) | |||
73 | unsigned long sp; | 73 | unsigned long sp; |
74 | unsigned short ss, gs; | 74 | unsigned short ss, gs; |
75 | 75 | ||
76 | if (user_mode_vm(regs)) { | 76 | if (user_mode(regs)) { |
77 | sp = regs->sp; | 77 | sp = regs->sp; |
78 | ss = regs->ss & 0xffff; | 78 | ss = regs->ss & 0xffff; |
79 | gs = get_user_gs(regs); | 79 | gs = get_user_gs(regs); |
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) | |||
206 | regs->ip = new_ip; | 206 | regs->ip = new_ip; |
207 | regs->sp = new_sp; | 207 | regs->sp = new_sp; |
208 | regs->flags = X86_EFLAGS_IF; | 208 | regs->flags = X86_EFLAGS_IF; |
209 | /* | 209 | force_iret(); |
210 | * force it to the iret return path by making it look as if there was | ||
211 | * some work pending. | ||
212 | */ | ||
213 | set_thread_flag(TIF_NOTIFY_RESUME); | ||
214 | } | 210 | } |
215 | EXPORT_SYMBOL_GPL(start_thread); | 211 | EXPORT_SYMBOL_GPL(start_thread); |
216 | 212 | ||
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
248 | struct thread_struct *prev = &prev_p->thread, | 244 | struct thread_struct *prev = &prev_p->thread, |
249 | *next = &next_p->thread; | 245 | *next = &next_p->thread; |
250 | int cpu = smp_processor_id(); | 246 | int cpu = smp_processor_id(); |
251 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 247 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); |
252 | fpu_switch_t fpu; | 248 | fpu_switch_t fpu; |
253 | 249 | ||
254 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 250 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
256 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); | 252 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
257 | 253 | ||
258 | /* | 254 | /* |
259 | * Reload esp0. | ||
260 | */ | ||
261 | load_sp0(tss, next); | ||
262 | |||
263 | /* | ||
264 | * Save away %gs. No need to save %fs, as it was saved on the | 255 | * Save away %gs. No need to save %fs, as it was saved on the |
265 | * stack on entry. No need to save %es and %ds, as those are | 256 | * stack on entry. No need to save %es and %ds, as those are |
266 | * always kernel segments while inside the kernel. Doing this | 257 | * always kernel segments while inside the kernel. Doing this |
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
310 | */ | 301 | */ |
311 | arch_end_context_switch(next_p); | 302 | arch_end_context_switch(next_p); |
312 | 303 | ||
304 | /* | ||
305 | * Reload esp0, kernel_stack, and current_top_of_stack. This changes | ||
306 | * current_thread_info(). | ||
307 | */ | ||
308 | load_sp0(tss, next); | ||
313 | this_cpu_write(kernel_stack, | 309 | this_cpu_write(kernel_stack, |
314 | (unsigned long)task_stack_page(next_p) + | 310 | (unsigned long)task_stack_page(next_p) + |
315 | THREAD_SIZE - KERNEL_STACK_OFFSET); | 311 | THREAD_SIZE); |
312 | this_cpu_write(cpu_current_top_of_stack, | ||
313 | (unsigned long)task_stack_page(next_p) + | ||
314 | THREAD_SIZE); | ||
316 | 315 | ||
317 | /* | 316 | /* |
318 | * Restore %gs if needed (which is common) | 317 | * Restore %gs if needed (which is common) |
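The reordered hunk above reloads esp0 after arch_end_context_switch() and points both per-CPU values, kernel_stack and cpu_current_top_of_stack, at the true top of the next task's stack, dropping the old KERNEL_STACK_OFFSET bias. A small sketch of that arithmetic; the THREAD_SIZE, offset, and address values below are made-up illustration values, not the kernel's constants:

/*
 * Sketch of the per-CPU stack bookkeeping in __switch_to() above.
 * THREAD_SIZE_DEMO, KERNEL_STACK_OFFSET_DEMO and the stack address are
 * assumptions for illustration.
 */
#include <stdio.h>

#define THREAD_SIZE_DEMO         (2 * 4096UL)
#define KERNEL_STACK_OFFSET_DEMO (5 * 8UL)

int main(void)
{
        unsigned long stack_page = 0x12340000UL; /* hypothetical task_stack_page() */

        unsigned long old_kernel_stack =
                stack_page + THREAD_SIZE_DEMO - KERNEL_STACK_OFFSET_DEMO;
        unsigned long top_of_stack = stack_page + THREAD_SIZE_DEMO;

        /* After the change both per-CPU variables hold the real top. */
        printf("old kernel_stack: %#lx\n", old_kernel_stack);
        printf("new kernel_stack / cpu_current_top_of_stack: %#lx\n", top_of_stack);
        return 0;
}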
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43577d2..4baaa972f52a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -52,7 +52,7 @@ | |||
52 | 52 | ||
53 | asmlinkage extern void ret_from_fork(void); | 53 | asmlinkage extern void ret_from_fork(void); |
54 | 54 | ||
55 | __visible DEFINE_PER_CPU(unsigned long, old_rsp); | 55 | __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); |
56 | 56 | ||
57 | /* Prints also some state that isn't saved in the pt_regs */ | 57 | /* Prints also some state that isn't saved in the pt_regs */ |
58 | void __show_regs(struct pt_regs *regs, int all) | 58 | void __show_regs(struct pt_regs *regs, int all) |
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
161 | p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; | 161 | p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; |
162 | childregs = task_pt_regs(p); | 162 | childregs = task_pt_regs(p); |
163 | p->thread.sp = (unsigned long) childregs; | 163 | p->thread.sp = (unsigned long) childregs; |
164 | p->thread.usersp = me->thread.usersp; | ||
165 | set_tsk_thread_flag(p, TIF_FORK); | 164 | set_tsk_thread_flag(p, TIF_FORK); |
166 | p->thread.io_bitmap_ptr = NULL; | 165 | p->thread.io_bitmap_ptr = NULL; |
167 | 166 | ||
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
207 | */ | 206 | */ |
208 | if (clone_flags & CLONE_SETTLS) { | 207 | if (clone_flags & CLONE_SETTLS) { |
209 | #ifdef CONFIG_IA32_EMULATION | 208 | #ifdef CONFIG_IA32_EMULATION |
210 | if (test_thread_flag(TIF_IA32)) | 209 | if (is_ia32_task()) |
211 | err = do_set_thread_area(p, -1, | 210 | err = do_set_thread_area(p, -1, |
212 | (struct user_desc __user *)childregs->si, 0); | 211 | (struct user_desc __user *)childregs->si, 0); |
213 | else | 212 | else |
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, | |||
235 | loadsegment(es, _ds); | 234 | loadsegment(es, _ds); |
236 | loadsegment(ds, _ds); | 235 | loadsegment(ds, _ds); |
237 | load_gs_index(0); | 236 | load_gs_index(0); |
238 | current->thread.usersp = new_sp; | ||
239 | regs->ip = new_ip; | 237 | regs->ip = new_ip; |
240 | regs->sp = new_sp; | 238 | regs->sp = new_sp; |
241 | this_cpu_write(old_rsp, new_sp); | ||
242 | regs->cs = _cs; | 239 | regs->cs = _cs; |
243 | regs->ss = _ss; | 240 | regs->ss = _ss; |
244 | regs->flags = X86_EFLAGS_IF; | 241 | regs->flags = X86_EFLAGS_IF; |
242 | force_iret(); | ||
245 | } | 243 | } |
246 | 244 | ||
247 | void | 245 | void |
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
277 | struct thread_struct *prev = &prev_p->thread; | 275 | struct thread_struct *prev = &prev_p->thread; |
278 | struct thread_struct *next = &next_p->thread; | 276 | struct thread_struct *next = &next_p->thread; |
279 | int cpu = smp_processor_id(); | 277 | int cpu = smp_processor_id(); |
280 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 278 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); |
281 | unsigned fsindex, gsindex; | 279 | unsigned fsindex, gsindex; |
282 | fpu_switch_t fpu; | 280 | fpu_switch_t fpu; |
283 | 281 | ||
284 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); | 282 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
285 | 283 | ||
286 | /* Reload esp0 and ss1. */ | ||
287 | load_sp0(tss, next); | ||
288 | |||
289 | /* We must save %fs and %gs before load_TLS() because | 284 | /* We must save %fs and %gs before load_TLS() because |
290 | * %fs and %gs may be cleared by load_TLS(). | 285 | * %fs and %gs may be cleared by load_TLS(). |
291 | * | 286 | * |
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
401 | /* | 396 | /* |
402 | * Switch the PDA and FPU contexts. | 397 | * Switch the PDA and FPU contexts. |
403 | */ | 398 | */ |
404 | prev->usersp = this_cpu_read(old_rsp); | ||
405 | this_cpu_write(old_rsp, next->usersp); | ||
406 | this_cpu_write(current_task, next_p); | 399 | this_cpu_write(current_task, next_p); |
407 | 400 | ||
408 | /* | 401 | /* |
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
413 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); | 406 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); |
414 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); | 407 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); |
415 | 408 | ||
409 | /* Reload esp0 and ss1. This changes current_thread_info(). */ | ||
410 | load_sp0(tss, next); | ||
411 | |||
416 | this_cpu_write(kernel_stack, | 412 | this_cpu_write(kernel_stack, |
417 | (unsigned long)task_stack_page(next_p) + | 413 | (unsigned long)task_stack_page(next_p) + THREAD_SIZE); |
418 | THREAD_SIZE - KERNEL_STACK_OFFSET); | ||
419 | 414 | ||
420 | /* | 415 | /* |
421 | * Now maybe reload the debug registers and handle I/O bitmaps | 416 | * Now maybe reload the debug registers and handle I/O bitmaps |
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr) | |||
602 | 597 | ||
603 | unsigned long KSTK_ESP(struct task_struct *task) | 598 | unsigned long KSTK_ESP(struct task_struct *task) |
604 | { | 599 | { |
605 | return (test_tsk_thread_flag(task, TIF_IA32)) ? | 600 | return task_pt_regs(task)->sp; |
606 | (task_pt_regs(task)->sp) : ((task)->thread.usersp); | ||
607 | } | 601 | } |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e510618b2e91..a7bc79480719 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task, | |||
364 | case offsetof(struct user_regs_struct,cs): | 364 | case offsetof(struct user_regs_struct,cs): |
365 | if (unlikely(value == 0)) | 365 | if (unlikely(value == 0)) |
366 | return -EIO; | 366 | return -EIO; |
367 | #ifdef CONFIG_IA32_EMULATION | 367 | task_pt_regs(task)->cs = value; |
368 | if (test_tsk_thread_flag(task, TIF_IA32)) | ||
369 | task_pt_regs(task)->cs = value; | ||
370 | #endif | ||
371 | break; | 368 | break; |
372 | case offsetof(struct user_regs_struct,ss): | 369 | case offsetof(struct user_regs_struct,ss): |
373 | if (unlikely(value == 0)) | 370 | if (unlikely(value == 0)) |
374 | return -EIO; | 371 | return -EIO; |
375 | #ifdef CONFIG_IA32_EMULATION | 372 | task_pt_regs(task)->ss = value; |
376 | if (test_tsk_thread_flag(task, TIF_IA32)) | ||
377 | task_pt_regs(task)->ss = value; | ||
378 | #endif | ||
379 | break; | 373 | break; |
380 | } | 374 | } |
381 | 375 | ||
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, | |||
1421 | memset(info, 0, sizeof(*info)); | 1415 | memset(info, 0, sizeof(*info)); |
1422 | info->si_signo = SIGTRAP; | 1416 | info->si_signo = SIGTRAP; |
1423 | info->si_code = si_code; | 1417 | info->si_code = si_code; |
1424 | info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; | 1418 | info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; |
1425 | } | 1419 | } |
1426 | 1420 | ||
1427 | void user_single_step_siginfo(struct task_struct *tsk, | 1421 | void user_single_step_siginfo(struct task_struct *tsk, |
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index e13f8e7c22a6..77630d57e7bf 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S | |||
@@ -226,23 +226,23 @@ swap_pages: | |||
226 | movl (%ebx), %ecx | 226 | movl (%ebx), %ecx |
227 | addl $4, %ebx | 227 | addl $4, %ebx |
228 | 1: | 228 | 1: |
229 | testl $0x1, %ecx /* is it a destination page */ | 229 | testb $0x1, %cl /* is it a destination page */ |
230 | jz 2f | 230 | jz 2f |
231 | movl %ecx, %edi | 231 | movl %ecx, %edi |
232 | andl $0xfffff000, %edi | 232 | andl $0xfffff000, %edi |
233 | jmp 0b | 233 | jmp 0b |
234 | 2: | 234 | 2: |
235 | testl $0x2, %ecx /* is it an indirection page */ | 235 | testb $0x2, %cl /* is it an indirection page */ |
236 | jz 2f | 236 | jz 2f |
237 | movl %ecx, %ebx | 237 | movl %ecx, %ebx |
238 | andl $0xfffff000, %ebx | 238 | andl $0xfffff000, %ebx |
239 | jmp 0b | 239 | jmp 0b |
240 | 2: | 240 | 2: |
241 | testl $0x4, %ecx /* is it the done indicator */ | 241 | testb $0x4, %cl /* is it the done indicator */ |
242 | jz 2f | 242 | jz 2f |
243 | jmp 3f | 243 | jmp 3f |
244 | 2: | 244 | 2: |
245 | testl $0x8, %ecx /* is it the source indicator */ | 245 | testb $0x8, %cl /* is it the source indicator */ |
246 | jz 0b /* Ignore it otherwise */ | 246 | jz 0b /* Ignore it otherwise */ |
247 | movl %ecx, %esi /* For every source page do a copy */ | 247 | movl %ecx, %esi /* For every source page do a copy */ |
248 | andl $0xfffff000, %esi | 248 | andl $0xfffff000, %esi |
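The testl-to-testb changes in swap_pages only shorten the instruction encoding; every flag they inspect lives in the low nibble of a kimage indirection entry, with the page address in the upper bits. A hedged C sketch of that decoding; the flag values are written to mirror IND_* in include/linux/kexec.h, and the sample entries are made up:

/*
 * Decode kexec indirection-list entries the way swap_pages above does:
 * test the low flag bits, then mask off the flags to get the page
 * address (andl $0xfffff000 / andq $0xfffffffffffff000 in the asm).
 */
#include <stdio.h>
#include <stdint.h>

#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8
#define PAGE_MASK_DEMO  (~(uint64_t)0xfff)

static void classify(uint64_t entry)
{
        uint64_t page = entry & PAGE_MASK_DEMO;

        /* Only the low byte matters, so testing %cl is enough. */
        if (entry & IND_DESTINATION)
                printf("destination page %#llx\n", (unsigned long long)page);
        else if (entry & IND_INDIRECTION)
                printf("next indirection page %#llx\n", (unsigned long long)page);
        else if (entry & IND_DONE)
                printf("done\n");
        else if (entry & IND_SOURCE)
                printf("source page %#llx\n", (unsigned long long)page);
        else
                printf("ignored entry %#llx\n", (unsigned long long)entry);
}

int main(void)
{
        uint64_t sample[] = {
                0x123000 | IND_DESTINATION,
                0x456000 | IND_SOURCE,
                IND_DONE,
        };

        for (unsigned int i = 0; i < sizeof(sample) / sizeof(sample[0]); i++)
                classify(sample[i]);
        return 0;
}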
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 3fd2c693e475..98111b38ebfd 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S | |||
@@ -123,7 +123,7 @@ identity_mapped: | |||
123 | * Set cr4 to a known state: | 123 | * Set cr4 to a known state: |
124 | * - physical address extension enabled | 124 | * - physical address extension enabled |
125 | */ | 125 | */ |
126 | movq $X86_CR4_PAE, %rax | 126 | movl $X86_CR4_PAE, %eax |
127 | movq %rax, %cr4 | 127 | movq %rax, %cr4 |
128 | 128 | ||
129 | jmp 1f | 129 | jmp 1f |
@@ -221,23 +221,23 @@ swap_pages: | |||
221 | movq (%rbx), %rcx | 221 | movq (%rbx), %rcx |
222 | addq $8, %rbx | 222 | addq $8, %rbx |
223 | 1: | 223 | 1: |
224 | testq $0x1, %rcx /* is it a destination page? */ | 224 | testb $0x1, %cl /* is it a destination page? */ |
225 | jz 2f | 225 | jz 2f |
226 | movq %rcx, %rdi | 226 | movq %rcx, %rdi |
227 | andq $0xfffffffffffff000, %rdi | 227 | andq $0xfffffffffffff000, %rdi |
228 | jmp 0b | 228 | jmp 0b |
229 | 2: | 229 | 2: |
230 | testq $0x2, %rcx /* is it an indirection page? */ | 230 | testb $0x2, %cl /* is it an indirection page? */ |
231 | jz 2f | 231 | jz 2f |
232 | movq %rcx, %rbx | 232 | movq %rcx, %rbx |
233 | andq $0xfffffffffffff000, %rbx | 233 | andq $0xfffffffffffff000, %rbx |
234 | jmp 0b | 234 | jmp 0b |
235 | 2: | 235 | 2: |
236 | testq $0x4, %rcx /* is it the done indicator? */ | 236 | testb $0x4, %cl /* is it the done indicator? */ |
237 | jz 2f | 237 | jz 2f |
238 | jmp 3f | 238 | jmp 3f |
239 | 2: | 239 | 2: |
240 | testq $0x8, %rcx /* is it the source indicator? */ | 240 | testb $0x8, %cl /* is it the source indicator? */ |
241 | jz 0b /* Ignore it otherwise */ | 241 | jz 0b /* Ignore it otherwise */ |
242 | movq %rcx, %rsi /* For ever source page do a copy */ | 242 | movq %rcx, %rsi /* For ever source page do a copy */ |
(typo in the quoted comment: "For ever source page" should read "For every source page", as in the 32-bit version of this file.)
243 | andq $0xfffffffffffff000, %rsi | 243 | andq $0xfffffffffffff000, %rsi |
@@ -246,17 +246,17 @@ swap_pages: | |||
246 | movq %rsi, %rax | 246 | movq %rsi, %rax |
247 | 247 | ||
248 | movq %r10, %rdi | 248 | movq %r10, %rdi |
249 | movq $512, %rcx | 249 | movl $512, %ecx |
250 | rep ; movsq | 250 | rep ; movsq |
251 | 251 | ||
252 | movq %rax, %rdi | 252 | movq %rax, %rdi |
253 | movq %rdx, %rsi | 253 | movq %rdx, %rsi |
254 | movq $512, %rcx | 254 | movl $512, %ecx |
255 | rep ; movsq | 255 | rep ; movsq |
256 | 256 | ||
257 | movq %rdx, %rdi | 257 | movq %rdx, %rdi |
258 | movq %r10, %rsi | 258 | movq %r10, %rsi |
259 | movq $512, %rcx | 259 | movl $512, %ecx |
260 | rep ; movsq | 260 | rep ; movsq |
261 | 261 | ||
262 | lea PAGE_SIZE(%rax), %rsi | 262 | lea PAGE_SIZE(%rax), %rsi |
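The three rep;movsq sequences above swap the destination and source pages through a scratch page, 512 quadwords (one 4096-byte page) per copy; writes to %ecx zero-extend into %rcx, so the shorter movl encoding does the same job as movq. A standalone sketch of the same swap, assuming a 4 KiB page and using memcpy in place of rep movsq:

/*
 * Swap two pages through a scratch page, roughly in the order of the
 * three copies in swap_pages above (source -> scratch, dest -> source,
 * scratch -> dest).  Buffers here are ordinary arrays, not real pages.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_SIZE_DEMO  4096
#define QWORDS_PER_PAGE (PAGE_SIZE_DEMO / sizeof(uint64_t))    /* 512 */

static void swap_pages_demo(void *dest, void *src, void *scratch)
{
        memcpy(scratch, src, PAGE_SIZE_DEMO);   /* source  -> scratch */
        memcpy(src, dest, PAGE_SIZE_DEMO);      /* dest    -> source  */
        memcpy(dest, scratch, PAGE_SIZE_DEMO);  /* scratch -> dest    */
}

int main(void)
{
        static uint64_t dest[QWORDS_PER_PAGE], src[QWORDS_PER_PAGE],
                        scratch[QWORDS_PER_PAGE];

        memset(dest, 0xaa, sizeof(dest));
        memset(src, 0x55, sizeof(src));
        swap_pages_demo(dest, src, scratch);
        printf("dest[0]=%#llx src[0]=%#llx (%zu qwords/page)\n",
               (unsigned long long)dest[0], (unsigned long long)src[0],
               QWORDS_PER_PAGE);
        return 0;
}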
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0a2421cca01f..014466b152b5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void) | |||
832 | static int | 832 | static int |
833 | dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) | 833 | dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) |
834 | { | 834 | { |
835 | pr_emerg("Kernel Offset: 0x%lx from 0x%lx " | 835 | if (kaslr_enabled()) { |
836 | "(relocation range: 0x%lx-0x%lx)\n", | 836 | pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", |
837 | (unsigned long)&_text - __START_KERNEL, __START_KERNEL, | 837 | (unsigned long)&_text - __START_KERNEL, |
838 | __START_KERNEL_map, MODULES_VADDR-1); | 838 | __START_KERNEL, |
839 | __START_KERNEL_map, | ||
840 | MODULES_VADDR-1); | ||
841 | } else { | ||
842 | pr_emerg("Kernel Offset: disabled\n"); | ||
843 | } | ||
839 | 844 | ||
840 | return 0; | 845 | return 0; |
841 | } | 846 | } |
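With this hunk the panic notifier only reports a randomized offset when KASLR is actually enabled. A tiny hedged sketch of the same computation, with made-up addresses standing in for &_text and __START_KERNEL:

/*
 * Sketch of dump_kernel_offset() above.  The flag and both addresses
 * are sample values, not the kernel's real link-time constants.
 */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
        bool kaslr_enabled = true;                  /* hypothetical */
        unsigned long link_time_text = 0x1000000UL; /* stands in for __START_KERNEL */
        unsigned long runtime_text   = 0x1a00000UL; /* stands in for &_text */

        if (kaslr_enabled)
                printf("Kernel Offset: 0x%lx from 0x%lx\n",
                       runtime_text - link_time_text, link_time_text);
        else
                printf("Kernel Offset: disabled\n");
        return 0;
}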
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e5042463c1bc..53cc4085c3d7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -61,8 +61,7 @@ | |||
61 | regs->seg = GET_SEG(seg) | 3; \ | 61 | regs->seg = GET_SEG(seg) | 3; \ |
62 | } while (0) | 62 | } while (0) |
63 | 63 | ||
64 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | 64 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) |
65 | unsigned long *pax) | ||
66 | { | 65 | { |
67 | void __user *buf; | 66 | void __user *buf; |
68 | unsigned int tmpflags; | 67 | unsigned int tmpflags; |
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
81 | #endif /* CONFIG_X86_32 */ | 80 | #endif /* CONFIG_X86_32 */ |
82 | 81 | ||
83 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); | 82 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); |
84 | COPY(dx); COPY(cx); COPY(ip); | 83 | COPY(dx); COPY(cx); COPY(ip); COPY(ax); |
85 | 84 | ||
86 | #ifdef CONFIG_X86_64 | 85 | #ifdef CONFIG_X86_64 |
87 | COPY(r8); | 86 | COPY(r8); |
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
94 | COPY(r15); | 93 | COPY(r15); |
95 | #endif /* CONFIG_X86_64 */ | 94 | #endif /* CONFIG_X86_64 */ |
96 | 95 | ||
97 | #ifdef CONFIG_X86_32 | ||
98 | COPY_SEG_CPL3(cs); | 96 | COPY_SEG_CPL3(cs); |
99 | COPY_SEG_CPL3(ss); | 97 | COPY_SEG_CPL3(ss); |
100 | #else /* !CONFIG_X86_32 */ | ||
101 | /* Kernel saves and restores only the CS segment register on signals, | ||
102 | * which is the bare minimum needed to allow mixed 32/64-bit code. | ||
103 | * App's signal handler can save/restore other segments if needed. */ | ||
104 | COPY_SEG_CPL3(cs); | ||
105 | #endif /* CONFIG_X86_32 */ | ||
106 | 98 | ||
107 | get_user_ex(tmpflags, &sc->flags); | 99 | get_user_ex(tmpflags, &sc->flags); |
108 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); | 100 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); |
109 | regs->orig_ax = -1; /* disable syscall checks */ | 101 | regs->orig_ax = -1; /* disable syscall checks */ |
110 | 102 | ||
111 | get_user_ex(buf, &sc->fpstate); | 103 | get_user_ex(buf, &sc->fpstate); |
112 | |||
113 | get_user_ex(*pax, &sc->ax); | ||
114 | } get_user_catch(err); | 104 | } get_user_catch(err); |
115 | 105 | ||
116 | err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); | 106 | err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); |
117 | 107 | ||
108 | force_iret(); | ||
109 | |||
118 | return err; | 110 | return err; |
119 | } | 111 | } |
120 | 112 | ||
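restore_sigcontext() now restores ax through the same COPY() path as every other register, so the sigreturn paths further down can simply return regs->ax instead of threading a separate *pax out-parameter. A simplified standalone sketch of that token-pasting COPY pattern; the structs below stand in for struct sigcontext and struct pt_regs, and the real get_user_ex() user-copy machinery is omitted:

/*
 * Simplified COPY() pattern from restore_sigcontext() above: each
 * field is copied by name from the saved context into the register
 * frame, ax included.
 */
#include <stdio.h>

struct demo_sigcontext { unsigned long di, si, ip, ax; };
struct demo_pt_regs    { unsigned long di, si, ip, ax; };

#define COPY(x) (regs->x = sc->x)

static int demo_restore_sigcontext(struct demo_pt_regs *regs,
                                   const struct demo_sigcontext *sc)
{
        COPY(di); COPY(si); COPY(ip);
        COPY(ax);       /* restored here, so callers just return regs->ax */
        return 0;
}

int main(void)
{
        struct demo_sigcontext sc = { .di = 1, .si = 2, .ip = 0x401000, .ax = 42 };
        struct demo_pt_regs regs = { 0 };

        demo_restore_sigcontext(&regs, &sc);
        printf("sigreturn would return regs->ax = %lu\n", regs.ax);
        return 0;
}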
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, | |||
162 | #else /* !CONFIG_X86_32 */ | 154 | #else /* !CONFIG_X86_32 */ |
163 | put_user_ex(regs->flags, &sc->flags); | 155 | put_user_ex(regs->flags, &sc->flags); |
164 | put_user_ex(regs->cs, &sc->cs); | 156 | put_user_ex(regs->cs, &sc->cs); |
165 | put_user_ex(0, &sc->gs); | 157 | put_user_ex(0, &sc->__pad2); |
166 | put_user_ex(0, &sc->fs); | 158 | put_user_ex(0, &sc->__pad1); |
159 | put_user_ex(regs->ss, &sc->ss); | ||
167 | #endif /* CONFIG_X86_32 */ | 160 | #endif /* CONFIG_X86_32 */ |
168 | 161 | ||
169 | put_user_ex(fpstate, &sc->fpstate); | 162 | put_user_ex(fpstate, &sc->fpstate); |
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, | |||
457 | 450 | ||
458 | regs->sp = (unsigned long)frame; | 451 | regs->sp = (unsigned long)frame; |
459 | 452 | ||
460 | /* Set up the CS register to run signal handlers in 64-bit mode, | 453 | /* |
461 | even if the handler happens to be interrupting 32-bit code. */ | 454 | * Set up the CS and SS registers to run signal handlers in |
455 | * 64-bit mode, even if the handler happens to be interrupting | ||
456 | * 32-bit or 16-bit code. | ||
457 | * | ||
458 | * SS is subtle. In 64-bit mode, we don't need any particular | ||
459 | * SS descriptor, but we do need SS to be valid. It's possible | ||
460 | * that the old SS is entirely bogus -- this can happen if the | ||
461 | * signal we're trying to deliver is #GP or #SS caused by a bad | ||
462 | * SS value. | ||
463 | */ | ||
462 | regs->cs = __USER_CS; | 464 | regs->cs = __USER_CS; |
465 | regs->ss = __USER_DS; | ||
463 | 466 | ||
464 | return 0; | 467 | return 0; |
465 | } | 468 | } |
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void) | |||
539 | { | 542 | { |
540 | struct pt_regs *regs = current_pt_regs(); | 543 | struct pt_regs *regs = current_pt_regs(); |
541 | struct sigframe __user *frame; | 544 | struct sigframe __user *frame; |
542 | unsigned long ax; | ||
543 | sigset_t set; | 545 | sigset_t set; |
544 | 546 | ||
545 | frame = (struct sigframe __user *)(regs->sp - 8); | 547 | frame = (struct sigframe __user *)(regs->sp - 8); |
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void) | |||
553 | 555 | ||
554 | set_current_blocked(&set); | 556 | set_current_blocked(&set); |
555 | 557 | ||
556 | if (restore_sigcontext(regs, &frame->sc, &ax)) | 558 | if (restore_sigcontext(regs, &frame->sc)) |
557 | goto badframe; | 559 | goto badframe; |
558 | return ax; | 560 | return regs->ax; |
559 | 561 | ||
560 | badframe: | 562 | badframe: |
561 | signal_fault(regs, frame, "sigreturn"); | 563 | signal_fault(regs, frame, "sigreturn"); |
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void) | |||
568 | { | 570 | { |
569 | struct pt_regs *regs = current_pt_regs(); | 571 | struct pt_regs *regs = current_pt_regs(); |
570 | struct rt_sigframe __user *frame; | 572 | struct rt_sigframe __user *frame; |
571 | unsigned long ax; | ||
572 | sigset_t set; | 573 | sigset_t set; |
573 | 574 | ||
574 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); | 575 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); |
@@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void) | |||
579 | 580 | ||
580 | set_current_blocked(&set); | 581 | set_current_blocked(&set); |
581 | 582 | ||
582 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 583 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) |
583 | goto badframe; | 584 | goto badframe; |
584 | 585 | ||
585 | if (restore_altstack(&frame->uc.uc_stack)) | 586 | if (restore_altstack(&frame->uc.uc_stack)) |
586 | goto badframe; | 587 | goto badframe; |
587 | 588 | ||
588 | return ax; | 589 | return regs->ax; |
589 | 590 | ||
590 | badframe: | 591 | badframe: |
591 | signal_fault(regs, frame, "rt_sigreturn"); | 592 | signal_fault(regs, frame, "rt_sigreturn"); |
@@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void) | |||
780 | struct pt_regs *regs = current_pt_regs(); | 781 | struct pt_regs *regs = current_pt_regs(); |
781 | struct rt_sigframe_x32 __user *frame; | 782 | struct rt_sigframe_x32 __user *frame; |
782 | sigset_t set; | 783 | sigset_t set; |
783 | unsigned long ax; | ||
784 | 784 | ||
785 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); | 785 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); |
786 | 786 | ||
@@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void) | |||
791 | 791 | ||
792 | set_current_blocked(&set); | 792 | set_current_blocked(&set); |
793 | 793 | ||
794 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 794 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) |
795 | goto badframe; | 795 | goto badframe; |
796 | 796 | ||
797 | if (compat_restore_altstack(&frame->uc.uc_stack)) | 797 | if (compat_restore_altstack(&frame->uc.uc_stack)) |
798 | goto badframe; | 798 | goto badframe; |
799 | 799 | ||
800 | return ax; | 800 | return regs->ax; |
801 | 801 | ||
802 | badframe: | 802 | badframe: |
803 | signal_fault(regs, frame, "x32 rt_sigreturn"); | 803 | signal_fault(regs, frame, "x32 rt_sigreturn"); |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ddd2c0674cda..7035f6b21c3f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -779,6 +779,26 @@ out: | |||
779 | return boot_error; | 779 | return boot_error; |
780 | } | 780 | } |
781 | 781 | ||
782 | void common_cpu_up(unsigned int cpu, struct task_struct *idle) | ||
783 | { | ||
784 | /* Just in case we booted with a single CPU. */ | ||
785 | alternatives_enable_smp(); | ||
786 | |||
787 | per_cpu(current_task, cpu) = idle; | ||
788 | |||
789 | #ifdef CONFIG_X86_32 | ||
790 | /* Stack for startup_32 can be just as for start_secondary onwards */ | ||
791 | irq_ctx_init(cpu); | ||
792 | per_cpu(cpu_current_top_of_stack, cpu) = | ||
793 | (unsigned long)task_stack_page(idle) + THREAD_SIZE; | ||
794 | #else | ||
795 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
796 | initial_gs = per_cpu_offset(cpu); | ||
797 | #endif | ||
798 | per_cpu(kernel_stack, cpu) = | ||
799 | (unsigned long)task_stack_page(idle) + THREAD_SIZE; | ||
800 | } | ||
801 | |||
782 | /* | 802 | /* |
783 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad | 803 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad |
784 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. | 804 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. |
@@ -796,23 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) | |||
796 | int cpu0_nmi_registered = 0; | 816 | int cpu0_nmi_registered = 0; |
797 | unsigned long timeout; | 817 | unsigned long timeout; |
798 | 818 | ||
799 | /* Just in case we booted with a single CPU. */ | ||
800 | alternatives_enable_smp(); | ||
801 | |||
802 | idle->thread.sp = (unsigned long) (((struct pt_regs *) | 819 | idle->thread.sp = (unsigned long) (((struct pt_regs *) |
803 | (THREAD_SIZE + task_stack_page(idle))) - 1); | 820 | (THREAD_SIZE + task_stack_page(idle))) - 1); |
804 | per_cpu(current_task, cpu) = idle; | ||
805 | 821 | ||
806 | #ifdef CONFIG_X86_32 | ||
807 | /* Stack for startup_32 can be just as for start_secondary onwards */ | ||
808 | irq_ctx_init(cpu); | ||
809 | #else | ||
810 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
811 | initial_gs = per_cpu_offset(cpu); | ||
812 | #endif | ||
813 | per_cpu(kernel_stack, cpu) = | ||
814 | (unsigned long)task_stack_page(idle) - | ||
815 | KERNEL_STACK_OFFSET + THREAD_SIZE; | ||
816 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | 822 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); |
817 | initial_code = (unsigned long)start_secondary; | 823 | initial_code = (unsigned long)start_secondary; |
818 | stack_start = idle->thread.sp; | 824 | stack_start = idle->thread.sp; |
@@ -953,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) | |||
953 | /* the FPU context is blank, nobody can own it */ | 959 | /* the FPU context is blank, nobody can own it */ |
954 | __cpu_disable_lazy_restore(cpu); | 960 | __cpu_disable_lazy_restore(cpu); |
955 | 961 | ||
962 | common_cpu_up(cpu, tidle); | ||
963 | |||
956 | err = do_boot_cpu(apicid, cpu, tidle); | 964 | err = do_boot_cpu(apicid, cpu, tidle); |
957 | if (err) { | 965 | if (err) { |
958 | pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); | 966 | pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); |
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index e9bcd57d8a9e..3777189c4a19 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c | |||
@@ -5,21 +5,29 @@ | |||
5 | #include <linux/cache.h> | 5 | #include <linux/cache.h> |
6 | #include <asm/asm-offsets.h> | 6 | #include <asm/asm-offsets.h> |
7 | 7 | ||
8 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; | 8 | #ifdef CONFIG_IA32_EMULATION |
9 | #define SYM(sym, compat) compat | ||
10 | #else | ||
11 | #define SYM(sym, compat) sym | ||
12 | #define ia32_sys_call_table sys_call_table | ||
13 | #define __NR_ia32_syscall_max __NR_syscall_max | ||
14 | #endif | ||
15 | |||
16 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; | ||
9 | #include <asm/syscalls_32.h> | 17 | #include <asm/syscalls_32.h> |
10 | #undef __SYSCALL_I386 | 18 | #undef __SYSCALL_I386 |
11 | 19 | ||
12 | #define __SYSCALL_I386(nr, sym, compat) [nr] = sym, | 20 | #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), |
13 | 21 | ||
14 | typedef asmlinkage void (*sys_call_ptr_t)(void); | 22 | typedef asmlinkage void (*sys_call_ptr_t)(void); |
15 | 23 | ||
16 | extern asmlinkage void sys_ni_syscall(void); | 24 | extern asmlinkage void sys_ni_syscall(void); |
17 | 25 | ||
18 | __visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { | 26 | __visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { |
19 | /* | 27 | /* |
20 | * Smells like a compiler bug -- it doesn't work | 28 | * Smells like a compiler bug -- it doesn't work |
21 | * when the & below is removed. | 29 | * when the & below is removed. |
22 | */ | 30 | */ |
23 | [0 ... __NR_syscall_max] = &sys_ni_syscall, | 31 | [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, |
24 | #include <asm/syscalls_32.h> | 32 | #include <asm/syscalls_32.h> |
25 | }; | 33 | }; |
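syscall_32.c now expands the same syscall list twice, once to declare every entry point and once to emit designated initializers, with SYM() selecting the compat symbol when IA32 emulation owns the table. A self-contained sketch of that X-macro technique, using an inline DEMO_SYSCALLS list and toy functions instead of <asm/syscalls_32.h> and real syscalls:

/*
 * X-macro table construction as in syscall_32.c above: one list, two
 * expansions (declarations, then designated initializers over an
 * all-sys_ni default).
 */
#include <stdio.h>

#define DEMO_SYSCALLS(X)        \
        X(0, demo_restart)      \
        X(1, demo_exit)         \
        X(2, demo_fork)

#define DEMO_NR_MAX 2

/* First expansion: define (in the kernel, declare) each entry point. */
#define DECLARE(nr, sym) static void sym(void) { printf(#sym "\n"); }
DEMO_SYSCALLS(DECLARE)
#undef DECLARE

typedef void (*sys_call_ptr_t)(void);

static void demo_ni_syscall(void) { printf("demo_ni_syscall\n"); }

/* Second expansion: fill the table, overriding the all-ni default. */
#define ENTRY(nr, sym) [nr] = sym,
static const sys_call_ptr_t demo_call_table[DEMO_NR_MAX + 1] = {
        [0 ... DEMO_NR_MAX] = demo_ni_syscall,  /* GNU range designator */
        DEMO_SYSCALLS(ENTRY)
};
#undef ENTRY

int main(void)
{
        for (int nr = 0; nr <= DEMO_NR_MAX; nr++)
                demo_call_table[nr]();
        return 0;
}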
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25adc0e16eaa..d39c09119db6 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c | |||
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs) | |||
30 | { | 30 | { |
31 | unsigned long pc = instruction_pointer(regs); | 31 | unsigned long pc = instruction_pointer(regs); |
32 | 32 | ||
33 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { | 33 | if (!user_mode(regs) && in_lock_functions(pc)) { |
34 | #ifdef CONFIG_FRAME_POINTER | 34 | #ifdef CONFIG_FRAME_POINTER |
35 | return *(unsigned long *)(regs->bp + sizeof(long)); | 35 | return *(unsigned long *)(regs->bp + sizeof(long)); |
36 | #else | 36 | #else |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4ff5d162ff9f..6751c5c58eec 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) | |||
112 | { | 112 | { |
113 | enum ctx_state prev_state; | 113 | enum ctx_state prev_state; |
114 | 114 | ||
115 | if (user_mode_vm(regs)) { | 115 | if (user_mode(regs)) { |
116 | /* Other than that, we're just an exception. */ | 116 | /* Other than that, we're just an exception. */ |
117 | prev_state = exception_enter(); | 117 | prev_state = exception_enter(); |
118 | } else { | 118 | } else { |
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
146 | /* Must be before exception_exit. */ | 146 | /* Must be before exception_exit. */ |
147 | preempt_count_sub(HARDIRQ_OFFSET); | 147 | preempt_count_sub(HARDIRQ_OFFSET); |
148 | 148 | ||
149 | if (user_mode_vm(regs)) | 149 | if (user_mode(regs)) |
150 | return exception_exit(prev_state); | 150 | return exception_exit(prev_state); |
151 | else | 151 | else |
152 | rcu_nmi_exit(); | 152 | rcu_nmi_exit(); |
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
158 | * | 158 | * |
159 | * IST exception handlers normally cannot schedule. As a special | 159 | * IST exception handlers normally cannot schedule. As a special |
160 | * exception, if the exception interrupted userspace code (i.e. | 160 | * exception, if the exception interrupted userspace code (i.e. |
161 | * user_mode_vm(regs) would return true) and the exception was not | 161 | * user_mode(regs) would return true) and the exception was not |
162 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() | 162 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() |
163 | * begins a non-atomic section within an ist_enter()/ist_exit() region. | 163 | * begins a non-atomic section within an ist_enter()/ist_exit() region. |
164 | * Callers are responsible for enabling interrupts themselves inside | 164 | * Callers are responsible for enabling interrupts themselves inside |
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
167 | */ | 167 | */ |
168 | void ist_begin_non_atomic(struct pt_regs *regs) | 168 | void ist_begin_non_atomic(struct pt_regs *regs) |
169 | { | 169 | { |
170 | BUG_ON(!user_mode_vm(regs)); | 170 | BUG_ON(!user_mode(regs)); |
171 | 171 | ||
172 | /* | 172 | /* |
173 | * Sanity check: we need to be on the normal thread stack. This | 173 | * Sanity check: we need to be on the normal thread stack. This |
174 | * will catch asm bugs and any attempt to use ist_preempt_enable | 174 | * will catch asm bugs and any attempt to use ist_preempt_enable |
175 | * from double_fault. | 175 | * from double_fault. |
176 | */ | 176 | */ |
177 | BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) | 177 | BUG_ON((unsigned long)(current_top_of_stack() - |
178 | & ~(THREAD_SIZE - 1)) != 0); | 178 | current_stack_pointer()) >= THREAD_SIZE); |
179 | 179 | ||
180 | preempt_count_sub(HARDIRQ_OFFSET); | 180 | preempt_count_sub(HARDIRQ_OFFSET); |
181 | } | 181 | } |
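The new BUG_ON measures how far the stack pointer sits below the top of the current thread stack instead of comparing THREAD_SIZE-aligned blocks against the old kernel_stack value. A hedged sketch of that in-range test; THREAD_SIZE and the addresses are made-up illustration values:

/*
 * In-range test matching the new check above: sp is on the thread
 * stack iff it lies less than THREAD_SIZE bytes below the top.
 */
#include <stdio.h>
#include <stdbool.h>

#define THREAD_SIZE_DEMO 0x4000UL       /* assumed 16 KiB stack */

static bool on_thread_stack(unsigned long top_of_stack, unsigned long sp)
{
        return (top_of_stack - sp) < THREAD_SIZE_DEMO;
}

int main(void)
{
        unsigned long top    = 0xc0000000UL;               /* hypothetical top */
        unsigned long sp_ok  = top - 0x100;                /* on the stack     */
        unsigned long sp_bad = top - 2 * THREAD_SIZE_DEMO; /* some other stack */

        /* The kernel's BUG_ON fires when this returns false. */
        printf("sp_ok: %d, sp_bad: %d\n",
               on_thread_stack(top, sp_ok), on_thread_stack(top, sp_bad));
        return 0;
}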
@@ -194,8 +194,7 @@ static nokprobe_inline int | |||
194 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | 194 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, |
195 | struct pt_regs *regs, long error_code) | 195 | struct pt_regs *regs, long error_code) |
196 | { | 196 | { |
197 | #ifdef CONFIG_X86_32 | 197 | if (v8086_mode(regs)) { |
198 | if (regs->flags & X86_VM_MASK) { | ||
199 | /* | 198 | /* |
200 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. | 199 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. |
201 | * On nmi (interrupt 2), do_trap should not be called. | 200 | * On nmi (interrupt 2), do_trap should not be called. |
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | |||
207 | } | 206 | } |
208 | return -1; | 207 | return -1; |
209 | } | 208 | } |
210 | #endif | 209 | |
211 | if (!user_mode(regs)) { | 210 | if (!user_mode(regs)) { |
212 | if (!fixup_exception(regs)) { | 211 | if (!fixup_exception(regs)) { |
213 | tsk->thread.error_code = error_code; | 212 | tsk->thread.error_code = error_code; |
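The open-coded regs->flags & X86_VM_MASK test behind #ifdef CONFIG_X86_32 becomes v8086_mode(regs); on 64-bit kernels that helper is simply 0 because vm86 mode does not exist in long mode, so the #ifdef can go away. A small sketch of the underlying EFLAGS test; the flag values fed in are made-up examples:

/*
 * EFLAGS.VM test behind v8086_mode()/X86_VM_MASK above.  Bit 17 of
 * EFLAGS is the virtual-8086 mode flag.
 */
#include <stdio.h>
#include <stdbool.h>

#define X86_EFLAGS_VM (1UL << 17)

static bool v8086_mode_demo(unsigned long flags)
{
        return flags & X86_EFLAGS_VM;
}

int main(void)
{
        unsigned long protected_mode_flags = 0x202;     /* IF set, VM clear */
        unsigned long vm86_flags = 0x202 | X86_EFLAGS_VM;

        printf("protected mode: %d, vm86: %d\n",
               v8086_mode_demo(protected_mode_flags),
               v8086_mode_demo(vm86_flags));
        return 0;
}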
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) | |||
384 | goto exit; | 383 | goto exit; |
385 | conditional_sti(regs); | 384 | conditional_sti(regs); |
386 | 385 | ||
387 | if (!user_mode_vm(regs)) | 386 | if (!user_mode(regs)) |
388 | die("bounds", regs, error_code); | 387 | die("bounds", regs, error_code); |
389 | 388 | ||
390 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) { | 389 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) { |
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code) | |||
462 | prev_state = exception_enter(); | 461 | prev_state = exception_enter(); |
463 | conditional_sti(regs); | 462 | conditional_sti(regs); |
464 | 463 | ||
465 | #ifdef CONFIG_X86_32 | 464 | if (v8086_mode(regs)) { |
466 | if (regs->flags & X86_VM_MASK) { | ||
467 | local_irq_enable(); | 465 | local_irq_enable(); |
468 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | 466 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); |
469 | goto exit; | 467 | goto exit; |
470 | } | 468 | } |
471 | #endif | ||
472 | 469 | ||
473 | tsk = current; | 470 | tsk = current; |
474 | if (!user_mode(regs)) { | 471 | if (!user_mode(regs)) { |
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) | |||
587 | /* Copy the remainder of the stack from the current stack. */ | 584 | /* Copy the remainder of the stack from the current stack. */ |
588 | memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); | 585 | memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); |
589 | 586 | ||
590 | BUG_ON(!user_mode_vm(&new_stack->regs)); | 587 | BUG_ON(!user_mode(&new_stack->regs)); |
591 | return new_stack; | 588 | return new_stack; |
592 | } | 589 | } |
593 | NOKPROBE_SYMBOL(fixup_bad_iret); | 590 | NOKPROBE_SYMBOL(fixup_bad_iret); |
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
637 | * then it's very likely the result of an icebp/int01 trap. | 634 | * then it's very likely the result of an icebp/int01 trap. |
638 | * User wants a sigtrap for that. | 635 | * User wants a sigtrap for that. |
639 | */ | 636 | */ |
640 | if (!dr6 && user_mode_vm(regs)) | 637 | if (!dr6 && user_mode(regs)) |
641 | user_icebp = 1; | 638 | user_icebp = 1; |
642 | 639 | ||
643 | /* Catch kmemcheck conditions first of all! */ | 640 | /* Catch kmemcheck conditions first of all! */ |
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
673 | /* It's safe to allow irq's after DR6 has been saved */ | 670 | /* It's safe to allow irq's after DR6 has been saved */ |
674 | preempt_conditional_sti(regs); | 671 | preempt_conditional_sti(regs); |
675 | 672 | ||
676 | if (regs->flags & X86_VM_MASK) { | 673 | if (v8086_mode(regs)) { |
677 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, | 674 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, |
678 | X86_TRAP_DB); | 675 | X86_TRAP_DB); |
679 | preempt_conditional_cli(regs); | 676 | preempt_conditional_cli(regs); |
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) | |||
721 | return; | 718 | return; |
722 | conditional_sti(regs); | 719 | conditional_sti(regs); |
723 | 720 | ||
724 | if (!user_mode_vm(regs)) | 721 | if (!user_mode(regs)) |
725 | { | 722 | { |
726 | if (!fixup_exception(regs)) { | 723 | if (!fixup_exception(regs)) { |
727 | task->thread.error_code = error_code; | 724 | task->thread.error_code = error_code; |
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | |||
925 | /* Set of traps needed for early debugging. */ | 922 | /* Set of traps needed for early debugging. */ |
926 | void __init early_trap_init(void) | 923 | void __init early_trap_init(void) |
927 | { | 924 | { |
928 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); | 925 | /* |
926 | * Don't use IST to set DEBUG_STACK as it doesn't work until TSS | ||
927 | * is ready in cpu_init() <-- trap_init(). Before trap_init(), | ||
928 | * CPU runs at ring 0 so it is impossible to hit an invalid | ||
929 | * stack. Using the original stack works well enough at this | ||
930 | * early stage. DEBUG_STACK will be equipped after cpu_init() in | ||
931 | * trap_init(). | ||
932 | * | ||
933 | * We don't need to set trace_idt_table like set_intr_gate(), | ||
934 | * since we don't have trace_debug and it will be reset to | ||
935 | * 'debug' in trap_init() by set_intr_gate_ist(). | ||
936 | */ | ||
937 | set_intr_gate_notrace(X86_TRAP_DB, debug); | ||
929 | /* int3 can be called from all */ | 938 | /* int3 can be called from all */ |
930 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); | 939 | set_system_intr_gate(X86_TRAP_BP, &int3); |
931 | #ifdef CONFIG_X86_32 | 940 | #ifdef CONFIG_X86_32 |
932 | set_intr_gate(X86_TRAP_PF, page_fault); | 941 | set_intr_gate(X86_TRAP_PF, page_fault); |
933 | #endif | 942 | #endif |
@@ -1005,6 +1014,15 @@ void __init trap_init(void) | |||
1005 | */ | 1014 | */ |
1006 | cpu_init(); | 1015 | cpu_init(); |
1007 | 1016 | ||
1017 | /* | ||
1018 | * X86_TRAP_DB and X86_TRAP_BP have been set | ||
1019 | * in early_trap_init(). However, IST works only after | ||
1020 | * cpu_init() loads TSS. See comments in early_trap_init(). | ||
1021 | */ | ||
1022 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); | ||
1023 | /* int3 can be called from all */ | ||
1024 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); | ||
1025 | |||
1008 | x86_init.irqs.trap_init(); | 1026 | x86_init.irqs.trap_init(); |
1009 | 1027 | ||
1010 | #ifdef CONFIG_X86_64 | 1028 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 81f8adb0679e..0b81ad67da07 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c | |||
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, | |||
912 | int ret = NOTIFY_DONE; | 912 | int ret = NOTIFY_DONE; |
913 | 913 | ||
914 | /* We are only interested in userspace traps */ | 914 | /* We are only interested in userspace traps */ |
915 | if (regs && !user_mode_vm(regs)) | 915 | if (regs && !user_mode(regs)) |
916 | return NOTIFY_DONE; | 916 | return NOTIFY_DONE; |
917 | 917 | ||
918 | switch (val) { | 918 | switch (val) { |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf52e069..fc9db6ef2a95 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) | |||
150 | do_exit(SIGSEGV); | 150 | do_exit(SIGSEGV); |
151 | } | 151 | } |
152 | 152 | ||
153 | tss = &per_cpu(init_tss, get_cpu()); | 153 | tss = &per_cpu(cpu_tss, get_cpu()); |
154 | current->thread.sp0 = current->thread.saved_sp0; | 154 | current->thread.sp0 = current->thread.saved_sp0; |
155 | current->thread.sysenter_cs = __KERNEL_CS; | 155 | current->thread.sysenter_cs = __KERNEL_CS; |
156 | load_sp0(tss, ¤t->thread); | 156 | load_sp0(tss, ¤t->thread); |
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
318 | tsk->thread.saved_fs = info->regs32->fs; | 318 | tsk->thread.saved_fs = info->regs32->fs; |
319 | tsk->thread.saved_gs = get_user_gs(info->regs32); | 319 | tsk->thread.saved_gs = get_user_gs(info->regs32); |
320 | 320 | ||
321 | tss = &per_cpu(init_tss, get_cpu()); | 321 | tss = &per_cpu(cpu_tss, get_cpu()); |
322 | tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; | 322 | tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; |
323 | if (cpu_has_sep) | 323 | if (cpu_has_sep) |
324 | tsk->thread.sysenter_cs = 0; | 324 | tsk->thread.sysenter_cs = 0; |