Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile               |   1
-rw-r--r--  arch/x86/kernel/alternative.c          | 163
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c       |   2
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c       |   1
-rw-r--r--  arch/x86/kernel/cpu/amd.c              |   5
-rw-r--r--  arch/x86/kernel/cpu/common.c           |  87
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c       |  18
-rw-r--r--  arch/x86/kernel/crash.c                |   2
-rw-r--r--  arch/x86/kernel/dumpstack.c            |   4
-rw-r--r--  arch/x86/kernel/dumpstack_32.c         |   4
-rw-r--r--  arch/x86/kernel/entry_32.S             |  93
-rw-r--r--  arch/x86/kernel/entry_64.S             | 978
-rw-r--r--  arch/x86/kernel/head_32.S              |   3
-rw-r--r--  arch/x86/kernel/head_64.S              |   6
-rw-r--r--  arch/x86/kernel/i387.c                 |   2
-rw-r--r--  arch/x86/kernel/ioport.c               |   2
-rw-r--r--  arch/x86/kernel/irq_32.c               |   2
-rw-r--r--  arch/x86/kernel/irq_64.c               |   2
-rw-r--r--  arch/x86/kernel/irqinit.c              |   3
-rw-r--r--  arch/x86/kernel/kgdb.c                 |   4
-rw-r--r--  arch/x86/kernel/kprobes/core.c         |   4
-rw-r--r--  arch/x86/kernel/module.c               |  11
-rw-r--r--  arch/x86/kernel/perf_regs.c            |  40
-rw-r--r--  arch/x86/kernel/process.c              |  23
-rw-r--r--  arch/x86/kernel/process_32.c           |  27
-rw-r--r--  arch/x86/kernel/process_64.c           |  24
-rw-r--r--  arch/x86/kernel/ptrace.c               |  12
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S   |   8
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S   |  16
-rw-r--r--  arch/x86/kernel/setup.c                |  13
-rw-r--r--  arch/x86/kernel/signal.c               |  50
-rw-r--r--  arch/x86/kernel/smpboot.c              |  36
-rw-r--r--  arch/x86/kernel/syscall_32.c           |  16
-rw-r--r--  arch/x86/kernel/time.c                 |   2
-rw-r--r--  arch/x86/kernel/traps.c                |  56
-rw-r--r--  arch/x86/kernel/uprobes.c              |   2
-rw-r--r--  arch/x86/kernel/vm86_32.c              |   4
37 files changed, 899 insertions, 827 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..c887cd944f0c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
33obj-$(CONFIG_X86_64) += mcount_64.o 33obj-$(CONFIG_X86_64) += mcount_64.o
34obj-y += syscall_$(BITS).o vsyscall_gtod.o 34obj-y += syscall_$(BITS).o vsyscall_gtod.o
35obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
35obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o 36obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
36obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o 37obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
37obj-$(CONFIG_SYSFS) += ksysfs.o 38obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..aef653193160 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
52__setup("noreplace-paravirt", setup_noreplace_paravirt); 52__setup("noreplace-paravirt", setup_noreplace_paravirt);
53#endif 53#endif
54 54
55#define DPRINTK(fmt, ...) \ 55#define DPRINTK(fmt, args...) \
56do { \ 56do { \
57 if (debug_alternative) \ 57 if (debug_alternative) \
58 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ 58 printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
59} while (0)
60
61#define DUMP_BYTES(buf, len, fmt, args...) \
62do { \
63 if (unlikely(debug_alternative)) { \
64 int j; \
65 \
66 if (!(len)) \
67 break; \
68 \
69 printk(KERN_DEBUG fmt, ##args); \
70 for (j = 0; j < (len) - 1; j++) \
71 printk(KERN_CONT "%02hhx ", buf[j]); \
72 printk(KERN_CONT "%02hhx\n", buf[j]); \
73 } \
59} while (0) 74} while (0)
60 75
61/* 76/*
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
243extern s32 __smp_locks[], __smp_locks_end[]; 258extern s32 __smp_locks[], __smp_locks_end[];
244void *text_poke_early(void *addr, const void *opcode, size_t len); 259void *text_poke_early(void *addr, const void *opcode, size_t len);
245 260
246/* Replace instructions with better alternatives for this CPU type. 261/*
247 This runs before SMP is initialized to avoid SMP problems with 262 * Are we looking at a near JMP with a 1 or 4-byte displacement.
248 self modifying code. This implies that asymmetric systems where 263 */
249 APs have less capabilities than the boot processor are not handled. 264static inline bool is_jmp(const u8 opcode)
250 Tough. Make sure you disable such features by hand. */ 265{
266 return opcode == 0xeb || opcode == 0xe9;
267}
268
269static void __init_or_module
270recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
271{
272 u8 *next_rip, *tgt_rip;
273 s32 n_dspl, o_dspl;
274 int repl_len;
275
276 if (a->replacementlen != 5)
277 return;
278
279 o_dspl = *(s32 *)(insnbuf + 1);
280
281 /* next_rip of the replacement JMP */
282 next_rip = repl_insn + a->replacementlen;
283 /* target rip of the replacement JMP */
284 tgt_rip = next_rip + o_dspl;
285 n_dspl = tgt_rip - orig_insn;
286
287 DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
288
289 if (tgt_rip - orig_insn >= 0) {
290 if (n_dspl - 2 <= 127)
291 goto two_byte_jmp;
292 else
293 goto five_byte_jmp;
294 /* negative offset */
295 } else {
296 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
297 goto two_byte_jmp;
298 else
299 goto five_byte_jmp;
300 }
301
302two_byte_jmp:
303 n_dspl -= 2;
304
305 insnbuf[0] = 0xeb;
306 insnbuf[1] = (s8)n_dspl;
307 add_nops(insnbuf + 2, 3);
308
309 repl_len = 2;
310 goto done;
311
312five_byte_jmp:
313 n_dspl -= 5;
314
315 insnbuf[0] = 0xe9;
316 *(s32 *)&insnbuf[1] = n_dspl;
251 317
318 repl_len = 5;
319
320done:
321
322 DPRINTK("final displ: 0x%08x, JMP 0x%lx",
323 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
324}
325
326static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
327{
328 if (instr[0] != 0x90)
329 return;
330
331 add_nops(instr + (a->instrlen - a->padlen), a->padlen);
332
333 DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
334 instr, a->instrlen - a->padlen, a->padlen);
335}
336
337/*
338 * Replace instructions with better alternatives for this CPU type. This runs
339 * before SMP is initialized to avoid SMP problems with self modifying code.
340 * This implies that asymmetric systems where APs have less capabilities than
341 * the boot processor are not handled. Tough. Make sure you disable such
342 * features by hand.
343 */
252void __init_or_module apply_alternatives(struct alt_instr *start, 344void __init_or_module apply_alternatives(struct alt_instr *start,
253 struct alt_instr *end) 345 struct alt_instr *end)
254{ 346{
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
256 u8 *instr, *replacement; 348 u8 *instr, *replacement;
257 u8 insnbuf[MAX_PATCH_LEN]; 349 u8 insnbuf[MAX_PATCH_LEN];
258 350
259 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 351 DPRINTK("alt table %p -> %p", start, end);
260 /* 352 /*
261 * The scan order should be from start to end. A later scanned 353 * The scan order should be from start to end. A later scanned
262 * alternative code can overwrite a previous scanned alternative code. 354 * alternative code can overwrite previously scanned alternative code.
263 * Some kernel functions (e.g. memcpy, memset, etc) use this order to 355 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
264 * patch code. 356 * patch code.
265 * 357 *
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
267 * order. 359 * order.
268 */ 360 */
269 for (a = start; a < end; a++) { 361 for (a = start; a < end; a++) {
362 int insnbuf_sz = 0;
363
270 instr = (u8 *)&a->instr_offset + a->instr_offset; 364 instr = (u8 *)&a->instr_offset + a->instr_offset;
271 replacement = (u8 *)&a->repl_offset + a->repl_offset; 365 replacement = (u8 *)&a->repl_offset + a->repl_offset;
272 BUG_ON(a->replacementlen > a->instrlen);
273 BUG_ON(a->instrlen > sizeof(insnbuf)); 366 BUG_ON(a->instrlen > sizeof(insnbuf));
274 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 367 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
275 if (!boot_cpu_has(a->cpuid)) 368 if (!boot_cpu_has(a->cpuid)) {
369 if (a->padlen > 1)
370 optimize_nops(a, instr);
371
276 continue; 372 continue;
373 }
374
375 DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
376 a->cpuid >> 5,
377 a->cpuid & 0x1f,
378 instr, a->instrlen,
379 replacement, a->replacementlen, a->padlen);
380
381 DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
382 DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
277 383
278 memcpy(insnbuf, replacement, a->replacementlen); 384 memcpy(insnbuf, replacement, a->replacementlen);
385 insnbuf_sz = a->replacementlen;
279 386
280 /* 0xe8 is a relative jump; fix the offset. */ 387 /* 0xe8 is a relative jump; fix the offset. */
281 if (*insnbuf == 0xe8 && a->replacementlen == 5) 388 if (*insnbuf == 0xe8 && a->replacementlen == 5) {
282 *(s32 *)(insnbuf + 1) += replacement - instr; 389 *(s32 *)(insnbuf + 1) += replacement - instr;
390 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
391 *(s32 *)(insnbuf + 1),
392 (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
393 }
394
395 if (a->replacementlen && is_jmp(replacement[0]))
396 recompute_jump(a, instr, replacement, insnbuf);
283 397
284 add_nops(insnbuf + a->replacementlen, 398 if (a->instrlen > a->replacementlen) {
285 a->instrlen - a->replacementlen); 399 add_nops(insnbuf + a->replacementlen,
400 a->instrlen - a->replacementlen);
401 insnbuf_sz += a->instrlen - a->replacementlen;
402 }
403 DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
286 404
287 text_poke_early(instr, insnbuf, a->instrlen); 405 text_poke_early(instr, insnbuf, insnbuf_sz);
288 } 406 }
289} 407}
290 408
291#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
292
293static void alternatives_smp_lock(const s32 *start, const s32 *end, 410static void alternatives_smp_lock(const s32 *start, const s32 *end,
294 u8 *text, u8 *text_end) 411 u8 *text, u8 *text_end)
295{ 412{
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
371 smp->locks_end = locks_end; 488 smp->locks_end = locks_end;
372 smp->text = text; 489 smp->text = text;
373 smp->text_end = text_end; 490 smp->text_end = text_end;
374 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", 491 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
375 __func__, smp->locks, smp->locks_end, 492 smp->locks, smp->locks_end,
376 smp->text, smp->text_end, smp->name); 493 smp->text, smp->text_end, smp->name);
377 494
378 list_add_tail(&smp->next, &smp_alt_modules); 495 list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end)
440 557
441 return 0; 558 return 0;
442} 559}
443#endif 560#endif /* CONFIG_SMP */
444 561
445#ifdef CONFIG_PARAVIRT 562#ifdef CONFIG_PARAVIRT
446void __init_or_module apply_paravirt(struct paravirt_patch_site *start, 563void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs)
601 if (likely(!bp_patching_in_progress)) 718 if (likely(!bp_patching_in_progress))
602 return 0; 719 return 0;
603 720
604 if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) 721 if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
605 return 0; 722 return 0;
606 723
607 /* set up the specified breakpoint handler */ 724 /* set up the specified breakpoint handler */
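[Editor's sketch] The recompute_jump() helper added above relocates a replacement JMP into the original instruction's location and picks the 2-byte short form (0xeb, rel8) when the recomputed displacement fits in a signed byte, falling back to the 5-byte near form (0xe9, rel32) otherwise. A minimal user-space sketch of that size choice follows; jmp_len() is a hypothetical helper for illustration, not kernel code:

#include <stdint.h>
#include <stdio.h>

/*
 * Decide whether a JMP placed at 'insn' and aimed at 'target' can use the
 * 2-byte short form (EB rel8) or needs the 5-byte near form (E9 rel32).
 * Displacements are measured from the end of the encoded instruction,
 * which is why the instruction length is subtracted before the range check.
 */
static int jmp_len(const uint8_t *insn, const uint8_t *target)
{
	int64_t rel8  = target - (insn + 2);   /* disp from end of short JMP */
	int64_t rel32 = target - (insn + 5);   /* disp from end of near JMP  */

	if (rel8 >= -128 && rel8 <= 127)
		return 2;                      /* EB <rel8>  */
	if (rel32 >= INT32_MIN && rel32 <= INT32_MAX)
		return 5;                      /* E9 <rel32> */
	return -1;                             /* out of rel32 range */
}

int main(void)
{
	uint8_t buf[512];

	printf("%d\n", jmp_len(&buf[0], &buf[100]));   /* 2: fits in rel8 */
	printf("%d\n", jmp_len(&buf[0], &buf[400]));   /* 5: needs rel32  */
	return 0;
}

In recompute_jump() itself the 2-byte form is additionally padded with three NOPs (add_nops(insnbuf + 2, 3)) so the patched bytes still cover the full 5-byte slot.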
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
68 68
69 /* Offset from the sysenter stack to tss.sp0 */ 69 /* Offset from the sysenter stack to tss.sp0 */
70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
71 sizeof(struct tss_struct)); 71 offsetofend(struct tss_struct, SYSENTER_stack));
72 72
73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
74 BLANK(); 74 BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
81#undef ENTRY 81#undef ENTRY
82 82
83 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 83 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
84 OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
84 BLANK(); 85 BLANK();
85 86
86 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); 87 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); 711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
712 712
713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
714
715 /* 3DNow or LM implies PREFETCHW */
716 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
717 if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
718 set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
714} 719}
715 720
716#ifdef CONFIG_X86_32 721#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..3f70538012e2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -959,38 +959,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
959#endif 959#endif
960} 960}
961 961
962#ifdef CONFIG_X86_64 962/*
963#ifdef CONFIG_IA32_EMULATION 963 * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
964/* May not be __init: called during resume */ 964 * on 32-bit kernels:
965static void syscall32_cpu_init(void) 965 */
966{
967 /* Load these always in case some future AMD CPU supports
968 SYSENTER from compat mode too. */
969 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
970 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
971 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
972
973 wrmsrl(MSR_CSTAR, ia32_cstar_target);
974}
975#endif /* CONFIG_IA32_EMULATION */
976#endif /* CONFIG_X86_64 */
977
978#ifdef CONFIG_X86_32 966#ifdef CONFIG_X86_32
979void enable_sep_cpu(void) 967void enable_sep_cpu(void)
980{ 968{
981 int cpu = get_cpu(); 969 struct tss_struct *tss;
982 struct tss_struct *tss = &per_cpu(init_tss, cpu); 970 int cpu;
983 971
984 if (!boot_cpu_has(X86_FEATURE_SEP)) { 972 cpu = get_cpu();
985 put_cpu(); 973 tss = &per_cpu(cpu_tss, cpu);
986 return; 974
987 } 975 if (!boot_cpu_has(X86_FEATURE_SEP))
976 goto out;
977
978 /*
979 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
980 * see the big comment in struct x86_hw_tss's definition.
981 */
988 982
989 tss->x86_tss.ss1 = __KERNEL_CS; 983 tss->x86_tss.ss1 = __KERNEL_CS;
990 tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; 984 wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
991 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 985
992 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); 986 wrmsr(MSR_IA32_SYSENTER_ESP,
993 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); 987 (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
988 0);
989
990 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
991
992out:
994 put_cpu(); 993 put_cpu();
995} 994}
996#endif 995#endif
@@ -1118,7 +1117,7 @@ static __init int setup_disablecpuid(char *arg)
1118__setup("clearcpuid=", setup_disablecpuid); 1117__setup("clearcpuid=", setup_disablecpuid);
1119 1118
1120DEFINE_PER_CPU(unsigned long, kernel_stack) = 1119DEFINE_PER_CPU(unsigned long, kernel_stack) =
1121 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; 1120 (unsigned long)&init_thread_union + THREAD_SIZE;
1122EXPORT_PER_CPU_SYMBOL(kernel_stack); 1121EXPORT_PER_CPU_SYMBOL(kernel_stack);
1123 1122
1124#ifdef CONFIG_X86_64 1123#ifdef CONFIG_X86_64
@@ -1130,8 +1129,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
1130 irq_stack_union) __aligned(PAGE_SIZE) __visible; 1129 irq_stack_union) __aligned(PAGE_SIZE) __visible;
1131 1130
1132/* 1131/*
1133 * The following four percpu variables are hot. Align current_task to 1132 * The following percpu variables are hot. Align current_task to
1134 * cacheline size such that all four fall in the same cacheline. 1133 * cacheline size such that they fall in the same cacheline.
1135 */ 1134 */
1136DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = 1135DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
1137 &init_task; 1136 &init_task;
@@ -1171,10 +1170,23 @@ void syscall_init(void)
1171 */ 1170 */
1172 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); 1171 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1173 wrmsrl(MSR_LSTAR, system_call); 1172 wrmsrl(MSR_LSTAR, system_call);
1174 wrmsrl(MSR_CSTAR, ignore_sysret);
1175 1173
1176#ifdef CONFIG_IA32_EMULATION 1174#ifdef CONFIG_IA32_EMULATION
1177 syscall32_cpu_init(); 1175 wrmsrl(MSR_CSTAR, ia32_cstar_target);
1176 /*
1177 * This only works on Intel CPUs.
1178 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
1179 * This does not cause SYSENTER to jump to the wrong location, because
1180 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
1181 */
1182 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
1183 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1184 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
1185#else
1186 wrmsrl(MSR_CSTAR, ignore_sysret);
1187 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
1188 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1189 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
1178#endif 1190#endif
1179 1191
1180 /* Flags to clear on syscall */ 1192 /* Flags to clear on syscall */
@@ -1226,6 +1238,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
1226EXPORT_PER_CPU_SYMBOL(__preempt_count); 1238EXPORT_PER_CPU_SYMBOL(__preempt_count);
1227DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1239DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1228 1240
1241/*
1242 * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
1243 * the top of the kernel stack. Use an extra percpu variable to track the
1244 * top of the kernel stack directly.
1245 */
1246DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
1247 (unsigned long)&init_thread_union + THREAD_SIZE;
1248EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
1249
1229#ifdef CONFIG_CC_STACKPROTECTOR 1250#ifdef CONFIG_CC_STACKPROTECTOR
1230DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 1251DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
1231#endif 1252#endif
@@ -1307,7 +1328,7 @@ void cpu_init(void)
1307 */ 1328 */
1308 load_ucode_ap(); 1329 load_ucode_ap();
1309 1330
1310 t = &per_cpu(init_tss, cpu); 1331 t = &per_cpu(cpu_tss, cpu);
1311 oist = &per_cpu(orig_ist, cpu); 1332 oist = &per_cpu(orig_ist, cpu);
1312 1333
1313#ifdef CONFIG_NUMA 1334#ifdef CONFIG_NUMA
@@ -1391,7 +1412,7 @@ void cpu_init(void)
1391{ 1412{
1392 int cpu = smp_processor_id(); 1413 int cpu = smp_processor_id();
1393 struct task_struct *curr = current; 1414 struct task_struct *curr = current;
1394 struct tss_struct *t = &per_cpu(init_tss, cpu); 1415 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
1395 struct thread_struct *thread = &curr->thread; 1416 struct thread_struct *thread = &curr->thread;
1396 1417
1397 wait_for_master_cpu(cpu); 1418 wait_for_master_cpu(cpu);
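[Editor's sketch] The asm-offsets_32.c and enable_sep_cpu() hunks above both lean on offsetofend() -- the offset of the first byte after a member -- to locate the top of the SYSENTER stack inside struct tss_struct instead of using sizeof(struct tss_struct). A standalone sketch of what that macro computes; struct demo is a made-up stand-in, not the real tss_struct:

#include <stddef.h>
#include <stdio.h>

/* Same shape as the kernel helper: offset of the first byte after MEMBER. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct demo {
	long ist[7];
	char SYSENTER_stack[64];   /* stand-in for the real field */
};

int main(void)
{
	/*
	 * base_address + offsetofend(...) is the address just past
	 * SYSENTER_stack, which is what the patch loads into
	 * MSR_IA32_SYSENTER_ESP in enable_sep_cpu().
	 */
	printf("offsetofend = %zu\n", offsetofend(struct demo, SYSENTER_stack));
	return 0;
}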
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..e2888a3ad1e3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2147,24 +2147,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
2147static unsigned long code_segment_base(struct pt_regs *regs) 2147static unsigned long code_segment_base(struct pt_regs *regs)
2148{ 2148{
2149 /* 2149 /*
2150 * For IA32 we look at the GDT/LDT segment base to convert the
2151 * effective IP to a linear address.
2152 */
2153
2154#ifdef CONFIG_X86_32
2155 /*
2150 * If we are in VM86 mode, add the segment offset to convert to a 2156 * If we are in VM86 mode, add the segment offset to convert to a
2151 * linear address. 2157 * linear address.
2152 */ 2158 */
2153 if (regs->flags & X86_VM_MASK) 2159 if (regs->flags & X86_VM_MASK)
2154 return 0x10 * regs->cs; 2160 return 0x10 * regs->cs;
2155 2161
2156 /*
2157 * For IA32 we look at the GDT/LDT segment base to convert the
2158 * effective IP to a linear address.
2159 */
2160#ifdef CONFIG_X86_32
2161 if (user_mode(regs) && regs->cs != __USER_CS) 2162 if (user_mode(regs) && regs->cs != __USER_CS)
2162 return get_segment_base(regs->cs); 2163 return get_segment_base(regs->cs);
2163#else 2164#else
2164 if (test_thread_flag(TIF_IA32)) { 2165 if (user_mode(regs) && !user_64bit_mode(regs) &&
2165 if (user_mode(regs) && regs->cs != __USER32_CS) 2166 regs->cs != __USER32_CS)
2166 return get_segment_base(regs->cs); 2167 return get_segment_base(regs->cs);
2167 }
2168#endif 2168#endif
2169 return 0; 2169 return 0;
2170} 2170}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f90c716..c76d3e37c6e1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
105#ifdef CONFIG_X86_32 105#ifdef CONFIG_X86_32
106 struct pt_regs fixed_regs; 106 struct pt_regs fixed_regs;
107 107
108 if (!user_mode_vm(regs)) { 108 if (!user_mode(regs)) {
109 crash_fixup_ss_esp(&fixed_regs, regs); 109 crash_fixup_ss_esp(&fixed_regs, regs);
110 regs = &fixed_regs; 110 regs = &fixed_regs;
111 } 111 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..ab3b65639a3e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -278,7 +278,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
278 print_modules(); 278 print_modules();
279 show_regs(regs); 279 show_regs(regs);
280#ifdef CONFIG_X86_32 280#ifdef CONFIG_X86_32
281 if (user_mode_vm(regs)) { 281 if (user_mode(regs)) {
282 sp = regs->sp; 282 sp = regs->sp;
283 ss = regs->ss & 0xffff; 283 ss = regs->ss & 0xffff;
284 } else { 284 } else {
@@ -307,7 +307,7 @@ void die(const char *str, struct pt_regs *regs, long err)
307 unsigned long flags = oops_begin(); 307 unsigned long flags = oops_begin();
308 int sig = SIGSEGV; 308 int sig = SIGSEGV;
309 309
310 if (!user_mode_vm(regs)) 310 if (!user_mode(regs))
311 report_bug(regs->ip, regs); 311 report_bug(regs->ip, regs);
312 312
313 if (__die(str, regs, err)) 313 if (__die(str, regs, err))
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..39891ff50d03 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -123,13 +123,13 @@ void show_regs(struct pt_regs *regs)
123 int i; 123 int i;
124 124
125 show_regs_print_info(KERN_EMERG); 125 show_regs_print_info(KERN_EMERG);
126 __show_regs(regs, !user_mode_vm(regs)); 126 __show_regs(regs, !user_mode(regs));
127 127
128 /* 128 /*
129 * When in-kernel, we also print out the stack and code at the 129 * When in-kernel, we also print out the stack and code at the
130 * time of the fault.. 130 * time of the fault..
131 */ 131 */
132 if (!user_mode_vm(regs)) { 132 if (!user_mode(regs)) {
133 unsigned int code_prologue = code_bytes * 43 / 64; 133 unsigned int code_prologue = code_bytes * 43 / 64;
134 unsigned int code_len = code_bytes; 134 unsigned int code_len = code_bytes;
135 unsigned char c; 135 unsigned char c;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..1c309763e321 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp:
395 /*CFI_REL_OFFSET cs, 0*/ 395 /*CFI_REL_OFFSET cs, 0*/
396 /* 396 /*
397 * Push current_thread_info()->sysenter_return to the stack. 397 * Push current_thread_info()->sysenter_return to the stack.
398 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 398 * A tiny bit of offset fixup is necessary: TI_sysenter_return
399 * pushed above; +8 corresponds to copy_thread's esp0 setting. 399 * is relative to thread_info, which is at the bottom of the
400 * kernel stack page. 4*4 means the 4 words pushed above;
401 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
402 * and THREAD_SIZE takes us to the bottom.
400 */ 403 */
401 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) 404 pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
402 CFI_REL_OFFSET eip, 0 405 CFI_REL_OFFSET eip, 0
403 406
404 pushl_cfi %eax 407 pushl_cfi %eax
@@ -432,7 +435,7 @@ sysenter_after_call:
432 TRACE_IRQS_OFF 435 TRACE_IRQS_OFF
433 movl TI_flags(%ebp), %ecx 436 movl TI_flags(%ebp), %ecx
434 testl $_TIF_ALLWORK_MASK, %ecx 437 testl $_TIF_ALLWORK_MASK, %ecx
435 jne sysexit_audit 438 jnz sysexit_audit
436sysenter_exit: 439sysenter_exit:
437/* if something modifies registers it must also disable sysexit */ 440/* if something modifies registers it must also disable sysexit */
438 movl PT_EIP(%esp), %edx 441 movl PT_EIP(%esp), %edx
@@ -460,7 +463,7 @@ sysenter_audit:
460 463
461sysexit_audit: 464sysexit_audit:
462 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 465 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
463 jne syscall_exit_work 466 jnz syscall_exit_work
464 TRACE_IRQS_ON 467 TRACE_IRQS_ON
465 ENABLE_INTERRUPTS(CLBR_ANY) 468 ENABLE_INTERRUPTS(CLBR_ANY)
466 movl %eax,%edx /* second arg, syscall return value */ 469 movl %eax,%edx /* second arg, syscall return value */
@@ -472,7 +475,7 @@ sysexit_audit:
472 TRACE_IRQS_OFF 475 TRACE_IRQS_OFF
473 movl TI_flags(%ebp), %ecx 476 movl TI_flags(%ebp), %ecx
474 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 477 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
475 jne syscall_exit_work 478 jnz syscall_exit_work
476 movl PT_EAX(%esp),%eax /* reload syscall return value */ 479 movl PT_EAX(%esp),%eax /* reload syscall return value */
477 jmp sysenter_exit 480 jmp sysenter_exit
478#endif 481#endif
@@ -510,7 +513,7 @@ syscall_exit:
510 TRACE_IRQS_OFF 513 TRACE_IRQS_OFF
511 movl TI_flags(%ebp), %ecx 514 movl TI_flags(%ebp), %ecx
512 testl $_TIF_ALLWORK_MASK, %ecx # current->work 515 testl $_TIF_ALLWORK_MASK, %ecx # current->work
513 jne syscall_exit_work 516 jnz syscall_exit_work
514 517
515restore_all: 518restore_all:
516 TRACE_IRQS_IRET 519 TRACE_IRQS_IRET
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and
612#ifdef CONFIG_VM86 615#ifdef CONFIG_VM86
613 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 616 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
614 movl %esp, %eax 617 movl %esp, %eax
615 jne work_notifysig_v86 # returning to kernel-space or 618 jnz work_notifysig_v86 # returning to kernel-space or
616 # vm86-space 619 # vm86-space
6171: 6201:
618#else 621#else
@@ -720,43 +723,22 @@ END(sysenter_badsys)
720.endm 723.endm
721 724
722/* 725/*
723 * Build the entry stubs and pointer table with some assembler magic. 726 * Build the entry stubs with some assembler magic.
724 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 727 * We pack 1 stub into every 8-byte block.
725 * single cache line on all modern x86 implementations.
726 */ 728 */
727.section .init.rodata,"a" 729 .align 8
728ENTRY(interrupt)
729.section .entry.text, "ax"
730 .p2align 5
731 .p2align CONFIG_X86_L1_CACHE_SHIFT
732ENTRY(irq_entries_start) 730ENTRY(irq_entries_start)
733 RING0_INT_FRAME 731 RING0_INT_FRAME
734vector=FIRST_EXTERNAL_VECTOR 732 vector=FIRST_EXTERNAL_VECTOR
735.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 733 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
736 .balign 32 734 pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
737 .rept 7 735 vector=vector+1
738 .if vector < FIRST_SYSTEM_VECTOR 736 jmp common_interrupt
739 .if vector <> FIRST_EXTERNAL_VECTOR
740 CFI_ADJUST_CFA_OFFSET -4 737 CFI_ADJUST_CFA_OFFSET -4
741 .endif 738 .align 8
7421: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ 739 .endr
743 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
744 jmp 2f
745 .endif
746 .previous
747 .long 1b
748 .section .entry.text, "ax"
749vector=vector+1
750 .endif
751 .endr
7522: jmp common_interrupt
753.endr
754END(irq_entries_start) 740END(irq_entries_start)
755 741
756.previous
757END(interrupt)
758.previous
759
760/* 742/*
761 * the CPU automatically disables interrupts when executing an IRQ vector, 743 * the CPU automatically disables interrupts when executing an IRQ vector,
762 * so IRQ-flags tracing has to follow that: 744 * so IRQ-flags tracing has to follow that:
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error)
816 pushl_cfi $0 798 pushl_cfi $0
817#ifdef CONFIG_X86_INVD_BUG 799#ifdef CONFIG_X86_INVD_BUG
818 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 800 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
819661: pushl_cfi $do_general_protection 801 ALTERNATIVE "pushl_cfi $do_general_protection", \
820662: 802 "pushl $do_simd_coprocessor_error", \
821.section .altinstructions,"a" 803 X86_FEATURE_XMM
822 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
823.previous
824.section .altinstr_replacement,"ax"
825663: pushl $do_simd_coprocessor_error
826664:
827.previous
828#else 804#else
829 pushl_cfi $do_simd_coprocessor_error 805 pushl_cfi $do_simd_coprocessor_error
830#endif 806#endif
@@ -1240,20 +1216,13 @@ error_code:
1240 /*CFI_REL_OFFSET es, 0*/ 1216 /*CFI_REL_OFFSET es, 0*/
1241 pushl_cfi %ds 1217 pushl_cfi %ds
1242 /*CFI_REL_OFFSET ds, 0*/ 1218 /*CFI_REL_OFFSET ds, 0*/
1243 pushl_cfi %eax 1219 pushl_cfi_reg eax
1244 CFI_REL_OFFSET eax, 0 1220 pushl_cfi_reg ebp
1245 pushl_cfi %ebp 1221 pushl_cfi_reg edi
1246 CFI_REL_OFFSET ebp, 0 1222 pushl_cfi_reg esi
1247 pushl_cfi %edi 1223 pushl_cfi_reg edx
1248 CFI_REL_OFFSET edi, 0 1224 pushl_cfi_reg ecx
1249 pushl_cfi %esi 1225 pushl_cfi_reg ebx
1250 CFI_REL_OFFSET esi, 0
1251 pushl_cfi %edx
1252 CFI_REL_OFFSET edx, 0
1253 pushl_cfi %ecx
1254 CFI_REL_OFFSET ecx, 0
1255 pushl_cfi %ebx
1256 CFI_REL_OFFSET ebx, 0
1257 cld 1226 cld
1258 movl $(__KERNEL_PERCPU), %ecx 1227 movl $(__KERNEL_PERCPU), %ecx
1259 movl %ecx, %fs 1228 movl %ecx, %fs
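[Editor's sketch] The reworked irq_entries_start above packs one interrupt stub into each 8-byte block and relies on the "always in signed byte range" remark: because ~vector+0x80 fits in a signed byte for every vector, the pushl assembles to the short push-imm8 encoding and the push plus the jmp stay within the 8-byte slot. A quick user-space check of that arithmetic; it scans 0x20-0xff, a superset of the FIRST_EXTERNAL_VECTOR..FIRST_SYSTEM_VECTOR range the .rept actually emits:

#include <stdio.h>

int main(void)
{
	int bad = 0;

	for (int vector = 0x20; vector <= 0xff; vector++) {
		int imm = ~vector + 0x80;       /* value the stub pushes     */

		if (imm < -128 || imm > 127)    /* must fit in push imm8     */
			bad++;
		if (~(imm - 0x80) != vector)    /* vector stays recoverable  */
			bad++;
	}
	printf("%s\n", bad ? "claim does not hold" :
	       "~vector+0x80 fits in a signed byte for all vectors 0x20-0xff");
	return 0;
}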
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f0095a76c182..c7b238494b31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@
14 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
16 * 16 *
17 * Normal syscalls and interrupts don't save a full stack frame, this is
18 * only done for syscall tracing, signals or fork/exec et.al.
19 *
20 * A note on terminology: 17 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP 18 * - iret frame: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack. 19 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all register saved.
25 * 20 *
26 * Some macro usage: 21 * Some macro usage:
27 * - CFI macros are used to generate dwarf2 unwind information for better 22 * - CFI macros are used to generate dwarf2 unwind information for better
28 * backtraces. They don't change any code. 23 * backtraces. They don't change any code.
29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
31 * There are unfortunately lots of special cases where some registers
32 * not touched. The macro is a big mess that should be cleaned up.
33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
34 * Gives a full stack frame.
35 * - ENTRY/END Define functions in the symbol table. 24 * - ENTRY/END Define functions in the symbol table.
36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
37 * frame that is otherwise undefined after a SYSCALL
38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. 25 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
39 * - idtentry - Define exception entry points. 26 * - idtentry - Define exception entry points.
40 */ 27 */
@@ -70,10 +57,6 @@
70 .section .entry.text, "ax" 57 .section .entry.text, "ax"
71 58
72 59
73#ifndef CONFIG_PREEMPT
74#define retint_kernel retint_restore_args
75#endif
76
77#ifdef CONFIG_PARAVIRT 60#ifdef CONFIG_PARAVIRT
78ENTRY(native_usergs_sysret64) 61ENTRY(native_usergs_sysret64)
79 swapgs 62 swapgs
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64)
82#endif /* CONFIG_PARAVIRT */ 65#endif /* CONFIG_PARAVIRT */
83 66
84 67
85.macro TRACE_IRQS_IRETQ offset=ARGOFFSET 68.macro TRACE_IRQS_IRETQ
86#ifdef CONFIG_TRACE_IRQFLAGS 69#ifdef CONFIG_TRACE_IRQFLAGS
87 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 70 bt $9,EFLAGS(%rsp) /* interrupts off? */
88 jnc 1f 71 jnc 1f
89 TRACE_IRQS_ON 72 TRACE_IRQS_ON
901: 731:
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64)
116 call debug_stack_reset 99 call debug_stack_reset
117.endm 100.endm
118 101
119.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET 102.macro TRACE_IRQS_IRETQ_DEBUG
120 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 103 bt $9,EFLAGS(%rsp) /* interrupts off? */
121 jnc 1f 104 jnc 1f
122 TRACE_IRQS_ON_DEBUG 105 TRACE_IRQS_ON_DEBUG
1231: 1061:
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64)
130#endif 113#endif
131 114
132/* 115/*
133 * C code is not supposed to know about undefined top of stack. Every time 116 * empty frame
134 * a C function with an pt_regs argument is called from the SYSCALL based
135 * fast path FIXUP_TOP_OF_STACK is needed.
136 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
137 * manipulation.
138 */
139
140 /* %rsp:at FRAMEEND */
141 .macro FIXUP_TOP_OF_STACK tmp offset=0
142 movq PER_CPU_VAR(old_rsp),\tmp
143 movq \tmp,RSP+\offset(%rsp)
144 movq $__USER_DS,SS+\offset(%rsp)
145 movq $__USER_CS,CS+\offset(%rsp)
146 movq RIP+\offset(%rsp),\tmp /* get rip */
147 movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
148 movq R11+\offset(%rsp),\tmp /* get eflags */
149 movq \tmp,EFLAGS+\offset(%rsp)
150 .endm
151
152 .macro RESTORE_TOP_OF_STACK tmp offset=0
153 movq RSP+\offset(%rsp),\tmp
154 movq \tmp,PER_CPU_VAR(old_rsp)
155 movq EFLAGS+\offset(%rsp),\tmp
156 movq \tmp,R11+\offset(%rsp)
157 .endm
158
159/*
160 * initial frame state for interrupts (and exceptions without error code)
161 */ 117 */
162 .macro EMPTY_FRAME start=1 offset=0 118 .macro EMPTY_FRAME start=1 offset=0
163 .if \start 119 .if \start
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64)
173 * initial frame state for interrupts (and exceptions without error code) 129 * initial frame state for interrupts (and exceptions without error code)
174 */ 130 */
175 .macro INTR_FRAME start=1 offset=0 131 .macro INTR_FRAME start=1 offset=0
176 EMPTY_FRAME \start, SS+8+\offset-RIP 132 EMPTY_FRAME \start, 5*8+\offset
177 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ 133 /*CFI_REL_OFFSET ss, 4*8+\offset*/
178 CFI_REL_OFFSET rsp, RSP+\offset-RIP 134 CFI_REL_OFFSET rsp, 3*8+\offset
179 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ 135 /*CFI_REL_OFFSET rflags, 2*8+\offset*/
180 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ 136 /*CFI_REL_OFFSET cs, 1*8+\offset*/
181 CFI_REL_OFFSET rip, RIP+\offset-RIP 137 CFI_REL_OFFSET rip, 0*8+\offset
182 .endm 138 .endm
183 139
184/* 140/*
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64)
186 * with vector already pushed) 142 * with vector already pushed)
187 */ 143 */
188 .macro XCPT_FRAME start=1 offset=0 144 .macro XCPT_FRAME start=1 offset=0
189 INTR_FRAME \start, RIP+\offset-ORIG_RAX 145 INTR_FRAME \start, 1*8+\offset
190 .endm
191
192/*
193 * frame that enables calling into C.
194 */
195 .macro PARTIAL_FRAME start=1 offset=0
196 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
197 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
198 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
199 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
200 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
201 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
202 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
203 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
204 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
205 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
206 .endm 146 .endm
207 147
208/* 148/*
209 * frame that enables passing a complete pt_regs to a C function. 149 * frame that enables passing a complete pt_regs to a C function.
210 */ 150 */
211 .macro DEFAULT_FRAME start=1 offset=0 151 .macro DEFAULT_FRAME start=1 offset=0
212 PARTIAL_FRAME \start, R11+\offset-R15 152 XCPT_FRAME \start, ORIG_RAX+\offset
153 CFI_REL_OFFSET rdi, RDI+\offset
154 CFI_REL_OFFSET rsi, RSI+\offset
155 CFI_REL_OFFSET rdx, RDX+\offset
156 CFI_REL_OFFSET rcx, RCX+\offset
157 CFI_REL_OFFSET rax, RAX+\offset
158 CFI_REL_OFFSET r8, R8+\offset
159 CFI_REL_OFFSET r9, R9+\offset
160 CFI_REL_OFFSET r10, R10+\offset
161 CFI_REL_OFFSET r11, R11+\offset
213 CFI_REL_OFFSET rbx, RBX+\offset 162 CFI_REL_OFFSET rbx, RBX+\offset
214 CFI_REL_OFFSET rbp, RBP+\offset 163 CFI_REL_OFFSET rbp, RBP+\offset
215 CFI_REL_OFFSET r12, R12+\offset 164 CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64)
218 CFI_REL_OFFSET r15, R15+\offset 167 CFI_REL_OFFSET r15, R15+\offset
219 .endm 168 .endm
220 169
221ENTRY(save_paranoid)
222 XCPT_FRAME 1 RDI+8
223 cld
224 movq %rdi, RDI+8(%rsp)
225 movq %rsi, RSI+8(%rsp)
226 movq_cfi rdx, RDX+8
227 movq_cfi rcx, RCX+8
228 movq_cfi rax, RAX+8
229 movq %r8, R8+8(%rsp)
230 movq %r9, R9+8(%rsp)
231 movq %r10, R10+8(%rsp)
232 movq %r11, R11+8(%rsp)
233 movq_cfi rbx, RBX+8
234 movq %rbp, RBP+8(%rsp)
235 movq %r12, R12+8(%rsp)
236 movq %r13, R13+8(%rsp)
237 movq %r14, R14+8(%rsp)
238 movq %r15, R15+8(%rsp)
239 movl $1,%ebx
240 movl $MSR_GS_BASE,%ecx
241 rdmsr
242 testl %edx,%edx
243 js 1f /* negative -> in kernel */
244 SWAPGS
245 xorl %ebx,%ebx
2461: ret
247 CFI_ENDPROC
248END(save_paranoid)
249
250/* 170/*
251 * A newly forked process directly context switches into this address. 171 * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
252 * 172 *
253 * rdi: prev task we switched from 173 * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
254 */ 174 * then loads new ss, cs, and rip from previously programmed MSRs.
255ENTRY(ret_from_fork) 175 * rflags gets masked by a value from another MSR (so CLD and CLAC
256 DEFAULT_FRAME 176 * are not needed). SYSCALL does not save anything on the stack
257 177 * and does not change rsp.
258 LOCK ; btr $TIF_FORK,TI_flags(%r8)
259
260 pushq_cfi $0x0002
261 popfq_cfi # reset kernel eflags
262
263 call schedule_tail # rdi: 'prev' task parameter
264
265 GET_THREAD_INFO(%rcx)
266
267 RESTORE_REST
268
269 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
270 jz 1f
271
272 /*
273 * By the time we get here, we have no idea whether our pt_regs,
274 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
275 * the slow path, or one of the ia32entry paths.
276 * Use int_ret_from_sys_call to return, since it can safely handle
277 * all of the above.
278 */
279 jmp int_ret_from_sys_call
280
2811:
282 subq $REST_SKIP, %rsp # leave space for volatiles
283 CFI_ADJUST_CFA_OFFSET REST_SKIP
284 movq %rbp, %rdi
285 call *%rbx
286 movl $0, RAX(%rsp)
287 RESTORE_REST
288 jmp int_ret_from_sys_call
289 CFI_ENDPROC
290END(ret_from_fork)
291
292/*
293 * System call entry. Up to 6 arguments in registers are supported.
294 * 178 *
295 * SYSCALL does not save anything on the stack and does not change the 179 * Registers on entry:
296 * stack pointer. However, it does mask the flags register for us, so
297 * CLD and CLAC are not needed.
298 */
299
300/*
301 * Register setup:
302 * rax system call number 180 * rax system call number
181 * rcx return address
182 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
303 * rdi arg0 183 * rdi arg0
304 * rcx return address for syscall/sysret, C arg3
305 * rsi arg1 184 * rsi arg1
306 * rdx arg2 185 * rdx arg2
307 * r10 arg3 (--> moved to rcx for C) 186 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
308 * r8 arg4 187 * r8 arg4
309 * r9 arg5 188 * r9 arg5
310 * r11 eflags for syscall/sysret, temporary for C 189 * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
311 * r12-r15,rbp,rbx saved by C code, not touched.
312 * 190 *
313 * Interrupts are off on entry.
314 * Only called from user space. 191 * Only called from user space.
315 * 192 *
316 * XXX if we had a free scratch register we could save the RSP into the stack frame 193 * When user can change pt_regs->foo always force IRET. That is because
317 * and report it properly in ps. Unfortunately we haven't.
318 *
319 * When user can change the frames always force IRET. That is because
320 * it deals with uncanonical addresses better. SYSRET has trouble 194 * it deals with uncanonical addresses better. SYSRET has trouble
321 * with them due to bugs in both AMD and Intel CPUs. 195 * with them due to bugs in both AMD and Intel CPUs.
322 */ 196 */
@@ -324,9 +198,15 @@ END(ret_from_fork)
324ENTRY(system_call) 198ENTRY(system_call)
325 CFI_STARTPROC simple 199 CFI_STARTPROC simple
326 CFI_SIGNAL_FRAME 200 CFI_SIGNAL_FRAME
327 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 201 CFI_DEF_CFA rsp,0
328 CFI_REGISTER rip,rcx 202 CFI_REGISTER rip,rcx
329 /*CFI_REGISTER rflags,r11*/ 203 /*CFI_REGISTER rflags,r11*/
204
205 /*
206 * Interrupts are off on entry.
207 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
208 * it is too small to ever cause noticeable irq latency.
209 */
330 SWAPGS_UNSAFE_STACK 210 SWAPGS_UNSAFE_STACK
331 /* 211 /*
332 * A hypervisor implementation might want to use a label 212 * A hypervisor implementation might want to use a label
@@ -335,18 +215,38 @@ ENTRY(system_call)
335 */ 215 */
336GLOBAL(system_call_after_swapgs) 216GLOBAL(system_call_after_swapgs)
337 217
338 movq %rsp,PER_CPU_VAR(old_rsp) 218 movq %rsp,PER_CPU_VAR(rsp_scratch)
339 movq PER_CPU_VAR(kernel_stack),%rsp 219 movq PER_CPU_VAR(kernel_stack),%rsp
220
221 /* Construct struct pt_regs on stack */
222 pushq_cfi $__USER_DS /* pt_regs->ss */
223 pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
340 /* 224 /*
341 * No need to follow this irqs off/on section - it's straight 225 * Re-enable interrupts.
342 * and short: 226 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
227 * must execute atomically in the face of possible interrupt-driven
228 * task preemption. We must enable interrupts only after we're done
229 * with using rsp_scratch:
343 */ 230 */
344 ENABLE_INTERRUPTS(CLBR_NONE) 231 ENABLE_INTERRUPTS(CLBR_NONE)
345 SAVE_ARGS 8, 0, rax_enosys=1 232 pushq_cfi %r11 /* pt_regs->flags */
346 movq_cfi rax,(ORIG_RAX-ARGOFFSET) 233 pushq_cfi $__USER_CS /* pt_regs->cs */
347 movq %rcx,RIP-ARGOFFSET(%rsp) 234 pushq_cfi %rcx /* pt_regs->ip */
348 CFI_REL_OFFSET rip,RIP-ARGOFFSET 235 CFI_REL_OFFSET rip,0
349 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 236 pushq_cfi_reg rax /* pt_regs->orig_ax */
237 pushq_cfi_reg rdi /* pt_regs->di */
238 pushq_cfi_reg rsi /* pt_regs->si */
239 pushq_cfi_reg rdx /* pt_regs->dx */
240 pushq_cfi_reg rcx /* pt_regs->cx */
241 pushq_cfi $-ENOSYS /* pt_regs->ax */
242 pushq_cfi_reg r8 /* pt_regs->r8 */
243 pushq_cfi_reg r9 /* pt_regs->r9 */
244 pushq_cfi_reg r10 /* pt_regs->r10 */
245 pushq_cfi_reg r11 /* pt_regs->r11 */
246 sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
247 CFI_ADJUST_CFA_OFFSET 6*8
248
249 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
350 jnz tracesys 250 jnz tracesys
351system_call_fastpath: 251system_call_fastpath:
352#if __SYSCALL_MASK == ~0 252#if __SYSCALL_MASK == ~0
@@ -355,18 +255,21 @@ system_call_fastpath:
355 andl $__SYSCALL_MASK,%eax 255 andl $__SYSCALL_MASK,%eax
356 cmpl $__NR_syscall_max,%eax 256 cmpl $__NR_syscall_max,%eax
357#endif 257#endif
358 ja ret_from_sys_call /* and return regs->ax */ 258 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
359 movq %r10,%rcx 259 movq %r10,%rcx
360 call *sys_call_table(,%rax,8) # XXX: rip relative 260 call *sys_call_table(,%rax,8)
361 movq %rax,RAX-ARGOFFSET(%rsp) 261 movq %rax,RAX(%rsp)
2621:
362/* 263/*
363 * Syscall return path ending with SYSRET (fast path) 264 * Syscall return path ending with SYSRET (fast path).
364 * Has incomplete stack frame and undefined top of stack. 265 * Has incompletely filled pt_regs.
365 */ 266 */
366ret_from_sys_call:
367 LOCKDEP_SYS_EXIT 267 LOCKDEP_SYS_EXIT
268 /*
269 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
270 * it is too small to ever cause noticeable irq latency.
271 */
368 DISABLE_INTERRUPTS(CLBR_NONE) 272 DISABLE_INTERRUPTS(CLBR_NONE)
369 TRACE_IRQS_OFF
370 273
371 /* 274 /*
372 * We must check ti flags with interrupts (or at least preemption) 275 * We must check ti flags with interrupts (or at least preemption)
@@ -376,72 +279,73 @@ ret_from_sys_call:
376 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is 279 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
377 * very bad. 280 * very bad.
378 */ 281 */
379 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 282 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
380 jnz int_ret_from_sys_call_fixup /* Go the the slow path */ 283 jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
381 284
382 CFI_REMEMBER_STATE 285 CFI_REMEMBER_STATE
383 /* 286
384 * sysretq will re-enable interrupts: 287 RESTORE_C_REGS_EXCEPT_RCX_R11
385 */ 288 movq RIP(%rsp),%rcx
386 TRACE_IRQS_ON
387 movq RIP-ARGOFFSET(%rsp),%rcx
388 CFI_REGISTER rip,rcx 289 CFI_REGISTER rip,rcx
389 RESTORE_ARGS 1,-ARG_SKIP,0 290 movq EFLAGS(%rsp),%r11
390 /*CFI_REGISTER rflags,r11*/ 291 /*CFI_REGISTER rflags,r11*/
391 movq PER_CPU_VAR(old_rsp), %rsp 292 movq RSP(%rsp),%rsp
293 /*
294 * 64bit SYSRET restores rip from rcx,
295 * rflags from r11 (but RF and VM bits are forced to 0),
296 * cs and ss are loaded from MSRs.
297 * Restoration of rflags re-enables interrupts.
298 */
392 USERGS_SYSRET64 299 USERGS_SYSRET64
393 300
394 CFI_RESTORE_STATE 301 CFI_RESTORE_STATE
395 302
396int_ret_from_sys_call_fixup: 303 /* Do syscall entry tracing */
397 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
398 jmp int_ret_from_sys_call_irqs_off
399
400 /* Do syscall tracing */
401tracesys: 304tracesys:
402 leaq -REST_SKIP(%rsp), %rdi 305 movq %rsp, %rdi
403 movq $AUDIT_ARCH_X86_64, %rsi 306 movl $AUDIT_ARCH_X86_64, %esi
404 call syscall_trace_enter_phase1 307 call syscall_trace_enter_phase1
405 test %rax, %rax 308 test %rax, %rax
406 jnz tracesys_phase2 /* if needed, run the slow path */ 309 jnz tracesys_phase2 /* if needed, run the slow path */
407 LOAD_ARGS 0 /* else restore clobbered regs */ 310 RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
311 movq ORIG_RAX(%rsp), %rax
408 jmp system_call_fastpath /* and return to the fast path */ 312 jmp system_call_fastpath /* and return to the fast path */
409 313
410tracesys_phase2: 314tracesys_phase2:
411 SAVE_REST 315 SAVE_EXTRA_REGS
412 FIXUP_TOP_OF_STACK %rdi
413 movq %rsp, %rdi 316 movq %rsp, %rdi
414 movq $AUDIT_ARCH_X86_64, %rsi 317 movl $AUDIT_ARCH_X86_64, %esi
415 movq %rax,%rdx 318 movq %rax,%rdx
416 call syscall_trace_enter_phase2 319 call syscall_trace_enter_phase2
417 320
418 /* 321 /*
419 * Reload arg registers from stack in case ptrace changed them. 322 * Reload registers from stack in case ptrace changed them.
420 * We don't reload %rax because syscall_trace_entry_phase2() returned 323 * We don't reload %rax because syscall_trace_entry_phase2() returned
421 * the value it wants us to use in the table lookup. 324 * the value it wants us to use in the table lookup.
422 */ 325 */
423 LOAD_ARGS ARGOFFSET, 1 326 RESTORE_C_REGS_EXCEPT_RAX
424 RESTORE_REST 327 RESTORE_EXTRA_REGS
425#if __SYSCALL_MASK == ~0 328#if __SYSCALL_MASK == ~0
426 cmpq $__NR_syscall_max,%rax 329 cmpq $__NR_syscall_max,%rax
427#else 330#else
428 andl $__SYSCALL_MASK,%eax 331 andl $__SYSCALL_MASK,%eax
429 cmpl $__NR_syscall_max,%eax 332 cmpl $__NR_syscall_max,%eax
430#endif 333#endif
431 ja int_ret_from_sys_call /* RAX(%rsp) is already set */ 334 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
432 movq %r10,%rcx /* fixup for C */ 335 movq %r10,%rcx /* fixup for C */
433 call *sys_call_table(,%rax,8) 336 call *sys_call_table(,%rax,8)
434 movq %rax,RAX-ARGOFFSET(%rsp) 337 movq %rax,RAX(%rsp)
435 /* Use IRET because user could have changed frame */ 3381:
339 /* Use IRET because user could have changed pt_regs->foo */
436 340
437/* 341/*
438 * Syscall return path ending with IRET. 342 * Syscall return path ending with IRET.
439 * Has correct top of stack, but partial stack frame. 343 * Has correct iret frame.
440 */ 344 */
441GLOBAL(int_ret_from_sys_call) 345GLOBAL(int_ret_from_sys_call)
442 DISABLE_INTERRUPTS(CLBR_NONE) 346 DISABLE_INTERRUPTS(CLBR_NONE)
347int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
443 TRACE_IRQS_OFF 348 TRACE_IRQS_OFF
444int_ret_from_sys_call_irqs_off:
445 movl $_TIF_ALLWORK_MASK,%edi 349 movl $_TIF_ALLWORK_MASK,%edi
446 /* edi: mask to check */ 350 /* edi: mask to check */
447GLOBAL(int_with_check) 351GLOBAL(int_with_check)
@@ -450,8 +354,8 @@ GLOBAL(int_with_check)
450 movl TI_flags(%rcx),%edx 354 movl TI_flags(%rcx),%edx
451 andl %edi,%edx 355 andl %edi,%edx
452 jnz int_careful 356 jnz int_careful
453 andl $~TS_COMPAT,TI_status(%rcx) 357 andl $~TS_COMPAT,TI_status(%rcx)
454 jmp retint_swapgs 358 jmp syscall_return
455 359
456 /* Either reschedule or signal or syscall exit tracking needed. */ 360 /* Either reschedule or signal or syscall exit tracking needed. */
457 /* First do a reschedule test. */ 361 /* First do a reschedule test. */
@@ -468,12 +372,11 @@ int_careful:
468 TRACE_IRQS_OFF 372 TRACE_IRQS_OFF
469 jmp int_with_check 373 jmp int_with_check
470 374
471 /* handle signals and tracing -- both require a full stack frame */ 375 /* handle signals and tracing -- both require a full pt_regs */
472int_very_careful: 376int_very_careful:
473 TRACE_IRQS_ON 377 TRACE_IRQS_ON
474 ENABLE_INTERRUPTS(CLBR_NONE) 378 ENABLE_INTERRUPTS(CLBR_NONE)
475int_check_syscall_exit_work: 379 SAVE_EXTRA_REGS
476 SAVE_REST
477 /* Check for syscall exit trace */ 380 /* Check for syscall exit trace */
478 testl $_TIF_WORK_SYSCALL_EXIT,%edx 381 testl $_TIF_WORK_SYSCALL_EXIT,%edx
479 jz int_signal 382 jz int_signal
@@ -492,86 +395,192 @@ int_signal:
492 call do_notify_resume 395 call do_notify_resume
4931: movl $_TIF_WORK_MASK,%edi 3961: movl $_TIF_WORK_MASK,%edi
494int_restore_rest: 397int_restore_rest:
495 RESTORE_REST 398 RESTORE_EXTRA_REGS
496 DISABLE_INTERRUPTS(CLBR_NONE) 399 DISABLE_INTERRUPTS(CLBR_NONE)
497 TRACE_IRQS_OFF 400 TRACE_IRQS_OFF
498 jmp int_with_check 401 jmp int_with_check
402
403syscall_return:
404 /* The IRETQ could re-enable interrupts: */
405 DISABLE_INTERRUPTS(CLBR_ANY)
406 TRACE_IRQS_IRETQ
407
408 /*
409 * Try to use SYSRET instead of IRET if we're returning to
410 * a completely clean 64-bit userspace context.
411 */
412 movq RCX(%rsp),%rcx
413 cmpq %rcx,RIP(%rsp) /* RCX == RIP */
414 jne opportunistic_sysret_failed
415
416 /*
417 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
418 * in kernel space. This essentially lets the user take over
419 * the kernel, since userspace controls RSP. It's not worth
420 * testing for canonicalness exactly -- this check detects any
421 * of the 17 high bits set, which is true for non-canonical
422 * or kernel addresses. (This will pessimize vsyscall=native.
423 * Big deal.)
424 *
425 * If virtual addresses ever become wider, this will need
426 * to be updated to remain correct on both old and new CPUs.
427 */
428 .ifne __VIRTUAL_MASK_SHIFT - 47
429 .error "virtual address width changed -- SYSRET checks need update"
430 .endif
431 shr $__VIRTUAL_MASK_SHIFT, %rcx
432 jnz opportunistic_sysret_failed
433
434 cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
435 jne opportunistic_sysret_failed
436
437 movq R11(%rsp),%r11
438 cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
439 jne opportunistic_sysret_failed
440
441 /*
442 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
443 * restoring TF results in a trap from userspace immediately after
444 * SYSRET. This would cause an infinite loop whenever #DB happens
445 * with register state that satisfies the opportunistic SYSRET
446 * conditions. For example, single-stepping this user code:
447 *
448 * movq $stuck_here,%rcx
449 * pushfq
450 * popq %r11
451 * stuck_here:
452 *
453 * would never get past 'stuck_here'.
454 */
455 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
456 jnz opportunistic_sysret_failed
457
458 /* nothing to check for RSP */
459
460 cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
461 jne opportunistic_sysret_failed
462
463 /*
464 * We win! This label is here just for ease of understanding
465 * perf profiles. Nothing jumps here.
466 */
467syscall_return_via_sysret:
468 CFI_REMEMBER_STATE
469 /* r11 is already restored (see code above) */
470 RESTORE_C_REGS_EXCEPT_R11
471 movq RSP(%rsp),%rsp
472 USERGS_SYSRET64
473 CFI_RESTORE_STATE
474
475opportunistic_sysret_failed:
476 SWAPGS
477 jmp restore_c_regs_and_iret
499 CFI_ENDPROC 478 CFI_ENDPROC
500END(system_call) 479END(system_call)
501 480
481
502 .macro FORK_LIKE func 482 .macro FORK_LIKE func
503ENTRY(stub_\func) 483ENTRY(stub_\func)
504 CFI_STARTPROC 484 CFI_STARTPROC
505 popq %r11 /* save return address */ 485 DEFAULT_FRAME 0, 8 /* offset 8: return address */
506 PARTIAL_FRAME 0 486 SAVE_EXTRA_REGS 8
507 SAVE_REST 487 jmp sys_\func
508 pushq %r11 /* put it back on stack */
509 FIXUP_TOP_OF_STACK %r11, 8
510 DEFAULT_FRAME 0 8 /* offset 8: return address */
511 call sys_\func
512 RESTORE_TOP_OF_STACK %r11, 8
513 ret $REST_SKIP /* pop extended registers */
514 CFI_ENDPROC 488 CFI_ENDPROC
515END(stub_\func) 489END(stub_\func)
516 .endm 490 .endm
517 491
518 .macro FIXED_FRAME label,func
519ENTRY(\label)
520 CFI_STARTPROC
521 PARTIAL_FRAME 0 8 /* offset 8: return address */
522 FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
523 call \func
524 RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
525 ret
526 CFI_ENDPROC
527END(\label)
528 .endm
529
530 FORK_LIKE clone 492 FORK_LIKE clone
531 FORK_LIKE fork 493 FORK_LIKE fork
532 FORK_LIKE vfork 494 FORK_LIKE vfork
533 FIXED_FRAME stub_iopl, sys_iopl
534 495
535ENTRY(stub_execve) 496ENTRY(stub_execve)
536 CFI_STARTPROC 497 CFI_STARTPROC
537 addq $8, %rsp 498 DEFAULT_FRAME 0, 8
538 PARTIAL_FRAME 0 499 call sys_execve
539 SAVE_REST 500return_from_execve:
540 FIXUP_TOP_OF_STACK %r11 501 testl %eax, %eax
541 call sys_execve 502 jz 1f
542 movq %rax,RAX(%rsp) 503 /* exec failed, can use fast SYSRET code path in this case */
543 RESTORE_REST 504 ret
544 jmp int_ret_from_sys_call 5051:
506 /* must use IRET code path (pt_regs->cs may have changed) */
507 addq $8, %rsp
508 CFI_ADJUST_CFA_OFFSET -8
509 ZERO_EXTRA_REGS
510 movq %rax,RAX(%rsp)
511 jmp int_ret_from_sys_call
545 CFI_ENDPROC 512 CFI_ENDPROC
546END(stub_execve) 513END(stub_execve)
547 514/*
548ENTRY(stub_execveat) 515 * Remaining execve stubs are only 7 bytes long.
516 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
517 */
518 .align 8
519GLOBAL(stub_execveat)
549 CFI_STARTPROC 520 CFI_STARTPROC
550 addq $8, %rsp 521 DEFAULT_FRAME 0, 8
551 PARTIAL_FRAME 0 522 call sys_execveat
552 SAVE_REST 523 jmp return_from_execve
553 FIXUP_TOP_OF_STACK %r11
554 call sys_execveat
555 RESTORE_TOP_OF_STACK %r11
556 movq %rax,RAX(%rsp)
557 RESTORE_REST
558 jmp int_ret_from_sys_call
559 CFI_ENDPROC 524 CFI_ENDPROC
560END(stub_execveat) 525END(stub_execveat)
561 526
527#ifdef CONFIG_X86_X32_ABI
528 .align 8
529GLOBAL(stub_x32_execve)
530 CFI_STARTPROC
531 DEFAULT_FRAME 0, 8
532 call compat_sys_execve
533 jmp return_from_execve
534 CFI_ENDPROC
535END(stub_x32_execve)
536 .align 8
537GLOBAL(stub_x32_execveat)
538 CFI_STARTPROC
539 DEFAULT_FRAME 0, 8
540 call compat_sys_execveat
541 jmp return_from_execve
542 CFI_ENDPROC
543END(stub_x32_execveat)
544#endif
545
546#ifdef CONFIG_IA32_EMULATION
547 .align 8
548GLOBAL(stub32_execve)
549 CFI_STARTPROC
550 call compat_sys_execve
551 jmp return_from_execve
552 CFI_ENDPROC
553END(stub32_execve)
554 .align 8
555GLOBAL(stub32_execveat)
556 CFI_STARTPROC
557 call compat_sys_execveat
558 jmp return_from_execve
559 CFI_ENDPROC
560END(stub32_execveat)
561#endif
562
562/* 563/*
563 * sigreturn is special because it needs to restore all registers on return. 564 * sigreturn is special because it needs to restore all registers on return.
564 * This cannot be done with SYSRET, so use the IRET return path instead. 565 * This cannot be done with SYSRET, so use the IRET return path instead.
565 */ 566 */
566ENTRY(stub_rt_sigreturn) 567ENTRY(stub_rt_sigreturn)
567 CFI_STARTPROC 568 CFI_STARTPROC
568 addq $8, %rsp 569 DEFAULT_FRAME 0, 8
569 PARTIAL_FRAME 0 570 /*
570 SAVE_REST 571 * SAVE_EXTRA_REGS result is not normally needed:
571 FIXUP_TOP_OF_STACK %r11 572 * sigreturn overwrites all pt_regs->GPREGS.
573 * But sigreturn can fail (!), and there is no easy way to detect that.
574 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
575 * we SAVE_EXTRA_REGS here.
576 */
577 SAVE_EXTRA_REGS 8
572 call sys_rt_sigreturn 578 call sys_rt_sigreturn
573 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 579return_from_stub:
574 RESTORE_REST 580 addq $8, %rsp
581 CFI_ADJUST_CFA_OFFSET -8
582 RESTORE_EXTRA_REGS
583 movq %rax,RAX(%rsp)
575 jmp int_ret_from_sys_call 584 jmp int_ret_from_sys_call
576 CFI_ENDPROC 585 CFI_ENDPROC
577END(stub_rt_sigreturn) 586END(stub_rt_sigreturn)
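The SAVE_EXTRA_REGS comment above can be read as the following sketch; the helpers are hypothetical, for illustration only:

    extern long sys_rt_sigreturn_stub(void);         /* stand-in for the real syscall */
    extern void restore_extra_regs_stub(void *regs); /* stand-in for RESTORE_EXTRA_REGS */

    static void stub_rt_sigreturn_sketch(void *regs, long *rax_slot)
    {
            /* SAVE_EXTRA_REGS has already written r12-r15/rbp/rbx into pt_regs */
            long ret = sys_rt_sigreturn_stub();

            /* On success every pt_regs GPR was rewritten from the signal frame, so
             * the values saved above are dead; on failure nothing was rewritten,
             * and without that save the restore below would reload garbage. */
            restore_extra_regs_stub(regs);
            *rax_slot = ret;
            /* return to user space through int_ret_from_sys_call (IRET path) */
    }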
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn)
579#ifdef CONFIG_X86_X32_ABI 588#ifdef CONFIG_X86_X32_ABI
580ENTRY(stub_x32_rt_sigreturn) 589ENTRY(stub_x32_rt_sigreturn)
581 CFI_STARTPROC 590 CFI_STARTPROC
582 addq $8, %rsp 591 DEFAULT_FRAME 0, 8
583 PARTIAL_FRAME 0 592 SAVE_EXTRA_REGS 8
584 SAVE_REST
585 FIXUP_TOP_OF_STACK %r11
586 call sys32_x32_rt_sigreturn 593 call sys32_x32_rt_sigreturn
587 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 594 jmp return_from_stub
588 RESTORE_REST
589 jmp int_ret_from_sys_call
590 CFI_ENDPROC 595 CFI_ENDPROC
591END(stub_x32_rt_sigreturn) 596END(stub_x32_rt_sigreturn)
597#endif
592 598
593ENTRY(stub_x32_execve) 599/*
594 CFI_STARTPROC 600 * A newly forked process directly context switches into this address.
595 addq $8, %rsp 601 *
596 PARTIAL_FRAME 0 602 * rdi: prev task we switched from
597 SAVE_REST 603 */
598 FIXUP_TOP_OF_STACK %r11 604ENTRY(ret_from_fork)
599 call compat_sys_execve 605 DEFAULT_FRAME
600 RESTORE_TOP_OF_STACK %r11
601 movq %rax,RAX(%rsp)
602 RESTORE_REST
603 jmp int_ret_from_sys_call
604 CFI_ENDPROC
605END(stub_x32_execve)
606 606
607ENTRY(stub_x32_execveat) 607 LOCK ; btr $TIF_FORK,TI_flags(%r8)
608 CFI_STARTPROC 608
609 addq $8, %rsp 609 pushq_cfi $0x0002
610 PARTIAL_FRAME 0 610 popfq_cfi # reset kernel eflags
611 SAVE_REST 611
612 FIXUP_TOP_OF_STACK %r11 612 call schedule_tail # rdi: 'prev' task parameter
613 call compat_sys_execveat 613
614 RESTORE_TOP_OF_STACK %r11 614 RESTORE_EXTRA_REGS
615 movq %rax,RAX(%rsp) 615
616 RESTORE_REST 616 testl $3,CS(%rsp) # from kernel_thread?
617
618 /*
619 * By the time we get here, we have no idea whether our pt_regs,
620 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
621 * the slow path, or one of the ia32entry paths.
622 * Use IRET code path to return, since it can safely handle
623 * all of the above.
624 */
625 jnz int_ret_from_sys_call
626
627 /* We came from kernel_thread */
628 /* nb: we depend on RESTORE_EXTRA_REGS above */
629 movq %rbp, %rdi
630 call *%rbx
631 movl $0, RAX(%rsp)
632 RESTORE_EXTRA_REGS
617 jmp int_ret_from_sys_call 633 jmp int_ret_from_sys_call
618 CFI_ENDPROC 634 CFI_ENDPROC
619END(stub_x32_execveat) 635END(ret_from_fork)
620
621#endif
622 636
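For orientation, the control flow of the new ret_from_fork above, sketched in C; the signature and helper are assumptions, the real code stays in assembly:

    struct fork_frame_sketch {                  /* the state the assembly consults */
            unsigned long cs;                   /* pt_regs->cs of the new task */
            unsigned long ax;                   /* pt_regs->ax slot */
            int (*fn)(void *);                  /* callee-saved rbx: kernel_thread function */
            void *arg;                          /* callee-saved rbp: its argument */
    };

    extern void schedule_tail_stub(void *prev); /* stand-in for schedule_tail() */

    static void ret_from_fork_sketch(void *prev, struct fork_frame_sketch *f)
    {
            schedule_tail_stub(prev);           /* rdi: the task we switched away from */

            if ((f->cs & 3) == 0) {             /* CPL 0: we came from kernel_thread() */
                    f->fn(f->arg);              /* run the kernel thread function */
                    f->ax = 0;                  /* report 0 in the RAX slot */
            }
            /* Either way, return through the IRET path: pt_regs, TI flags and TI
             * status may have come from the 64-bit fast path, the slow path or one
             * of the ia32entry paths, and only the IRET code handles all of them. */
    }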
623/* 637/*
624 * Build the entry stubs and pointer table with some assembler magic. 638 * Build the entry stubs with some assembler magic.
625 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 639 * We pack 1 stub into every 8-byte block.
626 * single cache line on all modern x86 implementations.
627 */ 640 */
628 .section .init.rodata,"a" 641 .align 8
629ENTRY(interrupt)
630 .section .entry.text
631 .p2align 5
632 .p2align CONFIG_X86_L1_CACHE_SHIFT
633ENTRY(irq_entries_start) 642ENTRY(irq_entries_start)
634 INTR_FRAME 643 INTR_FRAME
635vector=FIRST_EXTERNAL_VECTOR 644 vector=FIRST_EXTERNAL_VECTOR
636.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 645 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
637 .balign 32 646 pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
638 .rept 7 647 vector=vector+1
639 .if vector < FIRST_SYSTEM_VECTOR 648 jmp common_interrupt
640 .if vector <> FIRST_EXTERNAL_VECTOR
641 CFI_ADJUST_CFA_OFFSET -8 649 CFI_ADJUST_CFA_OFFSET -8
642 .endif 650 .align 8
6431: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ 651 .endr
644 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
645 jmp 2f
646 .endif
647 .previous
648 .quad 1b
649 .section .entry.text
650vector=vector+1
651 .endif
652 .endr
6532: jmp common_interrupt
654.endr
655 CFI_ENDPROC 652 CFI_ENDPROC
656END(irq_entries_start) 653END(irq_entries_start)
657 654
658.previous
659END(interrupt)
660.previous
661
662/* 655/*
663 * Interrupt entry/exit. 656 * Interrupt entry/exit.
664 * 657 *
@@ -669,47 +662,45 @@ END(interrupt)
669 662
670/* 0(%rsp): ~(interrupt number) */ 663/* 0(%rsp): ~(interrupt number) */
671 .macro interrupt func 664 .macro interrupt func
672 /* reserve pt_regs for scratch regs and rbp */
673 subq $ORIG_RAX-RBP, %rsp
674 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
675 cld 665 cld
676 /* start from rbp in pt_regs and jump over */ 666 /*
677 movq_cfi rdi, (RDI-RBP) 667 * Since nothing in interrupt handling code touches r12...r15 members
678 movq_cfi rsi, (RSI-RBP) 668 * of "struct pt_regs", and since interrupts can nest, we can save
679 movq_cfi rdx, (RDX-RBP) 669 * four stack slots and simultaneously provide
680 movq_cfi rcx, (RCX-RBP) 670 * an unwind-friendly stack layout by saving "truncated" pt_regs
681 movq_cfi rax, (RAX-RBP) 671 * exactly up to rbp slot, without these members.
682 movq_cfi r8, (R8-RBP) 672 */
683 movq_cfi r9, (R9-RBP) 673 ALLOC_PT_GPREGS_ON_STACK -RBP
684 movq_cfi r10, (R10-RBP) 674 SAVE_C_REGS -RBP
685 movq_cfi r11, (R11-RBP) 675 /* this goes to 0(%rsp) for unwinder, not for saving the value: */
686 676 SAVE_EXTRA_REGS_RBP -RBP
687 /* Save rbp so that we can unwind from get_irq_regs() */
688 movq_cfi rbp, 0
689
690 /* Save previous stack value */
691 movq %rsp, %rsi
692 677
693 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 678 leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
694 testl $3, CS-RBP(%rsi) 679
680 testl $3, CS-RBP(%rsp)
695 je 1f 681 je 1f
696 SWAPGS 682 SWAPGS
6831:
697 /* 684 /*
685 * Save previous stack pointer, optionally switch to interrupt stack.
698 * irq_count is used to check if a CPU is already on an interrupt stack 686 * irq_count is used to check if a CPU is already on an interrupt stack
699 * or not. While this is essentially redundant with preempt_count it is 687 * or not. While this is essentially redundant with preempt_count it is
700 * a little cheaper to use a separate counter in the PDA (short of 688 * a little cheaper to use a separate counter in the PDA (short of
701 * moving irq_enter into assembly, which would be too much work) 689 * moving irq_enter into assembly, which would be too much work)
702 */ 690 */
7031: incl PER_CPU_VAR(irq_count) 691 movq %rsp, %rsi
692 incl PER_CPU_VAR(irq_count)
704 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp 693 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
705 CFI_DEF_CFA_REGISTER rsi 694 CFI_DEF_CFA_REGISTER rsi
706
707 /* Store previous stack value */
708 pushq %rsi 695 pushq %rsi
696 /*
697 * For debugger:
698 * "CFA (Current Frame Address) is the value on stack + offset"
699 */
709 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 700 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
710 0x77 /* DW_OP_breg7 */, 0, \ 701 0x77 /* DW_OP_breg7 (rsp) */, 0, \
711 0x06 /* DW_OP_deref */, \ 702 0x06 /* DW_OP_deref */, \
712 0x08 /* DW_OP_const1u */, SS+8-RBP, \ 703 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
713 0x22 /* DW_OP_plus */ 704 0x22 /* DW_OP_plus */
714 /* We entered an interrupt context - irqs are off: */ 705 /* We entered an interrupt context - irqs are off: */
715 TRACE_IRQS_OFF 706 TRACE_IRQS_OFF
@@ -727,7 +718,7 @@ common_interrupt:
727 ASM_CLAC 718 ASM_CLAC
728 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 719 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
729 interrupt do_IRQ 720 interrupt do_IRQ
730 /* 0(%rsp): old_rsp-ARGOFFSET */ 721 /* 0(%rsp): old RSP */
731ret_from_intr: 722ret_from_intr:
732 DISABLE_INTERRUPTS(CLBR_NONE) 723 DISABLE_INTERRUPTS(CLBR_NONE)
733 TRACE_IRQS_OFF 724 TRACE_IRQS_OFF
@@ -735,19 +726,18 @@ ret_from_intr:
735 726
736 /* Restore saved previous stack */ 727 /* Restore saved previous stack */
737 popq %rsi 728 popq %rsi
738 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ 729 CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
739 leaq ARGOFFSET-RBP(%rsi), %rsp 730 /* return code expects complete pt_regs - adjust rsp accordingly: */
731 leaq -RBP(%rsi),%rsp
740 CFI_DEF_CFA_REGISTER rsp 732 CFI_DEF_CFA_REGISTER rsp
741 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 733 CFI_ADJUST_CFA_OFFSET RBP
742 734
743exit_intr: 735 testl $3,CS(%rsp)
744 GET_THREAD_INFO(%rcx)
745 testl $3,CS-ARGOFFSET(%rsp)
746 je retint_kernel 736 je retint_kernel
747
748 /* Interrupt came from user space */ 737 /* Interrupt came from user space */
738
739 GET_THREAD_INFO(%rcx)
749 /* 740 /*
750 * Has a correct top of stack, but a partial stack frame
751 * %rcx: thread info. Interrupts off. 741 * %rcx: thread info. Interrupts off.
752 */ 742 */
753retint_with_reschedule: 743retint_with_reschedule:
@@ -766,84 +756,34 @@ retint_swapgs: /* return to user-space */
766 DISABLE_INTERRUPTS(CLBR_ANY) 756 DISABLE_INTERRUPTS(CLBR_ANY)
767 TRACE_IRQS_IRETQ 757 TRACE_IRQS_IRETQ
768 758
769 /*
770 * Try to use SYSRET instead of IRET if we're returning to
771 * a completely clean 64-bit userspace context.
772 */
773 movq (RCX-R11)(%rsp), %rcx
774 cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
775 jne opportunistic_sysret_failed
776
777 /*
778 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
779 * in kernel space. This essentially lets the user take over
780 * the kernel, since userspace controls RSP. It's not worth
781 * testing for canonicalness exactly -- this check detects any
782 * of the 17 high bits set, which is true for non-canonical
783 * or kernel addresses. (This will pessimize vsyscall=native.
784 * Big deal.)
785 *
786 * If virtual addresses ever become wider, this will need
787 * to be updated to remain correct on both old and new CPUs.
788 */
789 .ifne __VIRTUAL_MASK_SHIFT - 47
790 .error "virtual address width changed -- sysret checks need update"
791 .endif
792 shr $__VIRTUAL_MASK_SHIFT, %rcx
793 jnz opportunistic_sysret_failed
794
795 cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
796 jne opportunistic_sysret_failed
797
798 movq (R11-ARGOFFSET)(%rsp), %r11
799 cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
800 jne opportunistic_sysret_failed
801
802 /*
803 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
804 * restoring TF results in a trap from userspace immediately after
805 * SYSRET. This would cause an infinite loop whenever #DB happens
806 * with register state that satisfies the opportunistic SYSRET
807 * conditions. For example, single-stepping this user code:
808 *
809 * movq $stuck_here,%rcx
810 * pushfq
811 * popq %r11
812 * stuck_here:
813 *
814 * would never get past 'stuck_here'.
815 */
816 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
817 jnz opportunistic_sysret_failed
818
819 /* nothing to check for RSP */
820
821 cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
822 jne opportunistic_sysret_failed
823
824 /*
825 * We win! This label is here just for ease of understanding
826 * perf profiles. Nothing jumps here.
827 */
828irq_return_via_sysret:
829 CFI_REMEMBER_STATE
830 RESTORE_ARGS 1,8,1
831 movq (RSP-RIP)(%rsp),%rsp
832 USERGS_SYSRET64
833 CFI_RESTORE_STATE
834
835opportunistic_sysret_failed:
836 SWAPGS 759 SWAPGS
837 jmp restore_args 760 jmp restore_c_regs_and_iret
838 761
839retint_restore_args: /* return to kernel space */ 762/* Returning to kernel space */
840 DISABLE_INTERRUPTS(CLBR_ANY) 763retint_kernel:
764#ifdef CONFIG_PREEMPT
765 /* Interrupts are off */
766 /* Check if we need preemption */
767 bt $9,EFLAGS(%rsp) /* interrupts were off? */
768 jnc 1f
7690: cmpl $0,PER_CPU_VAR(__preempt_count)
770 jnz 1f
771 call preempt_schedule_irq
772 jmp 0b
7731:
774#endif
841 /* 775 /*
842 * The iretq could re-enable interrupts: 776 * The iretq could re-enable interrupts:
843 */ 777 */
844 TRACE_IRQS_IRETQ 778 TRACE_IRQS_IRETQ
845restore_args: 779
846 RESTORE_ARGS 1,8,1 780/*
781 * At this label, code paths which return to kernel and to user,
782 * which come from interrupts/exception and from syscalls, merge.
783 */
784restore_c_regs_and_iret:
785 RESTORE_C_REGS
786 REMOVE_PT_GPREGS_FROM_STACK 8
847 787
848irq_return: 788irq_return:
849 INTERRUPT_RETURN 789 INTERRUPT_RETURN
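The new in-place retint_kernel preemption check above, read as C; a sketch, with the per-CPU preempt count behind an assumed helper:

    #define X86_EFLAGS_IF (1u << 9)                 /* interrupt-enable flag, bit 9 */

    extern unsigned int preempt_count_stub(void);   /* stand-in for PER_CPU __preempt_count */
    extern void preempt_schedule_irq(void);         /* real kernel function, declared for the sketch */

    static void retint_kernel_sketch(unsigned long saved_flags)
    {
            /* bt $9, EFLAGS(%rsp): only preempt if the interrupted kernel
             * context had interrupts enabled */
            if (!(saved_flags & X86_EFLAGS_IF))
                    return;

            /* preempt_schedule_irq() may return with preemption pending again,
             * hence the loop (the "jmp 0b" above) */
            while (preempt_count_stub() == 0)
                    preempt_schedule_irq();
    }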
@@ -914,28 +854,17 @@ retint_signal:
914 jz retint_swapgs 854 jz retint_swapgs
915 TRACE_IRQS_ON 855 TRACE_IRQS_ON
916 ENABLE_INTERRUPTS(CLBR_NONE) 856 ENABLE_INTERRUPTS(CLBR_NONE)
917 SAVE_REST 857 SAVE_EXTRA_REGS
918 movq $-1,ORIG_RAX(%rsp) 858 movq $-1,ORIG_RAX(%rsp)
919 xorl %esi,%esi # oldset 859 xorl %esi,%esi # oldset
920 movq %rsp,%rdi # &pt_regs 860 movq %rsp,%rdi # &pt_regs
921 call do_notify_resume 861 call do_notify_resume
922 RESTORE_REST 862 RESTORE_EXTRA_REGS
923 DISABLE_INTERRUPTS(CLBR_NONE) 863 DISABLE_INTERRUPTS(CLBR_NONE)
924 TRACE_IRQS_OFF 864 TRACE_IRQS_OFF
925 GET_THREAD_INFO(%rcx) 865 GET_THREAD_INFO(%rcx)
926 jmp retint_with_reschedule 866 jmp retint_with_reschedule
927 867
928#ifdef CONFIG_PREEMPT
929 /* Returning to kernel space. Check if we need preemption */
930 /* rcx: threadinfo. interrupts off. */
931ENTRY(retint_kernel)
932 cmpl $0,PER_CPU_VAR(__preempt_count)
933 jnz retint_restore_args
934 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
935 jnc retint_restore_args
936 call preempt_schedule_irq
937 jmp exit_intr
938#endif
939 CFI_ENDPROC 868 CFI_ENDPROC
940END(common_interrupt) 869END(common_interrupt)
941 870
@@ -1024,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \
1024/* 953/*
1025 * Exception entry points. 954 * Exception entry points.
1026 */ 955 */
1027#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) 956#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
1028 957
1029.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 958.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
1030ENTRY(\sym) 959ENTRY(\sym)
@@ -1046,8 +975,7 @@ ENTRY(\sym)
1046 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 975 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1047 .endif 976 .endif
1048 977
1049 subq $ORIG_RAX-R15, %rsp 978 ALLOC_PT_GPREGS_ON_STACK
1050 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1051 979
1052 .if \paranoid 980 .if \paranoid
1053 .if \paranoid == 1 981 .if \paranoid == 1
@@ -1055,10 +983,11 @@ ENTRY(\sym)
1055 testl $3, CS(%rsp) /* If coming from userspace, switch */ 983 testl $3, CS(%rsp) /* If coming from userspace, switch */
1056 jnz 1f /* stacks. */ 984 jnz 1f /* stacks. */
1057 .endif 985 .endif
1058 call save_paranoid 986 call paranoid_entry
1059 .else 987 .else
1060 call error_entry 988 call error_entry
1061 .endif 989 .endif
990 /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
1062 991
1063 DEFAULT_FRAME 0 992 DEFAULT_FRAME 0
1064 993
@@ -1080,19 +1009,20 @@ ENTRY(\sym)
1080 .endif 1009 .endif
1081 1010
1082 .if \shift_ist != -1 1011 .if \shift_ist != -1
1083 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1012 subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1084 .endif 1013 .endif
1085 1014
1086 call \do_sym 1015 call \do_sym
1087 1016
1088 .if \shift_ist != -1 1017 .if \shift_ist != -1
1089 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1018 addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1090 .endif 1019 .endif
1091 1020
1021 /* these procedures expect "no swapgs" flag in ebx */
1092 .if \paranoid 1022 .if \paranoid
1093 jmp paranoid_exit /* %ebx: no swapgs flag */ 1023 jmp paranoid_exit
1094 .else 1024 .else
1095 jmp error_exit /* %ebx: no swapgs flag */ 1025 jmp error_exit
1096 .endif 1026 .endif
1097 1027
1098 .if \paranoid == 1 1028 .if \paranoid == 1
@@ -1296,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
1296 addq $0x30,%rsp 1226 addq $0x30,%rsp
1297 CFI_ADJUST_CFA_OFFSET -0x30 1227 CFI_ADJUST_CFA_OFFSET -0x30
1298 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ 1228 pushq_cfi $-1 /* orig_ax = -1 => not a system call */
1299 SAVE_ALL 1229 ALLOC_PT_GPREGS_ON_STACK
1230 SAVE_C_REGS
1231 SAVE_EXTRA_REGS
1300 jmp error_exit 1232 jmp error_exit
1301 CFI_ENDPROC 1233 CFI_ENDPROC
1302END(xen_failsafe_callback) 1234END(xen_failsafe_callback)
@@ -1328,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
1328idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) 1260idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
1329#endif 1261#endif
1330 1262
1331 /* 1263/*
1332 * "Paranoid" exit path from exception stack. This is invoked 1264 * Save all registers in pt_regs, and switch gs if needed.
1333 * only on return from non-NMI IST interrupts that came 1265 * Use slow, but surefire "are we in kernel?" check.
1334 * from kernel space. 1266 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1335 * 1267 */
1336 * We may be returning to very strange contexts (e.g. very early 1268ENTRY(paranoid_entry)
1337 * in syscall entry), so checking for preemption here would 1269 XCPT_FRAME 1 15*8
1338 * be complicated. Fortunately, there's no good reason 1270 cld
1339 * to try to handle preemption here. 1271 SAVE_C_REGS 8
1340 */ 1272 SAVE_EXTRA_REGS 8
1273 movl $1,%ebx
1274 movl $MSR_GS_BASE,%ecx
1275 rdmsr
1276 testl %edx,%edx
1277 js 1f /* negative -> in kernel */
1278 SWAPGS
1279 xorl %ebx,%ebx
12801: ret
1281 CFI_ENDPROC
1282END(paranoid_entry)
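The "slow, but surefire" in-kernel check in paranoid_entry amounts to the following; a sketch in which the MSR read and SWAPGS are shown through stand-in helpers:

    #define MSR_GS_BASE 0xc0000101u                          /* architectural GS.base MSR */

    extern unsigned long long rdmsr_stub(unsigned int msr);  /* stand-in for rdmsr */
    extern void swapgs_stub(void);                           /* stand-in for SWAPGS */

    /* Returns the value the assembly leaves in %ebx: 1 = kernel GS already live,
     * 0 = we swapped here, so paranoid_exit must swap back before returning. */
    static int paranoid_entry_sketch(void)
    {
            unsigned long long gsbase = rdmsr_stub(MSR_GS_BASE);

            if ((long long)gsbase < 0)      /* sign bit set: kernel address, GS is fine */
                    return 1;

            swapgs_stub();                  /* user GS base was live: switch to kernel GS */
            return 0;
    }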
1341 1283
1342 /* ebx: no swapgs flag */ 1284/*
1285 * "Paranoid" exit path from exception stack. This is invoked
1286 * only on return from non-NMI IST interrupts that came
1287 * from kernel space.
1288 *
1289 * We may be returning to very strange contexts (e.g. very early
1290 * in syscall entry), so checking for preemption here would
1291 * be complicated. Fortunately, there's no good reason
1292 * to try to handle preemption here.
1293 */
1294/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1343ENTRY(paranoid_exit) 1295ENTRY(paranoid_exit)
1344 DEFAULT_FRAME 1296 DEFAULT_FRAME
1345 DISABLE_INTERRUPTS(CLBR_NONE) 1297 DISABLE_INTERRUPTS(CLBR_NONE)
1346 TRACE_IRQS_OFF_DEBUG 1298 TRACE_IRQS_OFF_DEBUG
1347 testl %ebx,%ebx /* swapgs needed? */ 1299 testl %ebx,%ebx /* swapgs needed? */
1348 jnz paranoid_restore 1300 jnz paranoid_exit_no_swapgs
1349 TRACE_IRQS_IRETQ 0 1301 TRACE_IRQS_IRETQ
1350 SWAPGS_UNSAFE_STACK 1302 SWAPGS_UNSAFE_STACK
1351 RESTORE_ALL 8 1303 jmp paranoid_exit_restore
1352 INTERRUPT_RETURN 1304paranoid_exit_no_swapgs:
1353paranoid_restore: 1305 TRACE_IRQS_IRETQ_DEBUG
1354 TRACE_IRQS_IRETQ_DEBUG 0 1306paranoid_exit_restore:
1355 RESTORE_ALL 8 1307 RESTORE_EXTRA_REGS
1308 RESTORE_C_REGS
1309 REMOVE_PT_GPREGS_FROM_STACK 8
1356 INTERRUPT_RETURN 1310 INTERRUPT_RETURN
1357 CFI_ENDPROC 1311 CFI_ENDPROC
1358END(paranoid_exit) 1312END(paranoid_exit)
1359 1313
1360/* 1314/*
1361 * Exception entry point. This expects an error code/orig_rax on the stack. 1315 * Save all registers in pt_regs, and switch gs if needed.
1362 * returns in "no swapgs flag" in %ebx. 1316 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1363 */ 1317 */
1364ENTRY(error_entry) 1318ENTRY(error_entry)
1365 XCPT_FRAME 1319 XCPT_FRAME 1 15*8
1366 CFI_ADJUST_CFA_OFFSET 15*8
1367 /* oldrax contains error code */
1368 cld 1320 cld
1369 movq %rdi, RDI+8(%rsp) 1321 SAVE_C_REGS 8
1370 movq %rsi, RSI+8(%rsp) 1322 SAVE_EXTRA_REGS 8
1371 movq %rdx, RDX+8(%rsp)
1372 movq %rcx, RCX+8(%rsp)
1373 movq %rax, RAX+8(%rsp)
1374 movq %r8, R8+8(%rsp)
1375 movq %r9, R9+8(%rsp)
1376 movq %r10, R10+8(%rsp)
1377 movq %r11, R11+8(%rsp)
1378 movq_cfi rbx, RBX+8
1379 movq %rbp, RBP+8(%rsp)
1380 movq %r12, R12+8(%rsp)
1381 movq %r13, R13+8(%rsp)
1382 movq %r14, R14+8(%rsp)
1383 movq %r15, R15+8(%rsp)
1384 xorl %ebx,%ebx 1323 xorl %ebx,%ebx
1385 testl $3,CS+8(%rsp) 1324 testl $3,CS+8(%rsp)
1386 je error_kernelspace 1325 je error_kernelspace
@@ -1390,12 +1329,12 @@ error_sti:
1390 TRACE_IRQS_OFF 1329 TRACE_IRQS_OFF
1391 ret 1330 ret
1392 1331
1393/* 1332 /*
1394 * There are two places in the kernel that can potentially fault with 1333 * There are two places in the kernel that can potentially fault with
1395 * usergs. Handle them here. B stepping K8s sometimes report a 1334 * usergs. Handle them here. B stepping K8s sometimes report a
1396 * truncated RIP for IRET exceptions returning to compat mode. Check 1335 * truncated RIP for IRET exceptions returning to compat mode. Check
1397 * for these here too. 1336 * for these here too.
1398 */ 1337 */
1399error_kernelspace: 1338error_kernelspace:
1400 CFI_REL_OFFSET rcx, RCX+8 1339 CFI_REL_OFFSET rcx, RCX+8
1401 incl %ebx 1340 incl %ebx
@@ -1425,11 +1364,11 @@ error_bad_iret:
1425END(error_entry) 1364END(error_entry)
1426 1365
1427 1366
1428/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ 1367/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1429ENTRY(error_exit) 1368ENTRY(error_exit)
1430 DEFAULT_FRAME 1369 DEFAULT_FRAME
1431 movl %ebx,%eax 1370 movl %ebx,%eax
1432 RESTORE_REST 1371 RESTORE_EXTRA_REGS
1433 DISABLE_INTERRUPTS(CLBR_NONE) 1372 DISABLE_INTERRUPTS(CLBR_NONE)
1434 TRACE_IRQS_OFF 1373 TRACE_IRQS_OFF
1435 GET_THREAD_INFO(%rcx) 1374 GET_THREAD_INFO(%rcx)
@@ -1444,19 +1383,7 @@ ENTRY(error_exit)
1444 CFI_ENDPROC 1383 CFI_ENDPROC
1445END(error_exit) 1384END(error_exit)
1446 1385
1447/* 1386/* Runs on exception stack */
1448 * Test if a given stack is an NMI stack or not.
1449 */
1450 .macro test_in_nmi reg stack nmi_ret normal_ret
1451 cmpq %\reg, \stack
1452 ja \normal_ret
1453 subq $EXCEPTION_STKSZ, %\reg
1454 cmpq %\reg, \stack
1455 jb \normal_ret
1456 jmp \nmi_ret
1457 .endm
1458
1459 /* runs on exception stack */
1460ENTRY(nmi) 1387ENTRY(nmi)
1461 INTR_FRAME 1388 INTR_FRAME
1462 PARAVIRT_ADJUST_EXCEPTION_FRAME 1389 PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1492,7 +1419,7 @@ ENTRY(nmi)
1492 * NMI. 1419 * NMI.
1493 */ 1420 */
1494 1421
1495 /* Use %rdx as out temp variable throughout */ 1422 /* Use %rdx as our temp variable throughout */
1496 pushq_cfi %rdx 1423 pushq_cfi %rdx
1497 CFI_REL_OFFSET rdx, 0 1424 CFI_REL_OFFSET rdx, 0
1498 1425
@@ -1517,8 +1444,17 @@ ENTRY(nmi)
1517 * We check the variable because the first NMI could be in a 1444 * We check the variable because the first NMI could be in a
1518 * breakpoint routine using a breakpoint stack. 1445 * breakpoint routine using a breakpoint stack.
1519 */ 1446 */
1520 lea 6*8(%rsp), %rdx 1447 lea 6*8(%rsp), %rdx
1521 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi 1448 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1449 cmpq %rdx, 4*8(%rsp)
1450 /* If the stack pointer is above the NMI stack, this is a normal NMI */
1451 ja first_nmi
1452 subq $EXCEPTION_STKSZ, %rdx
1453 cmpq %rdx, 4*8(%rsp)
1454 /* If it is below the NMI stack, it is a normal NMI */
1455 jb first_nmi
1456 /* Ah, it is within the NMI stack, treat it as nested */
1457
1522 CFI_REMEMBER_STATE 1458 CFI_REMEMBER_STATE
1523 1459
1524nested_nmi: 1460nested_nmi:
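In C terms, the inlined stack-range test above (which replaces the old test_in_nmi macro) is roughly the following; a sketch with assumed types, and the EXCEPTION_STKSZ value is an assumption:

    #include <stdbool.h>
    #include <stdint.h>

    #define EXCEPTION_STKSZ_SKETCH 4096     /* assumed; the asm uses the real constant */

    /* rsp: stack pointer inside the NMI handler, so rsp + 6*8 is the top of the
     * NMI stack; prev_sp: the RSP the CPU saved for the interrupted context,
     * i.e. 4*8(%rsp).  True means that context already ran on this NMI stack. */
    static bool nmi_is_nested_sketch(uint64_t rsp, uint64_t prev_sp)
    {
            uint64_t nmi_stack_top = rsp + 6 * 8;

            if (prev_sp > nmi_stack_top)                           /* above the NMI stack */
                    return false;                                  /* -> first_nmi */
            if (prev_sp < nmi_stack_top - EXCEPTION_STKSZ_SKETCH)  /* below it */
                    return false;                                  /* -> first_nmi */
            return true;                                           /* within it: nested NMI */
    }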
@@ -1611,7 +1547,7 @@ first_nmi:
1611 .rept 5 1547 .rept 5
1612 pushq_cfi 11*8(%rsp) 1548 pushq_cfi 11*8(%rsp)
1613 .endr 1549 .endr
1614 CFI_DEF_CFA_OFFSET SS+8-RIP 1550 CFI_DEF_CFA_OFFSET 5*8
1615 1551
1616 /* Everything up to here is safe from nested NMIs */ 1552 /* Everything up to here is safe from nested NMIs */
1617 1553
@@ -1639,7 +1575,7 @@ repeat_nmi:
1639 pushq_cfi -6*8(%rsp) 1575 pushq_cfi -6*8(%rsp)
1640 .endr 1576 .endr
1641 subq $(5*8), %rsp 1577 subq $(5*8), %rsp
1642 CFI_DEF_CFA_OFFSET SS+8-RIP 1578 CFI_DEF_CFA_OFFSET 5*8
1643end_repeat_nmi: 1579end_repeat_nmi:
1644 1580
1645 /* 1581 /*
@@ -1648,16 +1584,16 @@ end_repeat_nmi:
1648 * so that we repeat another NMI. 1584 * so that we repeat another NMI.
1649 */ 1585 */
1650 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1586 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1651 subq $ORIG_RAX-R15, %rsp 1587 ALLOC_PT_GPREGS_ON_STACK
1652 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1588
1653 /* 1589 /*
1654 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit 1590 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1655 * as we should not be calling schedule in NMI context. 1591 * as we should not be calling schedule in NMI context.
1656 * Even with normal interrupts enabled. An NMI should not be 1592 * Even with normal interrupts enabled. An NMI should not be
1657 * setting NEED_RESCHED or anything that normal interrupts and 1593 * setting NEED_RESCHED or anything that normal interrupts and
1658 * exceptions might do. 1594 * exceptions might do.
1659 */ 1595 */
1660 call save_paranoid 1596 call paranoid_entry
1661 DEFAULT_FRAME 0 1597 DEFAULT_FRAME 0
1662 1598
1663 /* 1599 /*
@@ -1688,8 +1624,10 @@ end_repeat_nmi:
1688nmi_swapgs: 1624nmi_swapgs:
1689 SWAPGS_UNSAFE_STACK 1625 SWAPGS_UNSAFE_STACK
1690nmi_restore: 1626nmi_restore:
1627 RESTORE_EXTRA_REGS
1628 RESTORE_C_REGS
1691 /* Pop the extra iret frame at once */ 1629 /* Pop the extra iret frame at once */
1692 RESTORE_ALL 6*8 1630 REMOVE_PT_GPREGS_FROM_STACK 6*8
1693 1631
1694 /* Clear the NMI executing stack variable */ 1632 /* Clear the NMI executing stack variable */
1695 movq $0, 5*8(%rsp) 1633 movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..d031bad9e07e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/percpu.h> 23#include <asm/percpu.h>
24#include <asm/nops.h> 24#include <asm/nops.h>
25#include <asm/bootparam.h>
25 26
26/* Physical address */ 27/* Physical address */
27#define pa(X) ((X) - __PAGE_OFFSET) 28#define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
90 91
91 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 92 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
92 us to not reload segments */ 93 us to not reload segments */
93 testb $(1<<6), BP_loadflags(%esi) 94 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
94 jnz 2f 95 jnz 2f
95 96
96/* 97/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9f69a..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit 2 * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
3 * 3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
56 * %rsi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
57 * 57 *
58 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
59 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86/boot/compressed/head_64.S.
60 * 60 *
61 * We only come here initially at boot nothing else comes here. 61 * We only come here initially at boot nothing else comes here.
62 * 62 *
@@ -146,7 +146,7 @@ startup_64:
146 leaq level2_kernel_pgt(%rip), %rdi 146 leaq level2_kernel_pgt(%rip), %rdi
147 leaq 4096(%rdi), %r8 147 leaq 4096(%rdi), %r8
148 /* See if it is a valid page table entry */ 148 /* See if it is a valid page table entry */
1491: testq $1, 0(%rdi) 1491: testb $1, 0(%rdi)
150 jz 2f 150 jz 2f
151 addq %rbp, 0(%rdi) 151 addq %rbp, 0(%rdi)
152 /* Go to the next page */ 152 /* Go to the next page */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fce0b71..29c740deafec 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
68static inline bool interrupted_user_mode(void) 68static inline bool interrupted_user_mode(void)
69{ 69{
70 struct pt_regs *regs = get_irq_regs(); 70 struct pt_regs *regs = get_irq_regs();
71 return regs && user_mode_vm(regs); 71 return regs && user_mode(regs);
72} 72}
73 73
74/* 74/*
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..37dae792dbbe 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
54 * because the ->io_bitmap_max value must match the bitmap 54 * because the ->io_bitmap_max value must match the bitmap
55 * contents: 55 * contents:
56 */ 56 */
57 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(cpu_tss, get_cpu());
58 58
59 if (turn_on) 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num); 60 bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5eb8f4..f9fd86a7fcc7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
165 if (unlikely(!desc)) 165 if (unlikely(!desc))
166 return false; 166 return false;
167 167
168 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { 168 if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
169 if (unlikely(overflow)) 169 if (unlikely(overflow))
170 print_stack_overflow(); 170 print_stack_overflow();
171 desc->handle_irq(irq, desc); 171 desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d5558c..394e643d7830 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
44 u64 estack_top, estack_bottom; 44 u64 estack_top, estack_bottom;
45 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
46 46
47 if (user_mode_vm(regs)) 47 if (user_mode(regs))
48 return; 48 return;
49 49
50 if (regs->sp >= curbase + sizeof(struct thread_info) + 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181ea1eac..cd10a6437264 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
178#endif 178#endif
179 for_each_clear_bit_from(i, used_vectors, first_system_vector) { 179 for_each_clear_bit_from(i, used_vectors, first_system_vector) {
180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ 180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
181 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 181 set_intr_gate(i, irq_entries_start +
182 8 * (i - FIRST_EXTERNAL_VECTOR));
182 } 183 }
183#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
184 for_each_clear_bit_from(i, used_vectors, NR_VECTORS) 185 for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
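This works because the rewritten irq_entries_start (earlier in this diff) emits exactly one 8-byte stub per external vector, so the gate address is a simple linear computation; an illustrative sketch, assuming the symbol is visible as a byte array:

    extern const char irq_entries_start[];      /* start of the per-vector entry stubs */

    /* Each stub is padded with ".align 8", so stubs sit 8 bytes apart. */
    static const void *irq_entry_for_vector_sketch(unsigned int vector,
                                                   unsigned int first_external_vector)
    {
            return irq_entries_start + 8 * (vector - first_external_vector);
    }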
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 25ecd56cefa8..d6178d9791db 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
126#ifdef CONFIG_X86_32 126#ifdef CONFIG_X86_32
127 switch (regno) { 127 switch (regno) {
128 case GDB_SS: 128 case GDB_SS:
129 if (!user_mode_vm(regs)) 129 if (!user_mode(regs))
130 *(unsigned long *)mem = __KERNEL_DS; 130 *(unsigned long *)mem = __KERNEL_DS;
131 break; 131 break;
132 case GDB_SP: 132 case GDB_SP:
133 if (!user_mode_vm(regs)) 133 if (!user_mode(regs))
134 *(unsigned long *)mem = kernel_stack_pointer(regs); 134 *(unsigned long *)mem = kernel_stack_pointer(regs);
135 break; 135 break;
136 case GDB_GS: 136 case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9621fe..24d079604fd5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -602,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
602 struct kprobe *p; 602 struct kprobe *p;
603 struct kprobe_ctlblk *kcb; 603 struct kprobe_ctlblk *kcb;
604 604
605 if (user_mode_vm(regs)) 605 if (user_mode(regs))
606 return 0; 606 return 0;
607 607
608 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 608 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1007,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
1007 struct die_args *args = data; 1007 struct die_args *args = data;
1008 int ret = NOTIFY_DONE; 1008 int ret = NOTIFY_DONE;
1009 1009
1010 if (args->regs && user_mode_vm(args->regs)) 1010 if (args->regs && user_mode(args->regs))
1011 return ret; 1011 return ret;
1012 1012
1013 if (val == DIE_GPF) { 1013 if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d1ac80b72c72..005c03e93fc5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -33,6 +33,7 @@
33 33
34#include <asm/page.h> 34#include <asm/page.h>
35#include <asm/pgtable.h> 35#include <asm/pgtable.h>
36#include <asm/setup.h>
36 37
37#if 0 38#if 0
38#define DEBUGP(fmt, ...) \ 39#define DEBUGP(fmt, ...) \
@@ -47,21 +48,13 @@ do { \
47 48
48#ifdef CONFIG_RANDOMIZE_BASE 49#ifdef CONFIG_RANDOMIZE_BASE
49static unsigned long module_load_offset; 50static unsigned long module_load_offset;
50static int randomize_modules = 1;
51 51
52/* Mutex protects the module_load_offset. */ 52/* Mutex protects the module_load_offset. */
53static DEFINE_MUTEX(module_kaslr_mutex); 53static DEFINE_MUTEX(module_kaslr_mutex);
54 54
55static int __init parse_nokaslr(char *p)
56{
57 randomize_modules = 0;
58 return 0;
59}
60early_param("nokaslr", parse_nokaslr);
61
62static unsigned long int get_module_load_offset(void) 55static unsigned long int get_module_load_offset(void)
63{ 56{
64 if (randomize_modules) { 57 if (kaslr_enabled()) {
65 mutex_lock(&module_kaslr_mutex); 58 mutex_lock(&module_kaslr_mutex);
66 /* 59 /*
67 * Calculate the module_load_offset the first time this 60 * Calculate the module_load_offset the first time this
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861cc5ee8..da8cb987b973 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user,
131 } 131 }
132 132
133 /* 133 /*
134 * RIP, flags, and the argument registers are usually saved. 134 * These registers are always saved on 64-bit syscall entry.
135 * orig_ax is probably okay, too. 135 * On 32-bit entry points, they are saved too except r8..r11.
136 */ 136 */
137 regs_user_copy->ip = user_regs->ip; 137 regs_user_copy->ip = user_regs->ip;
138 regs_user_copy->ax = user_regs->ax;
138 regs_user_copy->cx = user_regs->cx; 139 regs_user_copy->cx = user_regs->cx;
139 regs_user_copy->dx = user_regs->dx; 140 regs_user_copy->dx = user_regs->dx;
140 regs_user_copy->si = user_regs->si; 141 regs_user_copy->si = user_regs->si;
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
145 regs_user_copy->r11 = user_regs->r11; 146 regs_user_copy->r11 = user_regs->r11;
146 regs_user_copy->orig_ax = user_regs->orig_ax; 147 regs_user_copy->orig_ax = user_regs->orig_ax;
147 regs_user_copy->flags = user_regs->flags; 148 regs_user_copy->flags = user_regs->flags;
149 regs_user_copy->sp = user_regs->sp;
150 regs_user_copy->cs = user_regs->cs;
151 regs_user_copy->ss = user_regs->ss;
148 152
149 /* 153 /*
150 * Don't even try to report the "rest" regs. 154 * Most system calls don't save these registers, don't report them.
151 */ 155 */
152 regs_user_copy->bx = -1; 156 regs_user_copy->bx = -1;
153 regs_user_copy->bp = -1; 157 regs_user_copy->bp = -1;
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user,
158 162
159 /* 163 /*
160 * For this to be at all useful, we need a reasonable guess for 164 * For this to be at all useful, we need a reasonable guess for
161 * sp and the ABI. Be careful: we're in NMI context, and we're 165 * the ABI. Be careful: we're in NMI context, and we're
162 * considering current to be the current task, so we should 166 * considering current to be the current task, so we should
163 * be careful not to look at any other percpu variables that might 167 * be careful not to look at any other percpu variables that might
164 * change during context switches. 168 * change during context switches.
165 */ 169 */
166 if (IS_ENABLED(CONFIG_IA32_EMULATION) && 170 regs_user->abi = user_64bit_mode(user_regs) ?
167 task_thread_info(current)->status & TS_COMPAT) { 171 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
168 /* Easy case: we're in a compat syscall. */
169 regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
170 regs_user_copy->sp = user_regs->sp;
171 regs_user_copy->cs = user_regs->cs;
172 regs_user_copy->ss = user_regs->ss;
173 } else if (user_regs->orig_ax != -1) {
174 /*
175 * We're probably in a 64-bit syscall.
176 * Warning: this code is severely racy. At least it's better
177 * than just blindly copying user_regs.
178 */
179 regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
180 regs_user_copy->sp = this_cpu_read(old_rsp);
181 regs_user_copy->cs = __USER_CS;
182 regs_user_copy->ss = __USER_DS;
183 regs_user_copy->cx = -1; /* usually contains garbage */
184 } else {
185 /* We're probably in an interrupt or exception. */
186 regs_user->abi = user_64bit_mode(user_regs) ?
187 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
188 regs_user_copy->sp = user_regs->sp;
189 regs_user_copy->cs = user_regs->cs;
190 regs_user_copy->ss = user_regs->ss;
191 }
192 172
193 regs_user->regs = regs_user_copy; 173 regs_user->regs = regs_user_copy;
194} 174}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7af7b6478637..0c8992dbead5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -38,7 +38,26 @@
38 * section. Since TSS's are completely CPU-local, we want them 38 * section. Since TSS's are completely CPU-local, we want them
39 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 39 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
40 */ 40 */
41__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; 41__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
42 .x86_tss = {
43 .sp0 = TOP_OF_INIT_STACK,
44#ifdef CONFIG_X86_32
45 .ss0 = __KERNEL_DS,
46 .ss1 = __KERNEL_CS,
47 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
48#endif
49 },
50#ifdef CONFIG_X86_32
51 /*
52 * Note that the .io_bitmap member must be extra-big. This is because
53 * the CPU will access an additional byte beyond the end of the IO
54 * permission bitmap. The extra byte must be all 1 bits, and must
55 * be within the limit.
56 */
57 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
58#endif
59};
60EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
42 61
43#ifdef CONFIG_X86_64 62#ifdef CONFIG_X86_64
44static DEFINE_PER_CPU(unsigned char, is_idle); 63static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -110,7 +129,7 @@ void exit_thread(void)
110 unsigned long *bp = t->io_bitmap_ptr; 129 unsigned long *bp = t->io_bitmap_ptr;
111 130
112 if (bp) { 131 if (bp) {
113 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 132 struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
114 133
115 t->io_bitmap_ptr = NULL; 134 t->io_bitmap_ptr = NULL;
116 clear_thread_flag(TIF_IO_BITMAP); 135 clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f99cb5a..8ed2106b06da 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
73 unsigned long sp; 73 unsigned long sp;
74 unsigned short ss, gs; 74 unsigned short ss, gs;
75 75
76 if (user_mode_vm(regs)) { 76 if (user_mode(regs)) {
77 sp = regs->sp; 77 sp = regs->sp;
78 ss = regs->ss & 0xffff; 78 ss = regs->ss & 0xffff;
79 gs = get_user_gs(regs); 79 gs = get_user_gs(regs);
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
206 regs->ip = new_ip; 206 regs->ip = new_ip;
207 regs->sp = new_sp; 207 regs->sp = new_sp;
208 regs->flags = X86_EFLAGS_IF; 208 regs->flags = X86_EFLAGS_IF;
209 /* 209 force_iret();
210 * force it to the iret return path by making it look as if there was
211 * some work pending.
212 */
213 set_thread_flag(TIF_NOTIFY_RESUME);
214} 210}
215EXPORT_SYMBOL_GPL(start_thread); 211EXPORT_SYMBOL_GPL(start_thread);
216 212
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
248 struct thread_struct *prev = &prev_p->thread, 244 struct thread_struct *prev = &prev_p->thread,
249 *next = &next_p->thread; 245 *next = &next_p->thread;
250 int cpu = smp_processor_id(); 246 int cpu = smp_processor_id();
251 struct tss_struct *tss = &per_cpu(init_tss, cpu); 247 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
252 fpu_switch_t fpu; 248 fpu_switch_t fpu;
253 249
254 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 250 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
256 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 252 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
257 253
258 /* 254 /*
259 * Reload esp0.
260 */
261 load_sp0(tss, next);
262
263 /*
264 * Save away %gs. No need to save %fs, as it was saved on the 255 * Save away %gs. No need to save %fs, as it was saved on the
265 * stack on entry. No need to save %es and %ds, as those are 256 * stack on entry. No need to save %es and %ds, as those are
266 * always kernel segments while inside the kernel. Doing this 257 * always kernel segments while inside the kernel. Doing this
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
310 */ 301 */
311 arch_end_context_switch(next_p); 302 arch_end_context_switch(next_p);
312 303
304 /*
305 * Reload esp0, kernel_stack, and current_top_of_stack. This changes
306 * current_thread_info().
307 */
308 load_sp0(tss, next);
313 this_cpu_write(kernel_stack, 309 this_cpu_write(kernel_stack,
314 (unsigned long)task_stack_page(next_p) + 310 (unsigned long)task_stack_page(next_p) +
315 THREAD_SIZE - KERNEL_STACK_OFFSET); 311 THREAD_SIZE);
312 this_cpu_write(cpu_current_top_of_stack,
313 (unsigned long)task_stack_page(next_p) +
314 THREAD_SIZE);
316 315
317 /* 316 /*
318 * Restore %gs if needed (which is common) 317 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43577d2..4baaa972f52a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
52 52
53asmlinkage extern void ret_from_fork(void); 53asmlinkage extern void ret_from_fork(void);
54 54
55__visible DEFINE_PER_CPU(unsigned long, old_rsp); 55__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
56 56
57/* Prints also some state that isn't saved in the pt_regs */ 57/* Prints also some state that isn't saved in the pt_regs */
58void __show_regs(struct pt_regs *regs, int all) 58void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; 161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
162 childregs = task_pt_regs(p); 162 childregs = task_pt_regs(p);
163 p->thread.sp = (unsigned long) childregs; 163 p->thread.sp = (unsigned long) childregs;
164 p->thread.usersp = me->thread.usersp;
165 set_tsk_thread_flag(p, TIF_FORK); 164 set_tsk_thread_flag(p, TIF_FORK);
166 p->thread.io_bitmap_ptr = NULL; 165 p->thread.io_bitmap_ptr = NULL;
167 166
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
207 */ 206 */
208 if (clone_flags & CLONE_SETTLS) { 207 if (clone_flags & CLONE_SETTLS) {
209#ifdef CONFIG_IA32_EMULATION 208#ifdef CONFIG_IA32_EMULATION
210 if (test_thread_flag(TIF_IA32)) 209 if (is_ia32_task())
211 err = do_set_thread_area(p, -1, 210 err = do_set_thread_area(p, -1,
212 (struct user_desc __user *)childregs->si, 0); 211 (struct user_desc __user *)childregs->si, 0);
213 else 212 else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
235 loadsegment(es, _ds); 234 loadsegment(es, _ds);
236 loadsegment(ds, _ds); 235 loadsegment(ds, _ds);
237 load_gs_index(0); 236 load_gs_index(0);
238 current->thread.usersp = new_sp;
239 regs->ip = new_ip; 237 regs->ip = new_ip;
240 regs->sp = new_sp; 238 regs->sp = new_sp;
241 this_cpu_write(old_rsp, new_sp);
242 regs->cs = _cs; 239 regs->cs = _cs;
243 regs->ss = _ss; 240 regs->ss = _ss;
244 regs->flags = X86_EFLAGS_IF; 241 regs->flags = X86_EFLAGS_IF;
242 force_iret();
245} 243}
246 244
247void 245void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
277 struct thread_struct *prev = &prev_p->thread; 275 struct thread_struct *prev = &prev_p->thread;
278 struct thread_struct *next = &next_p->thread; 276 struct thread_struct *next = &next_p->thread;
279 int cpu = smp_processor_id(); 277 int cpu = smp_processor_id();
280 struct tss_struct *tss = &per_cpu(init_tss, cpu); 278 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
281 unsigned fsindex, gsindex; 279 unsigned fsindex, gsindex;
282 fpu_switch_t fpu; 280 fpu_switch_t fpu;
283 281
284 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 282 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
285 283
286 /* Reload esp0 and ss1. */
287 load_sp0(tss, next);
288
289 /* We must save %fs and %gs before load_TLS() because 284 /* We must save %fs and %gs before load_TLS() because
290 * %fs and %gs may be cleared by load_TLS(). 285 * %fs and %gs may be cleared by load_TLS().
291 * 286 *
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
401 /* 396 /*
402 * Switch the PDA and FPU contexts. 397 * Switch the PDA and FPU contexts.
403 */ 398 */
404 prev->usersp = this_cpu_read(old_rsp);
405 this_cpu_write(old_rsp, next->usersp);
406 this_cpu_write(current_task, next_p); 399 this_cpu_write(current_task, next_p);
407 400
408 /* 401 /*
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
413 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); 406 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
414 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); 407 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
415 408
409 /* Reload esp0 and ss1. This changes current_thread_info(). */
410 load_sp0(tss, next);
411
416 this_cpu_write(kernel_stack, 412 this_cpu_write(kernel_stack,
417 (unsigned long)task_stack_page(next_p) + 413 (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
418 THREAD_SIZE - KERNEL_STACK_OFFSET);
419 414
420 /* 415 /*
421 * Now maybe reload the debug registers and handle I/O bitmaps 416 * Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr)
602 597
603unsigned long KSTK_ESP(struct task_struct *task) 598unsigned long KSTK_ESP(struct task_struct *task)
604{ 599{
605 return (test_tsk_thread_flag(task, TIF_IA32)) ? 600 return task_pt_regs(task)->sp;
606 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
607} 601}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618b2e91..a7bc79480719 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
364 case offsetof(struct user_regs_struct,cs): 364 case offsetof(struct user_regs_struct,cs):
365 if (unlikely(value == 0)) 365 if (unlikely(value == 0))
366 return -EIO; 366 return -EIO;
367#ifdef CONFIG_IA32_EMULATION 367 task_pt_regs(task)->cs = value;
368 if (test_tsk_thread_flag(task, TIF_IA32))
369 task_pt_regs(task)->cs = value;
370#endif
371 break; 368 break;
372 case offsetof(struct user_regs_struct,ss): 369 case offsetof(struct user_regs_struct,ss):
373 if (unlikely(value == 0)) 370 if (unlikely(value == 0))
374 return -EIO; 371 return -EIO;
375#ifdef CONFIG_IA32_EMULATION 372 task_pt_regs(task)->ss = value;
376 if (test_tsk_thread_flag(task, TIF_IA32))
377 task_pt_regs(task)->ss = value;
378#endif
379 break; 373 break;
380 } 374 }
381 375
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
1421 memset(info, 0, sizeof(*info)); 1415 memset(info, 0, sizeof(*info));
1422 info->si_signo = SIGTRAP; 1416 info->si_signo = SIGTRAP;
1423 info->si_code = si_code; 1417 info->si_code = si_code;
1424 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; 1418 info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
1425} 1419}
1426 1420
1427void user_single_step_siginfo(struct task_struct *tsk, 1421void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
226 movl (%ebx), %ecx 226 movl (%ebx), %ecx
227 addl $4, %ebx 227 addl $4, %ebx
2281: 2281:
229 testl $0x1, %ecx /* is it a destination page */ 229 testb $0x1, %cl /* is it a destination page */
230 jz 2f 230 jz 2f
231 movl %ecx, %edi 231 movl %ecx, %edi
232 andl $0xfffff000, %edi 232 andl $0xfffff000, %edi
233 jmp 0b 233 jmp 0b
2342: 2342:
235 testl $0x2, %ecx /* is it an indirection page */ 235 testb $0x2, %cl /* is it an indirection page */
236 jz 2f 236 jz 2f
237 movl %ecx, %ebx 237 movl %ecx, %ebx
238 andl $0xfffff000, %ebx 238 andl $0xfffff000, %ebx
239 jmp 0b 239 jmp 0b
2402: 2402:
241 testl $0x4, %ecx /* is it the done indicator */ 241 testb $0x4, %cl /* is it the done indicator */
242 jz 2f 242 jz 2f
243 jmp 3f 243 jmp 3f
2442: 2442:
245 testl $0x8, %ecx /* is it the source indicator */ 245 testb $0x8, %cl /* is it the source indicator */
246 jz 0b /* Ignore it otherwise */ 246 jz 0b /* Ignore it otherwise */
247 movl %ecx, %esi /* For every source page do a copy */ 247 movl %ecx, %esi /* For every source page do a copy */
248 andl $0xfffff000, %esi 248 andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..98111b38ebfd 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -123,7 +123,7 @@ identity_mapped:
123 * Set cr4 to a known state: 123 * Set cr4 to a known state:
124 * - physical address extension enabled 124 * - physical address extension enabled
125 */ 125 */
126 movq $X86_CR4_PAE, %rax 126 movl $X86_CR4_PAE, %eax
127 movq %rax, %cr4 127 movq %rax, %cr4
128 128
129 jmp 1f 129 jmp 1f
@@ -221,23 +221,23 @@ swap_pages:
221 movq (%rbx), %rcx 221 movq (%rbx), %rcx
222 addq $8, %rbx 222 addq $8, %rbx
2231: 2231:
224 testq $0x1, %rcx /* is it a destination page? */ 224 testb $0x1, %cl /* is it a destination page? */
225 jz 2f 225 jz 2f
226 movq %rcx, %rdi 226 movq %rcx, %rdi
227 andq $0xfffffffffffff000, %rdi 227 andq $0xfffffffffffff000, %rdi
228 jmp 0b 228 jmp 0b
2292: 2292:
230 testq $0x2, %rcx /* is it an indirection page? */ 230 testb $0x2, %cl /* is it an indirection page? */
231 jz 2f 231 jz 2f
232 movq %rcx, %rbx 232 movq %rcx, %rbx
233 andq $0xfffffffffffff000, %rbx 233 andq $0xfffffffffffff000, %rbx
234 jmp 0b 234 jmp 0b
2352: 2352:
236 testq $0x4, %rcx /* is it the done indicator? */ 236 testb $0x4, %cl /* is it the done indicator? */
237 jz 2f 237 jz 2f
238 jmp 3f 238 jmp 3f
2392: 2392:
240 testq $0x8, %rcx /* is it the source indicator? */ 240 testb $0x8, %cl /* is it the source indicator? */
241 jz 0b /* Ignore it otherwise */ 241 jz 0b /* Ignore it otherwise */
242 movq %rcx, %rsi /* For every source page do a copy */ 242 movq %rcx, %rsi /* For every source page do a copy */
243 andq $0xfffffffffffff000, %rsi 243 andq $0xfffffffffffff000, %rsi
@@ -246,17 +246,17 @@ swap_pages:
246 movq %rsi, %rax 246 movq %rsi, %rax
247 247
248 movq %r10, %rdi 248 movq %r10, %rdi
249 movq $512, %rcx 249 movl $512, %ecx
250 rep ; movsq 250 rep ; movsq
251 251
252 movq %rax, %rdi 252 movq %rax, %rdi
253 movq %rdx, %rsi 253 movq %rdx, %rsi
254 movq $512, %rcx 254 movl $512, %ecx
255 rep ; movsq 255 rep ; movsq
256 256
257 movq %rdx, %rdi 257 movq %rdx, %rdi
258 movq %r10, %rsi 258 movq %r10, %rsi
259 movq $512, %rcx 259 movl $512, %ecx
260 rep ; movsq 260 rep ; movsq
261 261
262 lea PAGE_SIZE(%rax), %rsi 262 lea PAGE_SIZE(%rax), %rsi
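For readers unfamiliar with the kexec page list that these swap_pages hunks walk, the flag dispatch can be read as the following C sketch; the flag values are the literal immediates tested above, and the page swap itself sits behind a stand-in helper:

    extern void swap_one_page_stub(void *dst, void *src);   /* the three rep;movsq blocks */

    static void swap_pages_sketch(const unsigned long *ind_entry)
    {
            void *dst = 0;

            for (;;) {
                    unsigned long entry = *ind_entry++;
                    void *page = (void *)(entry & ~0xfffUL); /* low 12 bits hold the flags */

                    if (entry & 0x1)                 /* destination page */
                            dst = page;
                    else if (entry & 0x2)            /* indirection page: follow it */
                            ind_entry = (const unsigned long *)page;
                    else if (entry & 0x4)            /* done indicator */
                            break;
                    else if (entry & 0x8)            /* source page: swap with dst */
                            swap_one_page_stub(dst, page);
                    /* anything else is ignored, as in the assembly */
            }
    }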
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a2421cca01f..014466b152b5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
832static int 832static int
833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) 833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
834{ 834{
835 pr_emerg("Kernel Offset: 0x%lx from 0x%lx " 835 if (kaslr_enabled()) {
836 "(relocation range: 0x%lx-0x%lx)\n", 836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
837 (unsigned long)&_text - __START_KERNEL, __START_KERNEL, 837 (unsigned long)&_text - __START_KERNEL,
838 __START_KERNEL_map, MODULES_VADDR-1); 838 __START_KERNEL,
839 __START_KERNEL_map,
840 MODULES_VADDR-1);
841 } else {
842 pr_emerg("Kernel Offset: disabled\n");
843 }
839 844
840 return 0; 845 return 0;
841} 846}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..53cc4085c3d7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,8 +61,7 @@
61 regs->seg = GET_SEG(seg) | 3; \ 61 regs->seg = GET_SEG(seg) | 3; \
62} while (0) 62} while (0)
63 63
64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
65 unsigned long *pax)
66{ 65{
67 void __user *buf; 66 void __user *buf;
68 unsigned int tmpflags; 67 unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
81#endif /* CONFIG_X86_32 */ 80#endif /* CONFIG_X86_32 */
82 81
83 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 82 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
84 COPY(dx); COPY(cx); COPY(ip); 83 COPY(dx); COPY(cx); COPY(ip); COPY(ax);
85 84
86#ifdef CONFIG_X86_64 85#ifdef CONFIG_X86_64
87 COPY(r8); 86 COPY(r8);
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
94 COPY(r15); 93 COPY(r15);
95#endif /* CONFIG_X86_64 */ 94#endif /* CONFIG_X86_64 */
96 95
97#ifdef CONFIG_X86_32
98 COPY_SEG_CPL3(cs); 96 COPY_SEG_CPL3(cs);
99 COPY_SEG_CPL3(ss); 97 COPY_SEG_CPL3(ss);
100#else /* !CONFIG_X86_32 */
101 /* Kernel saves and restores only the CS segment register on signals,
102 * which is the bare minimum needed to allow mixed 32/64-bit code.
103 * App's signal handler can save/restore other segments if needed. */
104 COPY_SEG_CPL3(cs);
105#endif /* CONFIG_X86_32 */
106 98
107 get_user_ex(tmpflags, &sc->flags); 99 get_user_ex(tmpflags, &sc->flags);
108 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 100 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
109 regs->orig_ax = -1; /* disable syscall checks */ 101 regs->orig_ax = -1; /* disable syscall checks */
110 102
111 get_user_ex(buf, &sc->fpstate); 103 get_user_ex(buf, &sc->fpstate);
112
113 get_user_ex(*pax, &sc->ax);
114 } get_user_catch(err); 104 } get_user_catch(err);
115 105
116 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); 106 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
117 107
108 force_iret();
109
118 return err; 110 return err;
119} 111}
120 112
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
162#else /* !CONFIG_X86_32 */ 154#else /* !CONFIG_X86_32 */
163 put_user_ex(regs->flags, &sc->flags); 155 put_user_ex(regs->flags, &sc->flags);
164 put_user_ex(regs->cs, &sc->cs); 156 put_user_ex(regs->cs, &sc->cs);
165 put_user_ex(0, &sc->gs); 157 put_user_ex(0, &sc->__pad2);
166 put_user_ex(0, &sc->fs); 158 put_user_ex(0, &sc->__pad1);
159 put_user_ex(regs->ss, &sc->ss);
167#endif /* CONFIG_X86_32 */ 160#endif /* CONFIG_X86_32 */
168 161
169 put_user_ex(fpstate, &sc->fpstate); 162 put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
457 450
458 regs->sp = (unsigned long)frame; 451 regs->sp = (unsigned long)frame;
459 452
460 /* Set up the CS register to run signal handlers in 64-bit mode, 453 /*
461 even if the handler happens to be interrupting 32-bit code. */ 454 * Set up the CS and SS registers to run signal handlers in
455 * 64-bit mode, even if the handler happens to be interrupting
456 * 32-bit or 16-bit code.
457 *
458 * SS is subtle. In 64-bit mode, we don't need any particular
459 * SS descriptor, but we do need SS to be valid. It's possible
460 * that the old SS is entirely bogus -- this can happen if the
461 * signal we're trying to deliver is #GP or #SS caused by a bad
462 * SS value.
463 */
462 regs->cs = __USER_CS; 464 regs->cs = __USER_CS;
465 regs->ss = __USER_DS;
463 466
464 return 0; 467 return 0;
465} 468}
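
Condensed from the hunk above, the end of __setup_rt_frame() now forces both selectors, since the interrupted SS may be unusable:

	regs->cs = __USER_CS;	/* run the handler in 64-bit mode regardless of the interrupted code */
	regs->ss = __USER_DS;	/* old SS may be bogus, e.g. after #GP/#SS caused by a bad SS value */
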
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
539{ 542{
540 struct pt_regs *regs = current_pt_regs(); 543 struct pt_regs *regs = current_pt_regs();
541 struct sigframe __user *frame; 544 struct sigframe __user *frame;
542 unsigned long ax;
543 sigset_t set; 545 sigset_t set;
544 546
545 frame = (struct sigframe __user *)(regs->sp - 8); 547 frame = (struct sigframe __user *)(regs->sp - 8);
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
553 555
554 set_current_blocked(&set); 556 set_current_blocked(&set);
555 557
556 if (restore_sigcontext(regs, &frame->sc, &ax)) 558 if (restore_sigcontext(regs, &frame->sc))
557 goto badframe; 559 goto badframe;
558 return ax; 560 return regs->ax;
559 561
560badframe: 562badframe:
561 signal_fault(regs, frame, "sigreturn"); 563 signal_fault(regs, frame, "sigreturn");
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
568{ 570{
569 struct pt_regs *regs = current_pt_regs(); 571 struct pt_regs *regs = current_pt_regs();
570 struct rt_sigframe __user *frame; 572 struct rt_sigframe __user *frame;
571 unsigned long ax;
572 sigset_t set; 573 sigset_t set;
573 574
574 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 575 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void)
579 580
580 set_current_blocked(&set); 581 set_current_blocked(&set);
581 582
582 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 583 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
583 goto badframe; 584 goto badframe;
584 585
585 if (restore_altstack(&frame->uc.uc_stack)) 586 if (restore_altstack(&frame->uc.uc_stack))
586 goto badframe; 587 goto badframe;
587 588
588 return ax; 589 return regs->ax;
589 590
590badframe: 591badframe:
591 signal_fault(regs, frame, "rt_sigreturn"); 592 signal_fault(regs, frame, "rt_sigreturn");
@@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
780 struct pt_regs *regs = current_pt_regs(); 781 struct pt_regs *regs = current_pt_regs();
781 struct rt_sigframe_x32 __user *frame; 782 struct rt_sigframe_x32 __user *frame;
782 sigset_t set; 783 sigset_t set;
783 unsigned long ax;
784 784
785 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); 785 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
786 786
@@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
791 791
792 set_current_blocked(&set); 792 set_current_blocked(&set);
793 793
794 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 794 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
795 goto badframe; 795 goto badframe;
796 796
797 if (compat_restore_altstack(&frame->uc.uc_stack)) 797 if (compat_restore_altstack(&frame->uc.uc_stack))
798 goto badframe; 798 goto badframe;
799 799
800 return ax; 800 return regs->ax;
801 801
802badframe: 802badframe:
803 signal_fault(regs, frame, "x32 rt_sigreturn"); 803 signal_fault(regs, frame, "x32 rt_sigreturn");
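
Taken together, the signal.c hunks change the contract of restore_sigcontext(): ax is copied back into pt_regs like any other register, so the sigreturn paths no longer thread a separate &ax out-parameter. A condensed sketch of the resulting tail of the rt variants (sigmask handling and the badframe label are elided, and the exact frame layout differs per variant):

	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
		goto badframe;

	if (restore_altstack(&frame->uc.uc_stack))
		goto badframe;

	return regs->ax;	/* restored by COPY(ax) inside restore_sigcontext() */
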
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ddd2c0674cda..7035f6b21c3f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -779,6 +779,26 @@ out:
779 return boot_error; 779 return boot_error;
780} 780}
781 781
782void common_cpu_up(unsigned int cpu, struct task_struct *idle)
783{
784 /* Just in case we booted with a single CPU. */
785 alternatives_enable_smp();
786
787 per_cpu(current_task, cpu) = idle;
788
789#ifdef CONFIG_X86_32
790 /* Stack for startup_32 can be just as for start_secondary onwards */
791 irq_ctx_init(cpu);
792 per_cpu(cpu_current_top_of_stack, cpu) =
793 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
794#else
795 clear_tsk_thread_flag(idle, TIF_FORK);
796 initial_gs = per_cpu_offset(cpu);
797#endif
798 per_cpu(kernel_stack, cpu) =
799 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
800}
801
782/* 802/*
783 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 803 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
784 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 804 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,23 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
796 int cpu0_nmi_registered = 0; 816 int cpu0_nmi_registered = 0;
797 unsigned long timeout; 817 unsigned long timeout;
798 818
799 /* Just in case we booted with a single CPU. */
800 alternatives_enable_smp();
801
802 idle->thread.sp = (unsigned long) (((struct pt_regs *) 819 idle->thread.sp = (unsigned long) (((struct pt_regs *)
803 (THREAD_SIZE + task_stack_page(idle))) - 1); 820 (THREAD_SIZE + task_stack_page(idle))) - 1);
804 per_cpu(current_task, cpu) = idle;
805 821
806#ifdef CONFIG_X86_32
807 /* Stack for startup_32 can be just as for start_secondary onwards */
808 irq_ctx_init(cpu);
809#else
810 clear_tsk_thread_flag(idle, TIF_FORK);
811 initial_gs = per_cpu_offset(cpu);
812#endif
813 per_cpu(kernel_stack, cpu) =
814 (unsigned long)task_stack_page(idle) -
815 KERNEL_STACK_OFFSET + THREAD_SIZE;
816 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 822 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
817 initial_code = (unsigned long)start_secondary; 823 initial_code = (unsigned long)start_secondary;
818 stack_start = idle->thread.sp; 824 stack_start = idle->thread.sp;
@@ -953,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
953 /* the FPU context is blank, nobody can own it */ 959 /* the FPU context is blank, nobody can own it */
954 __cpu_disable_lazy_restore(cpu); 960 __cpu_disable_lazy_restore(cpu);
955 961
962 common_cpu_up(cpu, tidle);
963
956 err = do_boot_cpu(apicid, cpu, tidle); 964 err = do_boot_cpu(apicid, cpu, tidle);
957 if (err) { 965 if (err) {
958 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); 966 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
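
The effect of the smpboot.c hunks is to pull the per-CPU bring-up state (idle task pointer, IRQ/kernel stacks, initial_gs/TIF_FORK handling) out of do_boot_cpu() into common_cpu_up(), which native_cpu_up() now calls before kicking the AP. A sketch of the resulting call order; the rest of native_cpu_up(), including how apicid is obtained, is elided:

	/* inside native_cpu_up(cpu, tidle): */
	__cpu_disable_lazy_restore(cpu);	/* the FPU context is blank, nobody can own it */

	common_cpu_up(cpu, tidle);		/* idle task, stacks, per-CPU boot state */

	err = do_boot_cpu(apicid, cpu, tidle);	/* INIT/SIPI and wait for the AP to come up */
	if (err)
		pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
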
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57d8a9e..3777189c4a19 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; 8#ifdef CONFIG_IA32_EMULATION
9#define SYM(sym, compat) compat
10#else
11#define SYM(sym, compat) sym
12#define ia32_sys_call_table sys_call_table
13#define __NR_ia32_syscall_max __NR_syscall_max
14#endif
15
16#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
9#include <asm/syscalls_32.h> 17#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386 18#undef __SYSCALL_I386
11 19
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, 20#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
13 21
14typedef asmlinkage void (*sys_call_ptr_t)(void); 22typedef asmlinkage void (*sys_call_ptr_t)(void);
15 23
16extern asmlinkage void sys_ni_syscall(void); 24extern asmlinkage void sys_ni_syscall(void);
17 25
18__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 26__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /* 27 /*
20 * Smells like a compiler bug -- it doesn't work 28 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed. 29 * when the & below is removed.
22 */ 30 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall, 31 [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h> 32#include <asm/syscalls_32.h>
25}; 33};
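
The SYM() macro is what lets the same asm/syscalls_32.h master list build either the native 32-bit table or the IA32-emulation table. A worked expansion for a single entry; the syscall number and symbol names below are illustrative only, not taken from the patch:

/*
 * Hypothetical master-list entry:
 *
 *	__SYSCALL_I386(13, sys_time, compat_sys_time)
 *
 * With CONFIG_IA32_EMULATION=y, SYM(sym, compat) selects the compat symbol:
 *
 *	extern asmlinkage void compat_sys_time(void);
 *	[13] = compat_sys_time,		in ia32_sys_call_table[]
 *
 * Without IA32 emulation, SYM() selects the native symbol and the #defines
 * map ia32_sys_call_table/__NR_ia32_syscall_max back onto
 * sys_call_table/__NR_syscall_max, so the file still builds the ordinary
 * 32-bit table:
 *
 *	extern asmlinkage void sys_time(void);
 *	[13] = sys_time,		in sys_call_table[]
 */
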
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e16eaa..d39c09119db6 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
30{ 30{
31 unsigned long pc = instruction_pointer(regs); 31 unsigned long pc = instruction_pointer(regs);
32 32
33 if (!user_mode_vm(regs) && in_lock_functions(pc)) { 33 if (!user_mode(regs) && in_lock_functions(pc)) {
34#ifdef CONFIG_FRAME_POINTER 34#ifdef CONFIG_FRAME_POINTER
35 return *(unsigned long *)(regs->bp + sizeof(long)); 35 return *(unsigned long *)(regs->bp + sizeof(long));
36#else 36#else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d162ff9f..6751c5c58eec 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
112{ 112{
113 enum ctx_state prev_state; 113 enum ctx_state prev_state;
114 114
115 if (user_mode_vm(regs)) { 115 if (user_mode(regs)) {
116 /* Other than that, we're just an exception. */ 116 /* Other than that, we're just an exception. */
117 prev_state = exception_enter(); 117 prev_state = exception_enter();
118 } else { 118 } else {
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
146 /* Must be before exception_exit. */ 146 /* Must be before exception_exit. */
147 preempt_count_sub(HARDIRQ_OFFSET); 147 preempt_count_sub(HARDIRQ_OFFSET);
148 148
149 if (user_mode_vm(regs)) 149 if (user_mode(regs))
150 return exception_exit(prev_state); 150 return exception_exit(prev_state);
151 else 151 else
152 rcu_nmi_exit(); 152 rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
158 * 158 *
159 * IST exception handlers normally cannot schedule. As a special 159 * IST exception handlers normally cannot schedule. As a special
160 * exception, if the exception interrupted userspace code (i.e. 160 * exception, if the exception interrupted userspace code (i.e.
161 * user_mode_vm(regs) would return true) and the exception was not 161 * user_mode(regs) would return true) and the exception was not
162 * a double fault, it can be safe to schedule. ist_begin_non_atomic() 162 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
163 * begins a non-atomic section within an ist_enter()/ist_exit() region. 163 * begins a non-atomic section within an ist_enter()/ist_exit() region.
164 * Callers are responsible for enabling interrupts themselves inside 164 * Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
167 */ 167 */
168void ist_begin_non_atomic(struct pt_regs *regs) 168void ist_begin_non_atomic(struct pt_regs *regs)
169{ 169{
170 BUG_ON(!user_mode_vm(regs)); 170 BUG_ON(!user_mode(regs));
171 171
172 /* 172 /*
173 * Sanity check: we need to be on the normal thread stack. This 173 * Sanity check: we need to be on the normal thread stack. This
174 * will catch asm bugs and any attempt to use ist_preempt_enable 174 * will catch asm bugs and any attempt to use ist_preempt_enable
175 * from double_fault. 175 * from double_fault.
176 */ 176 */
177 BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) 177 BUG_ON((unsigned long)(current_top_of_stack() -
178 & ~(THREAD_SIZE - 1)) != 0); 178 current_stack_pointer()) >= THREAD_SIZE);
179 179
180 preempt_count_sub(HARDIRQ_OFFSET); 180 preempt_count_sub(HARDIRQ_OFFSET);
181} 181}
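
The reworked BUG_ON() replaces the old mask comparison against the per-CPU kernel_stack pointer with a simple distance check against the top of the current thread stack. Written out as a helper (hypothetical name, not part of the patch):

static inline bool on_thread_stack(void)
{
	unsigned long top = current_top_of_stack();
	unsigned long sp  = current_stack_pointer();

	/* the thread stack is the range (top - THREAD_SIZE, top] */
	return (top - sp) < THREAD_SIZE;
}

/* ist_begin_non_atomic() now effectively requires BUG_ON(!on_thread_stack()) to pass. */
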
@@ -194,8 +194,7 @@ static nokprobe_inline int
194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
195 struct pt_regs *regs, long error_code) 195 struct pt_regs *regs, long error_code)
196{ 196{
197#ifdef CONFIG_X86_32 197 if (v8086_mode(regs)) {
198 if (regs->flags & X86_VM_MASK) {
199 /* 198 /*
200 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. 199 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
201 * On nmi (interrupt 2), do_trap should not be called. 200 * On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
207 } 206 }
208 return -1; 207 return -1;
209 } 208 }
210#endif 209
211 if (!user_mode(regs)) { 210 if (!user_mode(regs)) {
212 if (!fixup_exception(regs)) { 211 if (!fixup_exception(regs)) {
213 tsk->thread.error_code = error_code; 212 tsk->thread.error_code = error_code;
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
384 goto exit; 383 goto exit;
385 conditional_sti(regs); 384 conditional_sti(regs);
386 385
387 if (!user_mode_vm(regs)) 386 if (!user_mode(regs))
388 die("bounds", regs, error_code); 387 die("bounds", regs, error_code);
389 388
390 if (!cpu_feature_enabled(X86_FEATURE_MPX)) { 389 if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
462 prev_state = exception_enter(); 461 prev_state = exception_enter();
463 conditional_sti(regs); 462 conditional_sti(regs);
464 463
465#ifdef CONFIG_X86_32 464 if (v8086_mode(regs)) {
466 if (regs->flags & X86_VM_MASK) {
467 local_irq_enable(); 465 local_irq_enable();
468 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 466 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
469 goto exit; 467 goto exit;
470 } 468 }
471#endif
472 469
473 tsk = current; 470 tsk = current;
474 if (!user_mode(regs)) { 471 if (!user_mode(regs)) {
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
587 /* Copy the remainder of the stack from the current stack. */ 584 /* Copy the remainder of the stack from the current stack. */
588 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); 585 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
589 586
590 BUG_ON(!user_mode_vm(&new_stack->regs)); 587 BUG_ON(!user_mode(&new_stack->regs));
591 return new_stack; 588 return new_stack;
592} 589}
593NOKPROBE_SYMBOL(fixup_bad_iret); 590NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
637 * then it's very likely the result of an icebp/int01 trap. 634 * then it's very likely the result of an icebp/int01 trap.
638 * User wants a sigtrap for that. 635 * User wants a sigtrap for that.
639 */ 636 */
640 if (!dr6 && user_mode_vm(regs)) 637 if (!dr6 && user_mode(regs))
641 user_icebp = 1; 638 user_icebp = 1;
642 639
643 /* Catch kmemcheck conditions first of all! */ 640 /* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
673 /* It's safe to allow irq's after DR6 has been saved */ 670 /* It's safe to allow irq's after DR6 has been saved */
674 preempt_conditional_sti(regs); 671 preempt_conditional_sti(regs);
675 672
676 if (regs->flags & X86_VM_MASK) { 673 if (v8086_mode(regs)) {
677 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 674 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
678 X86_TRAP_DB); 675 X86_TRAP_DB);
679 preempt_conditional_cli(regs); 676 preempt_conditional_cli(regs);
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
721 return; 718 return;
722 conditional_sti(regs); 719 conditional_sti(regs);
723 720
724 if (!user_mode_vm(regs)) 721 if (!user_mode(regs))
725 { 722 {
726 if (!fixup_exception(regs)) { 723 if (!fixup_exception(regs)) {
727 task->thread.error_code = error_code; 724 task->thread.error_code = error_code;
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
925/* Set of traps needed for early debugging. */ 922/* Set of traps needed for early debugging. */
926void __init early_trap_init(void) 923void __init early_trap_init(void)
927{ 924{
928 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 925 /*
926 * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
927 * is ready in cpu_init() <-- trap_init(). Before trap_init(),
928 * CPU runs at ring 0 so it is impossible to hit an invalid
929 * stack. Using the original stack works well enough at this
930 * early stage. DEBUG_STACK will be equipped after cpu_init() in
931 * trap_init().
932 *
933 * We don't need to set trace_idt_table like set_intr_gate(),
934 * since we don't have trace_debug and it will be reset to
935 * 'debug' in trap_init() by set_intr_gate_ist().
936 */
937 set_intr_gate_notrace(X86_TRAP_DB, debug);
929 /* int3 can be called from all */ 938 /* int3 can be called from all */
930 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 939 set_system_intr_gate(X86_TRAP_BP, &int3);
931#ifdef CONFIG_X86_32 940#ifdef CONFIG_X86_32
932 set_intr_gate(X86_TRAP_PF, page_fault); 941 set_intr_gate(X86_TRAP_PF, page_fault);
933#endif 942#endif
@@ -1005,6 +1014,15 @@ void __init trap_init(void)
1005 */ 1014 */
1006 cpu_init(); 1015 cpu_init();
1007 1016
1017 /*
1018 * X86_TRAP_DB and X86_TRAP_BP have been set
 1019	 * in early_trap_init(). However, IST works only after
1020 * cpu_init() loads TSS. See comments in early_trap_init().
1021 */
1022 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
1023 /* int3 can be called from all */
1024 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
1025
1008 x86_init.irqs.trap_init(); 1026 x86_init.irqs.trap_init();
1009 1027
1010#ifdef CONFIG_X86_64 1028#ifdef CONFIG_X86_64
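
Condensed, the #DB/#BP gate setup now happens in two stages: early_trap_init() installs plain (non-IST) gates because the TSS, and therefore the IST stacks, are not loaded yet, and trap_init() upgrades them to DEBUG_STACK once cpu_init() has run. A sketch with everything else in both functions elided:

void __init early_trap_init(void)
{
	set_intr_gate_notrace(X86_TRAP_DB, debug);	/* no IST: TSS not loaded yet */
	set_system_intr_gate(X86_TRAP_BP, &int3);	/* int3 can be called from all */
	/* other early gates elided */
}

void __init trap_init(void)
{
	/* ... */
	cpu_init();					/* loads the TSS; IST stacks usable from here */

	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
	/* ... */
}
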
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0679e..0b81ad67da07 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
912 int ret = NOTIFY_DONE; 912 int ret = NOTIFY_DONE;
913 913
914 /* We are only interested in userspace traps */ 914 /* We are only interested in userspace traps */
915 if (regs && !user_mode_vm(regs)) 915 if (regs && !user_mode(regs))
916 return NOTIFY_DONE; 916 return NOTIFY_DONE;
917 917
918 switch (val) { 918 switch (val) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf52e069..fc9db6ef2a95 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
150 do_exit(SIGSEGV); 150 do_exit(SIGSEGV);
151 } 151 }
152 152
153 tss = &per_cpu(init_tss, get_cpu()); 153 tss = &per_cpu(cpu_tss, get_cpu());
154 current->thread.sp0 = current->thread.saved_sp0; 154 current->thread.sp0 = current->thread.saved_sp0;
155 current->thread.sysenter_cs = __KERNEL_CS; 155 current->thread.sysenter_cs = __KERNEL_CS;
156 load_sp0(tss, &current->thread); 156 load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 tsk->thread.saved_fs = info->regs32->fs; 318 tsk->thread.saved_fs = info->regs32->fs;
319 tsk->thread.saved_gs = get_user_gs(info->regs32); 319 tsk->thread.saved_gs = get_user_gs(info->regs32);
320 320
321 tss = &per_cpu(init_tss, get_cpu()); 321 tss = &per_cpu(cpu_tss, get_cpu());
322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
323 if (cpu_has_sep) 323 if (cpu_has_sep)
324 tsk->thread.sysenter_cs = 0; 324 tsk->thread.sysenter_cs = 0;