author		Linus Torvalds <torvalds@linux-foundation.org>	2017-12-18 11:59:15 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-12-18 11:59:15 -0500
commit		64a48099b3b31568ac45716b7fafcb74a0c2fcfe (patch)
tree		0652431aeb450bbfa74b9be8b7d813ac8511aec3
parent		1291a0d5049dbc06baaaf66a9ff3f53db493b19b (diff)
parent		6cbd2171e89b13377261d15e64384df60ecb530e (diff)

Merge branch 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 syscall entry code changes for PTI from Ingo Molnar:
 "The main changes here are Andy Lutomirski's changes to switch the
  x86-64 entry code to use the 'per CPU entry trampoline stack'. This,
  besides helping fix KASLR leaks (the pending Page Table Isolation
  (PTI) work), also robustifies the x86 entry code"

* 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
  x86/cpufeatures: Make CPU bugs sticky
  x86/paravirt: Provide a way to check for hypervisors
  x86/paravirt: Dont patch flush_tlb_single
  x86/entry/64: Make cpu_entry_area.tss read-only
  x86/entry: Clean up the SYSENTER_stack code
  x86/entry/64: Remove the SYSENTER stack canary
  x86/entry/64: Move the IST stacks into struct cpu_entry_area
  x86/entry/64: Create a per-CPU SYSCALL entry trampoline
  x86/entry/64: Return to userspace from the trampoline stack
  x86/entry/64: Use a per-CPU trampoline stack for IDT entries
  x86/espfix/64: Stop assuming that pt_regs is on the entry stack
  x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0
  x86/entry: Remap the TSS into the CPU entry area
  x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct
  x86/dumpstack: Handle stack overflow on all stacks
  x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss
  x86/kasan/64: Teach KASAN about the cpu_entry_area
  x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area
  x86/entry/gdt: Put per-CPU GDT remaps in ascending order
  x86/dumpstack: Add get_stack_info() support for the SYSENTER stack
  ...
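For orientation before the diffs: the series maps a per-CPU "cpu_entry_area" into the fixmap, containing the GDT, a small SYSENTER stack, the TSS (read-only on 64-bit), the SYSCALL entry trampoline text and the IST exception stacks. The stand-alone C sketch below is not kernel code; it only models that layout to show the page-granularity requirement the series asserts with BUILD_BUG_ON() (see struct cpu_entry_area in the fixmap.h hunk below). All field sizes here are illustrative assumptions.

/*
 * Sketch only: models the cpu_entry_area layout to illustrate the
 * whole-number-of-pages requirement.  Field sizes are assumptions.
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

struct sketch_cpu_entry_area {
	char gdt[PAGE_SIZE];                  /* read-only GDT alias             */
	char SYSENTER_stack_page[PAGE_SIZE];  /* page holding the SYSENTER stack */
	char tss[3 * PAGE_SIZE];              /* assumed TSS size, RO on x86_64  */
	char entry_trampoline[PAGE_SIZE];     /* remapped SYSCALL entry code     */
	char exception_stacks[5 * PAGE_SIZE]; /* assumed IST stack total         */
};

/* Mirrors the BUILD_BUG_ON() in the real code: whole pages only. */
_Static_assert(sizeof(struct sketch_cpu_entry_area) % PAGE_SIZE == 0,
	       "cpu_entry_area must be an exact number of pages");

int main(void)
{
	printf("pages per CPU: %lu\n",
	       (unsigned long)(sizeof(struct sketch_cpu_entry_area) / PAGE_SIZE));
	printf("entry_trampoline at page %lu of the area\n",
	       (unsigned long)(offsetof(struct sketch_cpu_entry_area,
					entry_trampoline) / PAGE_SIZE));
	return 0;
}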
-rw-r--r--  arch/x86/entry/entry_32.S | 6
-rw-r--r--  arch/x86/entry/entry_64.S | 189
-rw-r--r--  arch/x86/entry/entry_64_compat.S | 7
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 2
-rw-r--r--  arch/x86/include/asm/desc.h | 11
-rw-r--r--  arch/x86/include/asm/fixmap.h | 68
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 25
-rw-r--r--  arch/x86/include/asm/irqflags.h | 3
-rw-r--r--  arch/x86/include/asm/kdebug.h | 1
-rw-r--r--  arch/x86/include/asm/paravirt.h | 9
-rw-r--r--  arch/x86/include/asm/processor.h | 59
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 3
-rw-r--r--  arch/x86/include/asm/switch_to.h | 8
-rw-r--r--  arch/x86/include/asm/thread_info.h | 2
-rw-r--r--  arch/x86/include/asm/traps.h | 1
-rw-r--r--  arch/x86/include/asm/unwind.h | 7
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 6
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 9
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 4
-rw-r--r--  arch/x86/kernel/cpu/common.c | 170
-rw-r--r--  arch/x86/kernel/doublefault.c | 36
-rw-r--r--  arch/x86/kernel/dumpstack.c | 74
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 6
-rw-r--r--  arch/x86/kernel/ioport.c | 2
-rw-r--r--  arch/x86/kernel/irq.c | 12
-rw-r--r--  arch/x86/kernel/irq_64.c | 4
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 19
-rw-r--r--  arch/x86/kernel/process_32.c | 2
-rw-r--r--  arch/x86/kernel/process_64.c | 14
-rw-r--r--  arch/x86/kernel/traps.c | 69
-rw-r--r--  arch/x86/kernel/unwind_orc.c | 88
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 9
-rw-r--r--  arch/x86/kvm/vmx.c | 2
-rw-r--r--  arch/x86/lib/delay.c | 4
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 18
-rw-r--r--  arch/x86/power/cpu.c | 16
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 2
-rw-r--r--  arch/x86/xen/mmu_pv.c | 2
40 files changed, 691 insertions, 286 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4838037f97f6..bd8b57a5c874 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -941,7 +941,8 @@ ENTRY(debug)
 	movl	%esp, %eax			# pt_regs pointer
 
 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
@@ -984,7 +985,8 @@ ENTRY(nmi)
 	movl	%esp, %eax			# pt_regs pointer
 
 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f81d50d7ceac..423885bee398 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -140,6 +140,64 @@ END(native_usergs_sysret64)
 	 * with them due to bugs in both AMD and Intel CPUs.
 	 */
 
+	.pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline.  This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address).  So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped) entry
+ * trampoline.  We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH	CPU_ENTRY_AREA_SYSENTER_stack + \
+			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+	UNWIND_HINT_EMPTY
+	swapgs
+
+	/* Stash the user RSP. */
+	movq	%rsp, RSP_SCRATCH
+
+	/* Load the top of the task stack into RSP */
+	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+	/* Start building the simulated IRET frame. */
+	pushq	$__USER_DS			/* pt_regs->ss */
+	pushq	RSP_SCRATCH			/* pt_regs->sp */
+	pushq	%r11				/* pt_regs->flags */
+	pushq	$__USER_CS			/* pt_regs->cs */
+	pushq	%rcx				/* pt_regs->ip */
+
+	/*
+	 * x86 lacks a near absolute jump, and we can't jump to the real
+	 * entry text with a relative jump.  We could push the target
+	 * address and then use retq, but this destroys the pipeline on
+	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
+	 * spill RDI and restore it in a second-stage trampoline.
+	 */
+	pushq	%rdi
+	movq	$entry_SYSCALL_64_stage2, %rdi
+	jmp	*%rdi
+END(entry_SYSCALL_64_trampoline)
+
+	.popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+	UNWIND_HINT_EMPTY
+	popq	%rdi
+	jmp	entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
@@ -330,8 +388,24 @@ syscall_return_via_sysret:
 	popq	%rsi	/* skip rcx */
 	popq	%rdx
 	popq	%rsi
+
+	/*
+	 * Now all regs are restored except RSP and RDI.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+	pushq	RSP-RDI(%rdi)	/* RSP */
+	pushq	(%rdi)		/* RDI */
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+
 	popq	%rdi
-	movq	RSP-ORIG_RAX(%rsp), %rsp
+	popq	%rsp
 	USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
@@ -466,12 +540,13 @@ END(irq_entries_start)
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 #ifdef CONFIG_DEBUG_ENTRY
-	pushfq
-	testl	$X86_EFLAGS_IF, (%rsp)
+	pushq	%rax
+	SAVE_FLAGS(CLBR_RAX)
+	testl	$X86_EFLAGS_IF, %eax
 	jz	.Lokay_\@
 	ud2
 .Lokay_\@:
-	addq	$8, %rsp
+	popq	%rax
 #endif
 .endm
 
@@ -563,6 +638,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
 	cld
+
+	testb	$3, CS-ORIG_RAX(%rsp)
+	jz	1f
+	SWAPGS
+	call	switch_to_thread_stack
+1:
+
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
@@ -572,12 +654,8 @@ END(irq_entries_start)
 	jz	1f
 
 	/*
-	 * IRQ from user mode.  Switch to kernel gsbase and inform context
-	 * tracking that we're in kernel mode.
-	 */
-	SWAPGS
-
-	/*
+	 * IRQ from user mode.
+	 *
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
 	 * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
@@ -630,10 +708,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	ud2
 1:
 #endif
-	SWAPGS
 	POP_EXTRA_REGS
-	POP_C_REGS
-	addq	$8, %rsp	/* skip regs->orig_ax */
+	popq	%r11
+	popq	%r10
+	popq	%r9
+	popq	%r8
+	popq	%rax
+	popq	%rcx
+	popq	%rdx
+	popq	%rsi
+
+	/*
+	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+	/* Copy the IRET frame to the trampoline stack. */
+	pushq	6*8(%rdi)	/* SS */
+	pushq	5*8(%rdi)	/* RSP */
+	pushq	4*8(%rdi)	/* EFLAGS */
+	pushq	3*8(%rdi)	/* CS */
+	pushq	2*8(%rdi)	/* RIP */
+
+	/* Push user RDI on the trampoline stack. */
+	pushq	(%rdi)
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+
+	/* Restore RDI. */
+	popq	%rdi
+	SWAPGS
 	INTERRUPT_RETURN
 
 
@@ -829,7 +938,33 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
 /*
  * Exception entry points.
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack.  This is called with the IRET frame and
+ * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+	UNWIND_HINT_FUNC
+
+	pushq	%rdi
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+	pushq	7*8(%rdi)		/* regs->ss */
+	pushq	6*8(%rdi)		/* regs->rsp */
+	pushq	5*8(%rdi)		/* regs->eflags */
+	pushq	4*8(%rdi)		/* regs->cs */
+	pushq	3*8(%rdi)		/* regs->ip */
+	pushq	2*8(%rdi)		/* regs->orig_ax */
+	pushq	8(%rdi)			/* return address */
+	UNWIND_HINT_FUNC
+
+	movq	(%rdi), %rdi
+	ret
+END(switch_to_thread_stack)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
@@ -848,11 +983,12 @@ ENTRY(\sym)
 
 	ALLOC_PT_GPREGS_ON_STACK
 
-	.if \paranoid
-	.if \paranoid == 1
+	.if \paranoid < 2
 	testb	$3, CS(%rsp)		/* If coming from userspace, switch stacks */
-	jnz	1f
+	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
+
+	.if \paranoid
 	call	paranoid_entry
 	.else
 	call	error_entry
@@ -894,20 +1030,15 @@ ENTRY(\sym)
 	jmp	error_exit
 	.endif
 
-	.if \paranoid == 1
+	.if \paranoid < 2
 	/*
-	 * Paranoid entry from userspace.  Switch stacks and treat it
+	 * Entry from userspace.  Switch stacks and treat it
 	 * as a normal entry.  This means that paranoid handlers
 	 * run in real process context if user_mode(regs).
 	 */
-1:
+.Lfrom_usermode_switch_stack_\@:
 	call	error_entry
 
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-	call	sync_regs
-	movq	%rax, %rsp			/* switch stack */
-
 	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
@@ -1170,6 +1301,14 @@ ENTRY(error_entry)
 	SWAPGS
 
 .Lerror_entry_from_usermode_after_swapgs:
+	/* Put us onto the real thread stack. */
+	popq	%r12				/* save return addr in %12 */
+	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
+	call	sync_regs
+	movq	%rax, %rsp			/* switch stack */
+	ENCODE_FRAME_POINTER
+	pushq	%r12
+
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 568e130d932c..95ad40eb7eff 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,7 +48,7 @@
  */
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
-	SWAPGS_UNSAFE_STACK
+	SWAPGS
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
 	 */
 	movl	%eax, %eax
 
-	/* Construct struct pt_regs on stack (iret frame is already on stack) */
 	pushq	%rax			/* pt_regs->orig_ax */
+
+	/* switch to thread stack expects orig_ax to be pushed */
+	call	switch_to_thread_stack
+
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rdx			/* pt_regs->dx */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bf6a76202a77..ea9a7dde62e5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
  * Static testing of CPU features. Used the same as boot_cpu_has().
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4011cb03ef08..aab4fe9f49f8 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
 	return this_cpu_ptr(&gdt_page)->gdt;
 }
 
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
-	return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
 /* Provide the fixmap address of the remapped GDT */
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 {
-	unsigned int idx = get_cpu_gdt_ro_index(cpu);
-	return (struct desc_struct *)__fix_to_virt(idx);
+	return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
 }
 
 /* Provide the current read-only GDT */
@@ -185,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 #endif
 }
 
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
 {
 	struct desc_struct *d = get_cpu_gdt_rw(cpu);
 	tss_desc tss;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b0c505fe9a95..94fc4fa14127 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
 			 PAGE_SIZE)
 #endif
 
+/*
+ * cpu_entry_area is a percpu region in the fixmap that contains things
+ * needed by the CPU and early entry/exit code.  Real types aren't used
+ * for all fields here to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+	char gdt[PAGE_SIZE];
+
+	/*
+	 * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+	 * a a read-only guard page.
+	 */
+	struct SYSENTER_stack_page SYSENTER_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
+	 */
+	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Exception stacks used for IST entries.
+	 *
+	 * In the future, this should have a separate slot for each stack
+	 * with guard pages between them.
+	 */
+	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+
+extern void setup_cpu_entry_areas(void);
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -101,8 +140,8 @@ enum fixed_addresses {
 	FIX_LNW_VRTC,
 #endif
 	/* Fixmap entries to remap the GDTs, one per processor. */
-	FIX_GDT_REMAP_BEGIN,
-	FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+	FIX_CPU_ENTRY_AREA_TOP,
+	FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
 
 #ifdef CONFIG_ACPI_APEI_GHES
 	/* Used for GHES mapping from assorted contexts */
@@ -191,5 +230,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
 void __early_set_fixmap(enum fixed_addresses idx,
 			phys_addr_t phys, pgprot_t flags);
 
+static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
+{
+	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
+}
+
+#define __get_cpu_entry_area_offset_index(cpu, offset) ({		\
+	BUILD_BUG_ON(offset % PAGE_SIZE != 0);				\
+	__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);	\
+	})
+
+#define get_cpu_entry_area_index(cpu, field)				\
+	__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
+}
+
+static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+{
+	return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
+}
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
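The indexing above hands each CPU a block of CPU_ENTRY_AREA_PAGES consecutive fixmap slots, counted downward from FIX_CPU_ENTRY_AREA_BOTTOM, and __fix_to_virt() turns a slot index into a virtual address below the fixmap top. A minimal stand-alone C sketch of the same arithmetic follows; the numeric constants are assumptions chosen only for the demo, while the two helper functions mirror the kernel formulas above.

/*
 * Sketch only (not kernel code): the fixmap index arithmetic behind
 * __get_cpu_entry_area_page_index() and get_cpu_entry_area().
 */
#include <stdio.h>

#define PAGE_SIZE                 4096UL
#define FIXADDR_TOP               0xffffffffff5ff000UL /* assumed */
#define CPU_ENTRY_AREA_PAGES      11UL                 /* assumed pages per CPU */
#define FIX_CPU_ENTRY_AREA_BOTTOM 512UL                /* assumed fixmap index */

/* Fixmap slots sit at decreasing virtual addresses as the index grows. */
static unsigned long fix_to_virt(unsigned long idx)
{
	return FIXADDR_TOP - idx * PAGE_SIZE;
}

/* Page 0 of a CPU's area has the highest index, hence the lowest address. */
static unsigned long cpu_entry_area_page_index(unsigned long cpu, unsigned long page)
{
	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu * CPU_ENTRY_AREA_PAGES - page;
}

int main(void)
{
	for (unsigned long cpu = 0; cpu < 2; cpu++)
		printf("cpu%lu entry area base: %#lx\n",
		       cpu, fix_to_virt(cpu_entry_area_page_index(cpu, 0)));
	return 0;
}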
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 1b0a5abcd8ae..96aa6b9884dc 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,16 +20,7 @@
 #ifndef _ASM_X86_HYPERVISOR_H
 #define _ASM_X86_HYPERVISOR_H
 
-#ifdef CONFIG_HYPERVISOR_GUEST
-
-#include <asm/kvm_para.h>
-#include <asm/x86_init.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * x86 hypervisor information
- */
-
+/* x86 hypervisor types */
 enum x86_hypervisor_type {
 	X86_HYPER_NATIVE = 0,
 	X86_HYPER_VMWARE,
@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
 	X86_HYPER_KVM,
 };
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
 struct hypervisor_x86 {
 	/* Hypervisor name */
 	const char *name;
@@ -58,7 +55,15 @@ struct hypervisor_x86 {
 
 extern enum x86_hypervisor_type x86_hyper_type;
 extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return x86_hyper_type == type;
+}
 #else
 static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return type == X86_HYPER_NATIVE;
+}
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c8ef23f2c28f..89f08955fff7 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
 	swapgs;					\
 	sysretl
 
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x)		pushfq; popq %rax
+#endif
 #else
 #define INTERRUPT_RETURN		iret
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f86a8caa561e..395c9631e000 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 283efcaac8af..892df375b615 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -927,6 +927,15 @@ extern void default_banner(void);
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
 		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers)                                        \
+	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
 #endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cc16fa882e3e..1f2434ee9f80 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -163,9 +163,9 @@ enum cpuid_regs_idx {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
-extern struct tss_struct	doublefault_tss;
-extern __u32			cpu_caps_cleared[NCAPINTS];
-extern __u32			cpu_caps_set[NCAPINTS];
+extern struct x86_hw_tss	doublefault_tss;
+extern __u32			cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32			cpu_caps_set[NCAPINTS + NBUGINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
 	write_cr3(__sme_pa(pgdir));
 }
 
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
 #ifdef CONFIG_X86_32
 /* This is the TSS defined by the hardware. */
 struct x86_hw_tss {
@@ -305,7 +310,13 @@ struct x86_hw_tss {
 struct x86_hw_tss {
 	u32			reserved1;
 	u64			sp0;
+
+	/*
+	 * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+	 * Linux does not use ring 1, so sp1 is not otherwise needed.
+	 */
 	u64			sp1;
+
 	u64			sp2;
 	u64			reserved2;
 	u64			ist[7];
@@ -323,12 +334,22 @@ struct x86_hw_tss {
 #define IO_BITMAP_BITS			65536
 #define IO_BITMAP_BYTES			(IO_BITMAP_BITS/8)
 #define IO_BITMAP_LONGS			(IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET		offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
 #define INVALID_IO_BITMAP_OFFSET	0x8000
 
+struct SYSENTER_stack {
+	unsigned long		words[64];
+};
+
+struct SYSENTER_stack_page {
+	struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
+
 struct tss_struct {
 	/*
-	 * The hardware state:
+	 * The fixed hardware portion.  This must not cross a page boundary
+	 * at risk of violating the SDM's advice and potentially triggering
+	 * errata.
 	 */
 	struct x86_hw_tss	x86_tss;
 
@@ -339,18 +360,9 @@ struct tss_struct {
 	 * be within the limit.
 	 */
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
 
-#ifdef CONFIG_X86_32
-	/*
-	 * Space for the temporary SYSENTER stack.
-	 */
-	unsigned long		SYSENTER_stack_canary;
-	unsigned long		SYSENTER_stack[64];
-#endif
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
@@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
 #endif
 
 /*
@@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
 static inline void
 native_load_sp0(unsigned long sp0)
 {
-	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
@@ -535,12 +550,12 @@ static inline void native_swapgs(void)
 
 static inline unsigned long current_top_of_stack(void)
 {
-#ifdef CONFIG_X86_64
-	return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
-	/* sp0 on x86_32 is special in and around vm86 mode. */
+	/*
+	 * We can't read directly from tss.sp0: sp0 on x86_32 is special in
+	 * and around vm86 mode and sp0 on x86_64 is special because of the
+	 * entry trampoline.
+	 */
 	return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
 }
 
 static inline bool on_thread_stack(void)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 8da111b3c342..f8062bfd43a0 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -16,6 +16,7 @@ enum stack_type {
 	STACK_TYPE_TASK,
 	STACK_TYPE_IRQ,
 	STACK_TYPE_SOFTIRQ,
+	STACK_TYPE_SYSENTER,
 	STACK_TYPE_EXCEPTION,
 	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
 };
@@ -28,6 +29,8 @@ struct stack_info {
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info);
 
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask);
 
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8c6bd6863db9..9b6df68d8fd1 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -79,10 +79,10 @@ do { \
 static inline void refresh_sysenter_cs(struct thread_struct *thread)
 {
 	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
-	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+	if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
 		return;
 
-	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+	this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
 	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 }
 #endif
@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
 /* This is used when switching tasks or entering/exiting vm86 mode. */
 static inline void update_sp0(struct task_struct *task)
 {
+	/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
 #ifdef CONFIG_X86_32
 	load_sp0(task->thread.sp0);
 #else
-	load_sp0(task_top_of_stack(task));
+	if (static_cpu_has(X86_FEATURE_XENPV))
+		load_sp0(task_top_of_stack(task));
 #endif
 }
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 70f425947dc5..00223333821a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
 #endif
 
 #endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 1fadd310ff68..31051f35cbb7 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
 dotraplinkage void do_stack_segment(struct pt_regs *, long);
 #ifdef CONFIG_X86_64
 dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e9cc6fe1fc6f..c1688c2d0a12 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -7,6 +7,9 @@
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
 struct unwind_state {
 	struct stack_info stack_info;
 	unsigned long stack_mask;
@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 }
 
 #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * WARNING: The entire pt_regs may not be safe to dereference.  In some cases,
+ * only the iret frame registers are accessible.  Use with caution!
+ */
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	if (unwind_done(state))
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 8ea78275480d..cd360a5e0dca 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -93,4 +93,10 @@ void common(void) {
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+	/* Layout info for cpu_entry_area */
+	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+	OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
 }
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dedf428b20b6..7d20d9c0b3d6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -47,13 +47,8 @@ void foo(void)
 	BLANK();
 
 	/* Offset from the sysenter stack to tss.sp0 */
-	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-	       offsetofend(struct tss_struct, SYSENTER_stack));
-
-	/* Offset from cpu_tss to SYSENTER_stack */
-	OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
-	/* Size of SYSENTER_stack */
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+	       offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 630212fa9b9d..bf51e51d808d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -23,6 +23,9 @@ int main(void)
 #ifdef CONFIG_PARAVIRT
 	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+	OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
 	BLANK();
 #endif
 
@@ -63,6 +66,7 @@ int main(void)
 
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
 	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	BLANK();
 
 #ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fa998ca8aa5a..7416da3ec4df 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
 	return NULL;		/* Not found */
 }
 
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
 
 void load_percpu_segment(int cpu)
 {
@@ -490,27 +490,116 @@ void load_percpu_segment(int cpu)
 	load_stack_canary_segment();
 }
 
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
+	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+				   SYSENTER_stack_storage);
+
+static void __init
+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+{
+	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
+		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
-	/* On 64-bit systems, we use a read-only fixmap GDT. */
-	pgprot_t prot = PAGE_KERNEL_RO;
+	extern char _entry_trampoline[];
+
+	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+	pgprot_t gdt_prot = PAGE_KERNEL_RO;
+	pgprot_t tss_prot = PAGE_KERNEL_RO;
 #else
 	/*
 	 * On native 32-bit systems, the GDT cannot be read-only because
 	 * our double fault handler uses a task gate, and entering through
-	 * a task gate needs to change an available TSS to busy. If the GDT
-	 * is read-only, that will triple fault.
+	 * a task gate needs to change an available TSS to busy. If the
+	 * GDT is read-only, that will triple fault. The TSS cannot be
+	 * read-only because the CPU writes to it on task switches.
 	 *
-	 * On Xen PV, the GDT must be read-only because the hypervisor requires
-	 * it.
+	 * On Xen PV, the GDT must be read-only because the hypervisor
+	 * requires it.
 	 */
-	pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
 		PAGE_KERNEL_RO : PAGE_KERNEL;
+	pgprot_t tss_prot = PAGE_KERNEL;
 #endif
 
-	__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+				per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+				PAGE_KERNEL);
+
+	/*
+	 * The Intel SDM says (Volume 3, 7.2.1):
+	 *
+	 *  Avoid placing a page boundary in the part of the TSS that the
+	 *  processor reads during a task switch (the first 104 bytes). The
+	 *  processor may not correctly perform address translations if a
+	 *  boundary occurs in this area. During a task switch, the processor
+	 *  reads and writes into the first 104 bytes of each TSS (using
+	 *  contiguous physical addresses beginning with the physical address
+	 *  of the first byte of the TSS). So, after TSS access begins, if
+	 *  part of the 104 bytes is not physically contiguous, the processor
+	 *  will access incorrect information without generating a page-fault
+	 *  exception.
+	 *
+	 * There are also a lot of errata involving the TSS spanning a page
+	 * boundary.  Assert that we're not doing that.
+	 */
+	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+				&per_cpu(cpu_tss_rw, cpu),
+				sizeof(struct tss_struct) / PAGE_SIZE,
+				tss_prot);
+
+#ifdef CONFIG_X86_32
+	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+	BUILD_BUG_ON(sizeof(exception_stacks) !=
+		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
+				&per_cpu(exception_stacks, cpu),
+				sizeof(exception_stacks) / PAGE_SIZE,
+				PAGE_KERNEL);
+
+	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		setup_cpu_entry_area(cpu);
 }
 
 /* Load the original GDT from the per-cpu structure */
@@ -747,7 +836,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
 {
 	int i;
 
-	for (i = 0; i < NCAPINTS; i++) {
+	for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
 		c->x86_capability[i] &= ~cpu_caps_cleared[i];
 		c->x86_capability[i] |= cpu_caps_set[i];
 	}
@@ -1250,7 +1339,7 @@ void enable_sep_cpu(void)
 		return;
 
 	cpu = get_cpu();
-	tss = &per_cpu(cpu_tss, cpu);
+	tss = &per_cpu(cpu_tss_rw, cpu);
 
 	/*
 	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1259,11 +1348,7 @@ void enable_sep_cpu(void)
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
 	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
-	wrmsr(MSR_IA32_SYSENTER_ESP,
-	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
-	      0);
-
+	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
 	put_cpu();
@@ -1357,25 +1442,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
-	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
-	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
+	extern char _entry_trampoline[];
+	extern char entry_SYSCALL_64_trampoline[];
+
+	int cpu = smp_processor_id();
+	unsigned long SYSCALL64_entry_trampoline =
+		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+		(entry_SYSCALL_64_trampoline - _entry_trampoline);
+
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1386,7 +1465,7 @@ void syscall_init(void)
 	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1530,7 +1609,7 @@ void cpu_init(void)
 	if (cpu)
 		load_ucode_ap();
 
-	t = &per_cpu(cpu_tss, cpu);
+	t = &per_cpu(cpu_tss_rw, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1569,7 +1648,7 @@ void cpu_init(void)
 	 * set up and load the per-CPU TSS
 	 */
 	if (!oist->ist[0]) {
-		char *estacks = per_cpu(exception_stacks, cpu);
+		char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
 
 		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
 			estacks += exception_stack_sizes[v];
@@ -1580,7 +1659,7 @@ void cpu_init(void)
 		}
 	}
 
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
 	/*
 	 * <= is required because the CPU will access up to
@@ -1596,11 +1675,12 @@ void cpu_init(void)
 	enter_lazy_tlb(&init_mm, me);
 
 	/*
-	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
-	 * task never enters user mode.
+	 * Initialize the TSS.  sp0 points to the entry trampoline stack
+	 * regardless of what task is running.
 	 */
-	set_tss_desc(cpu, t);
+	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
+	load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
 
 	load_mm_ldt(&init_mm);
 
@@ -1612,7 +1692,6 @@ void cpu_init(void)
 	if (is_uv_system())
 		uv_cpu_init();
 
-	setup_fixmap_gdt(cpu);
 	load_fixmap_gdt(cpu);
 }
 
@@ -1622,7 +1701,7 @@ void cpu_init(void)
 {
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
-	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
 
 	wait_for_master_cpu(cpu);
 
@@ -1657,12 +1736,12 @@ void cpu_init(void)
 	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
 	 * task never enters user mode.
 	 */
-	set_tss_desc(cpu, t);
+	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
 
 	load_mm_ldt(&init_mm);
 
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
 #ifdef CONFIG_DOUBLEFAULT
 	/* Set up doublefault TSS pointer in the GDT */
@@ -1674,7 +1753,6 @@ void cpu_init(void)
 
 	fpu__init_cpu();
 
-	setup_fixmap_gdt(cpu);
 	load_fixmap_gdt(cpu);
 }
 #endif
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 0e662c55ae90..0b8cedb20d6d 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -50,25 +50,23 @@ static void doublefault_fn(void)
 		cpu_relax();
 }
 
-struct tss_struct doublefault_tss __cacheline_aligned = {
-	.x86_tss = {
-		.sp0		= STACK_START,
-		.ss0		= __KERNEL_DS,
-		.ldt		= 0,
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
-
-		.ip		= (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags		= X86_EFLAGS_SF | 0x2,
-		.sp		= STACK_START,
-		.es		= __USER_DS,
-		.cs		= __KERNEL_CS,
-		.ss		= __KERNEL_DS,
-		.ds		= __USER_DS,
-		.fs		= __KERNEL_PERCPU,
-
-		.__cr3		= __pa_nodebug(swapper_pg_dir),
-	}
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+	.sp0		= STACK_START,
+	.ss0		= __KERNEL_DS,
+	.ldt		= 0,
+	.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
+
+	.ip		= (unsigned long) doublefault_fn,
+	/* 0x2 bit is always set */
+	.flags		= X86_EFLAGS_SF | 0x2,
+	.sp		= STACK_START,
+	.es		= __USER_DS,
+	.cs		= __KERNEL_CS,
+	.ss		= __KERNEL_DS,
+	.ds		= __USER_DS,
+	.fs		= __KERNEL_PERCPU,
+
+	.__cr3		= __pa_nodebug(swapper_pg_dir),
 };
 
 /* dummy for do_double_fault() call */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index f13b4c00a5de..bbd6d986e2d0 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
 	return true;
 }
 
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+{
+	struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+
+	void *begin = ss;
+	void *end = ss + 1;
+
+	if ((void *)stack < begin || (void *)stack >= end)
+		return false;
+
+	info->type	= STACK_TYPE_SYSENTER;
+	info->begin	= begin;
+	info->end	= end;
+	info->next_sp	= NULL;
+
+	return true;
+}
+
 static void printk_stack_address(unsigned long address, int reliable,
 				 char *log_lvl)
 {
@@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable,
 	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
 }
 
+void show_iret_regs(struct pt_regs *regs)
+{
+	printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+	printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+		regs->sp, regs->flags);
+}
+
+static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+{
+	if (on_stack(info, regs, sizeof(*regs)))
+		__show_regs(regs, 0);
+	else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+			  IRET_FRAME_SIZE)) {
+		/*
+		 * When an interrupt or exception occurs in entry code, the
+		 * full pt_regs might not have been saved yet.  In that case
+		 * just print the iret frame.
+		 */
+		show_iret_regs(regs);
+	}
+}
+
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			unsigned long *stack, char *log_lvl)
 {
@@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	 *  - task stack
 	 *  - interrupt stack
 	 *  - HW exception stacks (double fault, nmi, debug, mce)
+	 *  - SYSENTER stack
 	 *
-	 * x86-32 can have up to three stacks:
+	 * x86-32 can have up to four stacks:
 	 *  - task stack
 	 *  - softirq stack
 	 *  - hardirq stack
+	 *  - SYSENTER stack
 	 */
 	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		const char *stack_name;
 
-		/*
-		 * If we overflowed the task stack into a guard page, jump back
-		 * to the bottom of the usable stack.
-		 */
-		if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
-			stack = task_stack_page(task);
-
-		if (get_stack_info(stack, task, &stack_info, &visit_mask))
-			break;
+		if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+			/*
+			 * We weren't on a valid stack.  It's possible that
+			 * we overflowed a valid stack into a guard page.
+			 * See if the next page up is valid so that we can
+			 * generate some kind of backtrace if this happens.
+			 */
+			stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+			if (get_stack_info(stack, task, &stack_info, &visit_mask))
+				break;
+		}
 
 		stack_name = stack_type_name(stack_info.type);
 		if (stack_name)
 			printk("%s <%s>\n", log_lvl, stack_name);
 
-		if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
-			__show_regs(regs, 0);
+		if (regs)
+			show_regs_safe(&stack_info, regs);
 
 		/*
 		 * Scan the stack, printing any text addresses we find.  At the
@@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
119 163
120 /* 164 /*
121 * Don't print regs->ip again if it was already printed 165 * Don't print regs->ip again if it was already printed
122 * by __show_regs() below. 166 * by show_regs_safe() below.
123 */ 167 */
124 if (regs && stack == &regs->ip) 168 if (regs && stack == &regs->ip)
125 goto next; 169 goto next;
@@ -155,8 +199,8 @@ next:
155 199
156 /* if the frame has entry regs, print them */ 200 /* if the frame has entry regs, print them */
157 regs = unwind_get_entry_regs(&state); 201 regs = unwind_get_entry_regs(&state);
158 if (regs && on_stack(&stack_info, regs, sizeof(*regs))) 202 if (regs)
159 __show_regs(regs, 0); 203 show_regs_safe(&stack_info, regs);
160 } 204 }
161 205
162 if (stack_name) 206 if (stack_name)
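
The retry above relies on PAGE_ALIGN() rounding the overflowed stack pointer up to the next page boundary before get_stack_info() is attempted a second time. A minimal userspace sketch of that rounding step, using DEMO_-prefixed stand-ins rather than the kernel's real PAGE_SIZE/PAGE_ALIGN macros:

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096UL
#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))
/* Round up to the next page boundary, the same shape as the kernel's PAGE_ALIGN(). */
#define DEMO_PAGE_ALIGN(addr)	(((addr) + DEMO_PAGE_SIZE - 1) & DEMO_PAGE_MASK)

int main(void)
{
	/* Pretend the saved stack pointer landed partway into a guard page. */
	unsigned long sp = 0x00007f1234561230UL;

	/* The dumper retries get_stack_info() at the next page up. */
	printf("sp        = %#lx\n", sp);
	printf("next page = %#lx\n", DEMO_PAGE_ALIGN(sp));
	return 0;
}

If get_stack_info() still fails on that next page, the walk gives up, so at most one extra page is probed.
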
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index daefae83a3aa..5ff13a6b3680 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
26 if (type == STACK_TYPE_SOFTIRQ) 26 if (type == STACK_TYPE_SOFTIRQ)
27 return "SOFTIRQ"; 27 return "SOFTIRQ";
28 28
29 if (type == STACK_TYPE_SYSENTER)
30 return "SYSENTER";
31
29 return NULL; 32 return NULL;
30} 33}
31 34
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
93 if (task != current) 96 if (task != current)
94 goto unknown; 97 goto unknown;
95 98
99 if (in_sysenter_stack(stack, info))
100 goto recursion_check;
101
96 if (in_hardirq_stack(stack, info)) 102 if (in_hardirq_stack(stack, info))
97 goto recursion_check; 103 goto recursion_check;
98 104
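
The in_sysenter_stack() helper called here (added in dumpstack.c above) boils down to a half-open [begin, end) containment test, where end is computed as ss + 1, i.e. one past the per-cpu SYSENTER stack. A compilable sketch of that test; the struct and names below are illustrative stand-ins, not the kernel's SYSENTER_stack:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_stack {
	unsigned long words[64];	/* stand-in for the real SYSENTER stack */
};

/* begin is inclusive, end is exclusive: end == ss + 1 points one past the stack. */
static bool in_demo_stack(const void *p, const struct demo_stack *ss)
{
	uintptr_t addr  = (uintptr_t)p;
	uintptr_t begin = (uintptr_t)ss;
	uintptr_t end   = (uintptr_t)(ss + 1);

	return !(addr < begin || addr >= end);
}

int main(void)
{
	struct demo_stack ss;

	printf("%d %d\n",
	       in_demo_stack(&ss.words[10], &ss),	/* inside   -> 1 */
	       in_demo_stack(&ss.words[64], &ss));	/* one past -> 0 */
	return 0;
}
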
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 88ce2ffdb110..abc828f8c297 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
37 if (type == STACK_TYPE_IRQ) 37 if (type == STACK_TYPE_IRQ)
38 return "IRQ"; 38 return "IRQ";
39 39
40 if (type == STACK_TYPE_SYSENTER)
41 return "SYSENTER";
42
40 if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) 43 if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
41 return exception_stack_names[type - STACK_TYPE_EXCEPTION]; 44 return exception_stack_names[type - STACK_TYPE_EXCEPTION];
42 45
@@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
115 if (in_irq_stack(stack, info)) 118 if (in_irq_stack(stack, info))
116 goto recursion_check; 119 goto recursion_check;
117 120
121 if (in_sysenter_stack(stack, info))
122 goto recursion_check;
123
118 goto unknown; 124 goto unknown;
119 125
120recursion_check: 126recursion_check:
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 3feb648781c4..2f723301eb58 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
67 * because the ->io_bitmap_max value must match the bitmap 67 * because the ->io_bitmap_max value must match the bitmap
68 * contents: 68 * contents:
69 */ 69 */
70 tss = &per_cpu(cpu_tss, get_cpu()); 70 tss = &per_cpu(cpu_tss_rw, get_cpu());
71 71
72 if (turn_on) 72 if (turn_on)
73 bitmap_clear(t->io_bitmap_ptr, from, num); 73 bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 49cfd9fe7589..68e1867cca80 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
219 /* high bit used in ret_from_ code */ 219 /* high bit used in ret_from_ code */
220 unsigned vector = ~regs->orig_ax; 220 unsigned vector = ~regs->orig_ax;
221 221
222 /*
223 * NB: Unlike exception entries, IRQ entries do not reliably
224 * handle context tracking in the low-level entry code. This is
225 * because syscall entries execute briefly with IRQs on before
226 * updating context tracking state, so we can take an IRQ from
227 * kernel mode with CONTEXT_USER. The low-level entry code only
228 * updates the context if we came from user mode, so we won't
229 * switch to CONTEXT_KERNEL. We'll fix that once the syscall
230 * code is cleaned up enough that we can cleanly defer enabling
231 * IRQs.
232 */
233
234 entering_irq(); 222 entering_irq();
235 223
236 /* entering_irq() tells RCU that we're not quiescent. Check it. */ 224 /* entering_irq() tells RCU that we're not quiescent. Check it. */
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 020efbf5786b..d86e344f5b3d 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
57 if (regs->sp >= estack_top && regs->sp <= estack_bottom) 57 if (regs->sp >= estack_top && regs->sp <= estack_bottom)
58 return; 58 return;
59 59
60 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", 60 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
61 current->comm, curbase, regs->sp, 61 current->comm, curbase, regs->sp,
62 irq_stack_top, irq_stack_bottom, 62 irq_stack_top, irq_stack_bottom,
63 estack_top, estack_bottom); 63 estack_top, estack_bottom, (void *)regs->ip);
64 64
65 if (sysctl_panic_on_stackoverflow) 65 if (sysctl_panic_on_stackoverflow)
66 panic("low stack detected by irq handler - check messages\n"); 66 panic("low stack detected by irq handler - check messages\n");
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index ac0be8283325..9edadabf04f6 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
10DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); 10DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); 11DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
12DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); 12DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
13DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
14DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); 13DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
15 14
16DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); 15DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
60 PATCH_SITE(pv_mmu_ops, read_cr2); 59 PATCH_SITE(pv_mmu_ops, read_cr2);
61 PATCH_SITE(pv_mmu_ops, read_cr3); 60 PATCH_SITE(pv_mmu_ops, read_cr3);
62 PATCH_SITE(pv_mmu_ops, write_cr3); 61 PATCH_SITE(pv_mmu_ops, write_cr3);
63 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
64 PATCH_SITE(pv_cpu_ops, wbinvd); 62 PATCH_SITE(pv_cpu_ops, wbinvd);
65#if defined(CONFIG_PARAVIRT_SPINLOCKS) 63#if defined(CONFIG_PARAVIRT_SPINLOCKS)
66 case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): 64 case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index bb988a24db92..aed9d94bd46f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
47 * section. Since TSS's are completely CPU-local, we want them 47 * section. Since TSS's are completely CPU-local, we want them
48 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 48 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
49 */ 49 */
50__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { 50__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
51 .x86_tss = { 51 .x86_tss = {
52 /* 52 /*
53 * .sp0 is only used when entering ring 0 from a lower 53 * .sp0 is only used when entering ring 0 from a lower
@@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
56 * Poison it. 56 * Poison it.
57 */ 57 */
58 .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, 58 .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
59
60#ifdef CONFIG_X86_64
61 /*
62 * .sp1 is cpu_current_top_of_stack. The init task never
63 * runs user code, but cpu_current_top_of_stack should still
64 * be well defined before the first context switch.
65 */
66 .sp1 = TOP_OF_INIT_STACK,
67#endif
68
59#ifdef CONFIG_X86_32 69#ifdef CONFIG_X86_32
60 .ss0 = __KERNEL_DS, 70 .ss0 = __KERNEL_DS,
61 .ss1 = __KERNEL_CS, 71 .ss1 = __KERNEL_CS,
@@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
71 */ 81 */
72 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, 82 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
73#endif 83#endif
74#ifdef CONFIG_X86_32
75 .SYSENTER_stack_canary = STACK_END_MAGIC,
76#endif
77}; 84};
78EXPORT_PER_CPU_SYMBOL(cpu_tss); 85EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
79 86
80DEFINE_PER_CPU(bool, __tss_limit_invalid); 87DEFINE_PER_CPU(bool, __tss_limit_invalid);
81EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); 88EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
104 struct fpu *fpu = &t->fpu; 111 struct fpu *fpu = &t->fpu;
105 112
106 if (bp) { 113 if (bp) {
107 struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); 114 struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
108 115
109 t->io_bitmap_ptr = NULL; 116 t->io_bitmap_ptr = NULL;
110 clear_thread_flag(TIF_IO_BITMAP); 117 clear_thread_flag(TIF_IO_BITMAP);
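
The cpu_tss_rw initializer above combines designated initializers with the GNU range-initializer extension ([0 ... N] = ~0) for the I/O bitmap. A small compilable sketch of the same two constructs, using made-up DEMO names rather than the kernel's tss_struct, and assuming a 64-bit unsigned long as on x86-64:

#include <stdio.h>

#define DEMO_IO_BITMAP_LONGS	4
#define DEMO_BITS_PER_LONG	(8 * (int)sizeof(unsigned long))

struct demo_tss {
	unsigned long sp0;
	unsigned long io_bitmap[DEMO_IO_BITMAP_LONGS + 1];
};

static struct demo_tss demo_tss = {
	/* Same "poison" idea as .sp0 above: a value no real stack will ever use. */
	.sp0	   = (1UL << (DEMO_BITS_PER_LONG - 1)) + 1,
	/* GNU C range initializer: every element all-ones, i.e. all ports denied. */
	.io_bitmap = { [0 ... DEMO_IO_BITMAP_LONGS] = ~0UL },
};

int main(void)
{
	printf("sp0 = %#lx, io_bitmap[0] = %#lx\n", demo_tss.sp0, demo_tss.io_bitmap[0]);
	return 0;
}
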
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 45bf0c5f93e1..5224c6099184 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
234 struct fpu *prev_fpu = &prev->fpu; 234 struct fpu *prev_fpu = &prev->fpu;
235 struct fpu *next_fpu = &next->fpu; 235 struct fpu *next_fpu = &next->fpu;
236 int cpu = smp_processor_id(); 236 int cpu = smp_processor_id();
237 struct tss_struct *tss = &per_cpu(cpu_tss, cpu); 237 struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
238 238
239 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 239 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
240 240
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eeeb34f85c25..c75466232016 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
69 unsigned int fsindex, gsindex; 69 unsigned int fsindex, gsindex;
70 unsigned int ds, cs, es; 70 unsigned int ds, cs, es;
71 71
72 printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); 72 show_iret_regs(regs);
73 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, 73
74 regs->sp, regs->flags);
75 if (regs->orig_ax != -1) 74 if (regs->orig_ax != -1)
76 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); 75 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
77 else 76 else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
88 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", 87 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
89 regs->r13, regs->r14, regs->r15); 88 regs->r13, regs->r14, regs->r15);
90 89
90 if (!all)
91 return;
92
91 asm("movl %%ds,%0" : "=r" (ds)); 93 asm("movl %%ds,%0" : "=r" (ds));
92 asm("movl %%cs,%0" : "=r" (cs)); 94 asm("movl %%cs,%0" : "=r" (cs));
93 asm("movl %%es,%0" : "=r" (es)); 95 asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
98 rdmsrl(MSR_GS_BASE, gs); 100 rdmsrl(MSR_GS_BASE, gs);
99 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 101 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
100 102
101 if (!all)
102 return;
103
104 cr0 = read_cr0(); 103 cr0 = read_cr0();
105 cr2 = read_cr2(); 104 cr2 = read_cr2();
106 cr3 = __read_cr3(); 105 cr3 = __read_cr3();
@@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
400 struct fpu *prev_fpu = &prev->fpu; 399 struct fpu *prev_fpu = &prev->fpu;
401 struct fpu *next_fpu = &next->fpu; 400 struct fpu *next_fpu = &next->fpu;
402 int cpu = smp_processor_id(); 401 int cpu = smp_processor_id();
403 struct tss_struct *tss = &per_cpu(cpu_tss, cpu); 402 struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
404 403
405 WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && 404 WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
406 this_cpu_read(irq_count) != -1); 405 this_cpu_read(irq_count) != -1);
@@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
462 * Switch the PDA and FPU contexts. 461 * Switch the PDA and FPU contexts.
463 */ 462 */
464 this_cpu_write(current_task, next_p); 463 this_cpu_write(current_task, next_p);
464 this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
465 465
466 /* Reload sp0. */ 466 /* Reload sp0. */
467 update_sp0(next_p); 467 update_sp0(next_p);
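
The new cpu_current_top_of_stack update leans on a layout convention the rest of this series also uses: the user-mode pt_regs area sits flush against the top of the task's thread stack, so "top of stack" and "one past the pt_regs" are the same address. A userspace sketch of that relationship; the struct and sizes are illustrative stand-ins, not the kernel's pt_regs or THREAD_SIZE:

#include <stdio.h>

#define DEMO_THREAD_SIZE	16384UL

struct demo_regs {
	unsigned long ip, cs, flags, sp, ss;	/* stand-in, much smaller than pt_regs */
};

int main(void)
{
	static unsigned char thread_stack[DEMO_THREAD_SIZE];
	unsigned long top = (unsigned long)thread_stack + DEMO_THREAD_SIZE;

	/* A pt_regs-style frame carved out at the very top of the thread stack ... */
	struct demo_regs *regs = (struct demo_regs *)top - 1;

	/* ... so the top of the stack is recoverable as regs + 1, and the frame is
	 * recoverable from the top as "(struct pt_regs *)top - 1", which is the
	 * arithmetic sync_regs() uses in traps.c below. */
	printf("top      = %#lx\n", top);
	printf("regs     = %p\n", (void *)regs);
	printf("regs + 1 = %p\n", (void *)(regs + 1));
	return 0;
}
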
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 989514c94a55..e98f8b66a460 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
348 348
349 /* 349 /*
350 * If IRET takes a non-IST fault on the espfix64 stack, then we 350 * If IRET takes a non-IST fault on the espfix64 stack, then we
351 * end up promoting it to a doublefault. In that case, modify 351 * end up promoting it to a doublefault. In that case, take
352 * the stack to make it look like we just entered the #GP 352 * advantage of the fact that we're not using the normal (TSS.sp0)
353 * handler from user space, similar to bad_iret. 353 * stack right now. We can write a fake #GP(0) frame at TSS.sp0
354 * and then modify our own IRET frame so that, when we return,
355 * we land directly at the #GP(0) vector with the stack already
356 * set up according to its expectations.
357 *
358 * The net result is that our #GP handler will think that we
359 * entered from usermode with the bad user context.
354 * 360 *
355 * No need for ist_enter here because we don't use RCU. 361 * No need for ist_enter here because we don't use RCU.
356 */ 362 */
@@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
358 regs->cs == __KERNEL_CS && 364 regs->cs == __KERNEL_CS &&
359 regs->ip == (unsigned long)native_irq_return_iret) 365 regs->ip == (unsigned long)native_irq_return_iret)
360 { 366 {
361 struct pt_regs *normal_regs = task_pt_regs(current); 367 struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
362 368
363 /* Fake a #GP(0) from userspace. */ 369 /*
364 memmove(&normal_regs->ip, (void *)regs->sp, 5*8); 370 * regs->sp points to the failing IRET frame on the
365 normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ 371 * ESPFIX64 stack. Copy it to the entry stack. This fills
372 * in gpregs->ss through gpregs->ip.
373 *
374 */
375 memmove(&gpregs->ip, (void *)regs->sp, 5*8);
376 gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
377
378 /*
379 * Adjust our frame so that we return straight to the #GP
380 * vector with the expected RSP value. This is safe because
381 * we won't enable interrupts or schedule before we invoke
382 * general_protection, so nothing will clobber the stack
383 * frame we just set up.
384 */
366 regs->ip = (unsigned long)general_protection; 385 regs->ip = (unsigned long)general_protection;
367 regs->sp = (unsigned long)&normal_regs->orig_ax; 386 regs->sp = (unsigned long)&gpregs->orig_ax;
368 387
369 return; 388 return;
370 } 389 }
@@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
389 * 408 *
390 * Processors update CR2 whenever a page fault is detected. If a 409 * Processors update CR2 whenever a page fault is detected. If a
391 * second page fault occurs while an earlier page fault is being 410 * second page fault occurs while an earlier page fault is being
392 * deliv- ered, the faulting linear address of the second fault will 411 * delivered, the faulting linear address of the second fault will
393 * overwrite the contents of CR2 (replacing the previous 412 * overwrite the contents of CR2 (replacing the previous
394 * address). These updates to CR2 occur even if the page fault 413 * address). These updates to CR2 occur even if the page fault
395 * results in a double fault or occurs during the delivery of a 414 * results in a double fault or occurs during the delivery of a
@@ -605,14 +624,15 @@ NOKPROBE_SYMBOL(do_int3);
605 624
606#ifdef CONFIG_X86_64 625#ifdef CONFIG_X86_64
607/* 626/*
608 * Help handler running on IST stack to switch off the IST stack if the 627 * Help handler running on a per-cpu (IST or entry trampoline) stack
609 * interrupted code was in user mode. The actual stack switch is done in 628 * to switch to the normal thread stack if the interrupted code was in
610 * entry_64.S 629 * user mode. The actual stack switch is done in entry_64.S
611 */ 630 */
612asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) 631asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
613{ 632{
614 struct pt_regs *regs = task_pt_regs(current); 633 struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
615 *regs = *eregs; 634 if (regs != eregs)
635 *regs = *eregs;
616 return regs; 636 return regs;
617} 637}
618NOKPROBE_SYMBOL(sync_regs); 638NOKPROBE_SYMBOL(sync_regs);
@@ -628,13 +648,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
628 /* 648 /*
629 * This is called from entry_64.S early in handling a fault 649 * This is called from entry_64.S early in handling a fault
630 * caused by a bad iret to user mode. To handle the fault 650 * caused by a bad iret to user mode. To handle the fault
631 * correctly, we want move our stack frame to task_pt_regs 651 * correctly, we want to move our stack frame to where it would
632 * and we want to pretend that the exception came from the 652 * be had we entered directly on the entry stack (rather than
633 * iret target. 653 * just below the IRET frame) and we want to pretend that the
654 * exception came from the IRET target.
634 */ 655 */
635 struct bad_iret_stack *new_stack = 656 struct bad_iret_stack *new_stack =
636 container_of(task_pt_regs(current), 657 (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
637 struct bad_iret_stack, regs);
638 658
639 /* Copy the IRET target to the new stack. */ 659 /* Copy the IRET target to the new stack. */
640 memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); 660 memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -795,14 +815,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
795 debug_stack_usage_dec(); 815 debug_stack_usage_dec();
796 816
797exit: 817exit:
798#if defined(CONFIG_X86_32)
799 /*
800 * This is the most likely code path that involves non-trivial use
801 * of the SYSENTER stack. Check that we haven't overrun it.
802 */
803 WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
804 "Overran or corrupted SYSENTER stack\n");
805#endif
806 ist_exit(regs); 818 ist_exit(regs);
807} 819}
808NOKPROBE_SYMBOL(do_debug); 820NOKPROBE_SYMBOL(do_debug);
@@ -929,6 +941,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
929 941
930void __init trap_init(void) 942void __init trap_init(void)
931{ 943{
944 /* Init cpu_entry_area before IST entries are set up */
945 setup_cpu_entry_areas();
946
932 idt_setup_traps(); 947 idt_setup_traps();
933 948
934 /* 949 /*
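
The espfix64 branch of do_double_fault() above is two pointer-arithmetic steps: carve a pt_regs-sized slot just below TSS.sp0 ((struct pt_regs *)sp0 - 1), then memmove() the five-word hardware IRET frame into the ip..ss tail of that slot and fake the missing error code. A standalone sketch of those steps; the DEMO struct only mimics the IRET-frame tail and is not the kernel's pt_regs layout:

#include <stdio.h>
#include <string.h>

/* Last five fields in hardware IRET-frame order: ip, cs, flags, sp, ss. */
struct demo_regs {
	unsigned long orig_ax;
	unsigned long ip, cs, flags, sp, ss;
};

int main(void)
{
	unsigned long entry_stack[128];
	unsigned long *sp0 = entry_stack + 128;		/* exclusive top, like TSS.sp0 */

	/* The failing IRET frame as it would sit on the espfix64 stack. */
	unsigned long iret_frame[5] = { 0x1111, 0x10, 0x2, 0x2222, 0x2b };

	/* Step 1: carve a register frame out of the space just below the top. */
	struct demo_regs *gpregs = (struct demo_regs *)sp0 - 1;

	/* Step 2: copy the five words into gpregs->ip .. gpregs->ss (5*8 bytes
	 * in the kernel code) and supply the lost #GP error code. */
	memmove(&gpregs->ip, iret_frame, sizeof(iret_frame));
	gpregs->orig_ax = 0;

	printf("gpregs = %p, ip = %#lx, ss = %#lx\n",
	       (void *)gpregs, gpregs->ip, gpregs->ss);
	return 0;
}

After this, pointing regs->ip at general_protection and regs->sp at &gpregs->orig_ax (as the hunk does) makes the eventual return land in the #GP handler with that frame as its pt_regs.
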
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a3f973b2c97a..be86a865087a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
253 return NULL; 253 return NULL;
254} 254}
255 255
256static bool stack_access_ok(struct unwind_state *state, unsigned long addr, 256static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
257 size_t len) 257 size_t len)
258{ 258{
259 struct stack_info *info = &state->stack_info; 259 struct stack_info *info = &state->stack_info;
260 void *addr = (void *)_addr;
260 261
261 /* 262 if (!on_stack(info, addr, len) &&
262 * If the address isn't on the current stack, switch to the next one. 263 (get_stack_info(addr, state->task, info, &state->stack_mask)))
263 * 264 return false;
264 * We may have to traverse multiple stacks to deal with the possibility
265 * that info->next_sp could point to an empty stack and the address
266 * could be on a subsequent stack.
267 */
268 while (!on_stack(info, (void *)addr, len))
269 if (get_stack_info(info->next_sp, state->task, info,
270 &state->stack_mask))
271 return false;
272 265
273 return true; 266 return true;
274} 267}
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
283 return true; 276 return true;
284} 277}
285 278
286#define REGS_SIZE (sizeof(struct pt_regs))
287#define SP_OFFSET (offsetof(struct pt_regs, sp))
288#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
289#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
290
291static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, 279static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
292 unsigned long *ip, unsigned long *sp, bool full) 280 unsigned long *ip, unsigned long *sp)
293{ 281{
294 size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; 282 struct pt_regs *regs = (struct pt_regs *)addr;
295 size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
296 struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
297
298 if (IS_ENABLED(CONFIG_X86_64)) {
299 if (!stack_access_ok(state, addr, regs_size))
300 return false;
301 283
302 *ip = regs->ip; 284 /* x86-32 support will be more complicated due to the &regs->sp hack */
303 *sp = regs->sp; 285 BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
304 286
305 return true; 287 if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
306 }
307
308 if (!stack_access_ok(state, addr, sp_offset))
309 return false; 288 return false;
310 289
311 *ip = regs->ip; 290 *ip = regs->ip;
291 *sp = regs->sp;
292 return true;
293}
312 294
313 if (user_mode(regs)) { 295static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
314 if (!stack_access_ok(state, addr + sp_offset, 296 unsigned long *ip, unsigned long *sp)
315 REGS_SIZE - SP_OFFSET)) 297{
316 return false; 298 struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
317 299
318 *sp = regs->sp; 300 if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
319 } else 301 return false;
320 *sp = (unsigned long)&regs->sp;
321 302
303 *ip = regs->ip;
304 *sp = regs->sp;
322 return true; 305 return true;
323} 306}
324 307
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
327 unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; 310 unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
328 enum stack_type prev_type = state->stack_info.type; 311 enum stack_type prev_type = state->stack_info.type;
329 struct orc_entry *orc; 312 struct orc_entry *orc;
330 struct pt_regs *ptregs;
331 bool indirect = false; 313 bool indirect = false;
332 314
333 if (unwind_done(state)) 315 if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
435 break; 417 break;
436 418
437 case ORC_TYPE_REGS: 419 case ORC_TYPE_REGS:
438 if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { 420 if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
439 orc_warn("can't dereference registers at %p for ip %pB\n", 421 orc_warn("can't dereference registers at %p for ip %pB\n",
440 (void *)sp, (void *)orig_ip); 422 (void *)sp, (void *)orig_ip);
441 goto done; 423 goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
447 break; 429 break;
448 430
449 case ORC_TYPE_REGS_IRET: 431 case ORC_TYPE_REGS_IRET:
450 if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { 432 if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
451 orc_warn("can't dereference iret registers at %p for ip %pB\n", 433 orc_warn("can't dereference iret registers at %p for ip %pB\n",
452 (void *)sp, (void *)orig_ip); 434 (void *)sp, (void *)orig_ip);
453 goto done; 435 goto done;
454 } 436 }
455 437
456 ptregs = container_of((void *)sp, struct pt_regs, ip); 438 state->regs = (void *)sp - IRET_FRAME_OFFSET;
457 if ((unsigned long)ptregs >= prev_sp && 439 state->full_regs = false;
458 on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
459 state->regs = ptregs;
460 state->full_regs = false;
461 } else
462 state->regs = NULL;
463
464 state->signal = true; 440 state->signal = true;
465 break; 441 break;
466 442
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
553 } 529 }
554 530
555 if (get_stack_info((unsigned long *)state->sp, state->task, 531 if (get_stack_info((unsigned long *)state->sp, state->task,
556 &state->stack_info, &state->stack_mask)) 532 &state->stack_info, &state->stack_mask)) {
557 return; 533 /*
534 * We weren't on a valid stack. It's possible that
535 * we overflowed a valid stack into a guard page.
536 * See if the next page up is valid so that we can
537 * generate some kind of backtrace if this happens.
538 */
539 void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
540 if (get_stack_info(next_page, state->task, &state->stack_info,
541 &state->stack_mask))
542 return;
543 }
558 544
559 /* 545 /*
560 * The caller can provide the address of the first frame directly 546 * The caller can provide the address of the first frame directly
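
deref_stack_iret_regs() backs up from a bare IRET frame to the (mostly missing) pt_regs that would contain it, using IRET_FRAME_OFFSET and IRET_FRAME_SIZE. Those constants amount to offsetof/sizeof arithmetic on the ip..ss tail of pt_regs. A hedged sketch of that arithmetic; the DEMO struct is modelled on the x86-64 pt_regs field order but is only an illustration:

#include <stddef.h>
#include <stdio.h>

/* Field order modelled on x86-64 struct pt_regs; the last five entries are
 * the hardware IRET frame. */
struct demo_pt_regs {
	unsigned long r15, r14, r13, r12, bp, bx, r11, r10, r9, r8;
	unsigned long ax, cx, dx, si, di, orig_ax;
	unsigned long ip, cs, flags, sp, ss;
};

#define DEMO_IRET_FRAME_OFFSET	offsetof(struct demo_pt_regs, ip)
#define DEMO_IRET_FRAME_SIZE	(sizeof(struct demo_pt_regs) - DEMO_IRET_FRAME_OFFSET)

int main(void)
{
	unsigned long stack[64];
	/* "addr" is where the unwinder found the IRET frame on the stack ... */
	void *addr = &stack[32];
	/* ... and backing up by the offset gives the notional pt_regs, the same
	 * "(void *)addr - IRET_FRAME_OFFSET" step as in deref_stack_iret_regs(). */
	struct demo_pt_regs *regs =
		(struct demo_pt_regs *)((char *)addr - DEMO_IRET_FRAME_OFFSET);

	printf("offset = %zu, size = %zu, regs = %p\n",
	       DEMO_IRET_FRAME_OFFSET, DEMO_IRET_FRAME_SIZE, (void *)regs);
	return 0;
}
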
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a4009fb9be87..d2a8b5a24a44 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -107,6 +107,15 @@ SECTIONS
107 SOFTIRQENTRY_TEXT 107 SOFTIRQENTRY_TEXT
108 *(.fixup) 108 *(.fixup)
109 *(.gnu.warning) 109 *(.gnu.warning)
110
111#ifdef CONFIG_X86_64
112 . = ALIGN(PAGE_SIZE);
113 _entry_trampoline = .;
114 *(.entry_trampoline)
115 . = ALIGN(PAGE_SIZE);
116 ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
117#endif
118
110 /* End of text section */ 119 /* End of text section */
111 _etext = .; 120 _etext = .;
112 } :text = 0x9090 121 } :text = 0x9090
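
The linker-script hunk page-aligns a slot for .entry_trampoline and then ASSERTs that the section is exactly one page, presumably so code that maps the trampoline page-by-page can rely on that size. The same "break the build if the size is wrong" idea, expressed as a C11 static_assert sketch with made-up DEMO names (not part of the patch):

#include <assert.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE	4096

/* Stand-in for a blob that must occupy exactly one page. */
struct demo_trampoline {
	unsigned char insns[DEMO_PAGE_SIZE];
};

/* Compile-time check, same spirit as the ASSERT() in the linker script above. */
static_assert(sizeof(struct demo_trampoline) == DEMO_PAGE_SIZE,
	      "entry trampoline is too big");

int main(void)
{
	printf("trampoline size: %zu\n", sizeof(struct demo_trampoline));
	return 0;
}
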
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8eba631c4dbd..023afa0c8887 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2302 * processors. See 22.2.4. 2302 * processors. See 22.2.4.
2303 */ 2303 */
2304 vmcs_writel(HOST_TR_BASE, 2304 vmcs_writel(HOST_TR_BASE,
2305 (unsigned long)this_cpu_ptr(&cpu_tss)); 2305 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2306 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 2306 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
2307 2307
2308 /* 2308 /*
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 553f8fd23cc4..4846eff7e4c8 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
107 delay = min_t(u64, MWAITX_MAX_LOOPS, loops); 107 delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
108 108
109 /* 109 /*
110 * Use cpu_tss as a cacheline-aligned, seldomly 110 * Use cpu_tss_rw as a cacheline-aligned, seldomly
111 * accessed per-cpu variable as the monitor target. 111 * accessed per-cpu variable as the monitor target.
112 */ 112 */
113 __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); 113 __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
114 114
115 /* 115 /*
116 * AMD, like Intel, supports the EAX hint and EAX=0xf 116 * AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 99dfed6dfef8..9ec70d780f1f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -277,6 +277,7 @@ void __init kasan_early_init(void)
277void __init kasan_init(void) 277void __init kasan_init(void)
278{ 278{
279 int i; 279 int i;
280 void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
280 281
281#ifdef CONFIG_KASAN_INLINE 282#ifdef CONFIG_KASAN_INLINE
282 register_die_notifier(&kasan_die_notifier); 283 register_die_notifier(&kasan_die_notifier);
@@ -329,8 +330,23 @@ void __init kasan_init(void)
329 (unsigned long)kasan_mem_to_shadow(_end), 330 (unsigned long)kasan_mem_to_shadow(_end),
330 early_pfn_to_nid(__pa(_stext))); 331 early_pfn_to_nid(__pa(_stext)));
331 332
333 shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
334 shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
335 shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
336 PAGE_SIZE);
337
338 shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
339 shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
340 shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
341 PAGE_SIZE);
342
332 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), 343 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
333 (void *)KASAN_SHADOW_END); 344 shadow_cpu_entry_begin);
345
346 kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
347 (unsigned long)shadow_cpu_entry_end, 0);
348
349 kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
334 350
335 load_cr3(init_top_pgt); 351 load_cr3(init_top_pgt);
336 __flush_tlb_all(); 352 __flush_tlb_all();
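
The new KASAN hunk converts the cpu_entry_area fixmap range to shadow addresses and widens it to whole pages before populating real shadow for it, with zero shadow on either side. The conversion is the usual generic-KASAN mapping, shadow = (addr >> 3) + shadow_offset (one shadow byte per 8 bytes of memory). The sketch below shows that mapping plus the round-down/round-up-to-page step, using DEMO constants with illustrative values rather than the kernel's:

#include <stdio.h>

#define DEMO_PAGE_SIZE		4096UL
#define DEMO_SHADOW_SCALE	3			/* one shadow byte covers 8 bytes */
#define DEMO_SHADOW_OFFSET	0xdffffc0000000000UL	/* illustrative offset */

static unsigned long demo_mem_to_shadow(unsigned long addr)
{
	return (addr >> DEMO_SHADOW_SCALE) + DEMO_SHADOW_OFFSET;
}

static unsigned long round_down_page(unsigned long addr)
{
	return addr & ~(DEMO_PAGE_SIZE - 1);
}

static unsigned long round_up_page(unsigned long addr)
{
	return (addr + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long begin = 0xffffffffff580000UL;	/* pretend fixmap bottom */
	unsigned long end   = 0xffffffffff5a0000UL;	/* pretend fixmap top + 1 page */

	unsigned long sbegin = round_down_page(demo_mem_to_shadow(begin));
	unsigned long send   = round_up_page(demo_mem_to_shadow(end));

	printf("shadow begin = %#lx\n", sbegin);
	printf("shadow end   = %#lx\n", send);
	return 0;
}

Rounding the begin down and the end up means kasan_populate_shadow() works on whole shadow pages, matching the three-way split in the hunk above.
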
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 36a28eddb435..a7d966964c6f 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -152,17 +152,19 @@ static void do_fpu_end(void)
152static void fix_processor_context(void) 152static void fix_processor_context(void)
153{ 153{
154 int cpu = smp_processor_id(); 154 int cpu = smp_processor_id();
155 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
156#ifdef CONFIG_X86_64 155#ifdef CONFIG_X86_64
157 struct desc_struct *desc = get_cpu_gdt_rw(cpu); 156 struct desc_struct *desc = get_cpu_gdt_rw(cpu);
158 tss_desc tss; 157 tss_desc tss;
159#endif 158#endif
160 set_tss_desc(cpu, t); /* 159
161 * This just modifies memory; should not be 160 /*
162 * necessary. But... This is necessary, because 161 * We need to reload TR, which requires that we change the
163 * 386 hardware has concept of busy TSS or some 162 * GDT entry to indicate "available" first.
164 * similar stupidity. 163 *
165 */ 164 * XXX: This could probably all be replaced by a call to
165 * force_reload_TR().
166 */
167 set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
166 168
167#ifdef CONFIG_X86_64 169#ifdef CONFIG_X86_64
168 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); 170 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f2414c6c5e7c..7beeee1443b3 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -826,7 +826,7 @@ static void xen_load_sp0(unsigned long sp0)
826 mcs = xen_mc_entry(0); 826 mcs = xen_mc_entry(0);
827 MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); 827 MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
828 xen_mc_issue(PARAVIRT_LAZY_CPU); 828 xen_mc_issue(PARAVIRT_LAZY_CPU);
829 this_cpu_write(cpu_tss.x86_tss.sp0, sp0); 829 this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
830} 830}
831 831
832void xen_set_iopl_mask(unsigned mask) 832void xen_set_iopl_mask(unsigned mask)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index fc048ec686e7..6cf801ca1142 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2272#endif 2272#endif
2273 case FIX_TEXT_POKE0: 2273 case FIX_TEXT_POKE0:
2274 case FIX_TEXT_POKE1: 2274 case FIX_TEXT_POKE1:
2275 case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: 2275 case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
2276 /* All local page mappings */ 2276 /* All local page mappings */
2277 pte = pfn_pte(phys, prot); 2277 pte = pfn_pte(phys, prot);
2278 break; 2278 break;