commit    64a48099b3b31568ac45716b7fafcb74a0c2fcfe
tree      0652431aeb450bbfa74b9be8b7d813ac8511aec3
parent    1291a0d5049dbc06baaaf66a9ff3f53db493b19b
parent    6cbd2171e89b13377261d15e64384df60ecb530e
author    Linus Torvalds <torvalds@linux-foundation.org>  2017-12-18 11:59:15 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-12-18 11:59:15 -0500
Merge branch 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 syscall entry code changes for PTI from Ingo Molnar:
"The main changes here are Andy Lutomirski's changes to switch the
x86-64 entry code to use the 'per CPU entry trampoline stack'. This,
besides helping fix KASLR leaks (the pending Page Table Isolation
(PTI) work), also robustifies the x86 entry code"
* 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
x86/cpufeatures: Make CPU bugs sticky
x86/paravirt: Provide a way to check for hypervisors
x86/paravirt: Dont patch flush_tlb_single
x86/entry/64: Make cpu_entry_area.tss read-only
x86/entry: Clean up the SYSENTER_stack code
x86/entry/64: Remove the SYSENTER stack canary
x86/entry/64: Move the IST stacks into struct cpu_entry_area
x86/entry/64: Create a per-CPU SYSCALL entry trampoline
x86/entry/64: Return to userspace from the trampoline stack
x86/entry/64: Use a per-CPU trampoline stack for IDT entries
x86/espfix/64: Stop assuming that pt_regs is on the entry stack
x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0
x86/entry: Remap the TSS into the CPU entry area
x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct
x86/dumpstack: Handle stack overflow on all stacks
x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss
x86/kasan/64: Teach KASAN about the cpu_entry_area
x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area
x86/entry/gdt: Put per-CPU GDT remaps in ascending order
x86/dumpstack: Add get_stack_info() support for the SYSENTER stack
...
40 files changed, 691 insertions, 286 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4838037f97f6..bd8b57a5c874 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -941,7 +941,8 @@ ENTRY(debug)
 	movl	%esp, %eax			# pt_regs pointer
 
 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
@@ -984,7 +985,8 @@ ENTRY(nmi)
 	movl	%esp, %eax			# pt_regs pointer
 
 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
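Both hunks above compute the same range check: after the subl, %ecx holds (end of SYSENTER_stack) - %esp, and an unsigned below-compare against the stack size is true exactly when %esp points into the stack. A minimal user-space sketch of that trick, with made-up constants (nothing here is a kernel value):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors subl/cmpl/jb: unsigned (end - sp) < size holds exactly when
 * end - size < sp <= end, i.e. when sp points into the stack.  If sp is
 * above end, the subtraction wraps to a huge value and the test fails,
 * so one unsigned comparison covers both bounds. */
static bool on_stack(uint32_t sp, uint32_t stack_end, uint32_t stack_size)
{
	return (uint32_t)(stack_end - sp) < stack_size;
}

int main(void)
{
	uint32_t end = 0x1000, size = 0x100;

	printf("%d %d %d\n",
	       on_stack(0x0fff, end, size),	/* 1: inside            */
	       on_stack(0x0e00, end, size),	/* 0: below the stack   */
	       on_stack(0x2000, end, size));	/* 0: above, wraps huge */
	return 0;
}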
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f81d50d7ceac..423885bee398 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -140,6 +140,64 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
+	.pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline.  This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address).  So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped entry
+ * trampoline.  We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH	CPU_ENTRY_AREA_SYSENTER_stack + \
+			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+	UNWIND_HINT_EMPTY
+	swapgs
+
+	/* Stash the user RSP. */
+	movq	%rsp, RSP_SCRATCH
+
+	/* Load the top of the task stack into RSP */
+	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+	/* Start building the simulated IRET frame. */
+	pushq	$__USER_DS			/* pt_regs->ss */
+	pushq	RSP_SCRATCH			/* pt_regs->sp */
+	pushq	%r11				/* pt_regs->flags */
+	pushq	$__USER_CS			/* pt_regs->cs */
+	pushq	%rcx				/* pt_regs->ip */
+
+	/*
+	 * x86 lacks a near absolute jump, and we can't jump to the real
+	 * entry text with a relative jump.  We could push the target
+	 * address and then use retq, but this destroys the pipeline on
+	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
+	 * spill RDI and restore it in a second-stage trampoline.
+	 */
+	pushq	%rdi
+	movq	$entry_SYSCALL_64_stage2, %rdi
+	jmp	*%rdi
+END(entry_SYSCALL_64_trampoline)
+
+	.popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+	UNWIND_HINT_EMPTY
+	popq	%rdi
+	jmp	entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
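The CPU_ENTRY_AREA macro above relies on one invariant: in each remapped copy of the trampoline, a %rip-relative reference to _entry_trampoline resolves to that copy's own trampoline page, so subtracting the field's offset within cpu_entry_area lands on the base of the enclosing area. A user-space sketch of the same arithmetic (the struct layout below is simplified and its field sizes are invented):

#include <stddef.h>
#include <stdio.h>

struct cpu_entry_area_sketch {		/* simplified stand-in */
	char gdt[4096];
	char sysenter_stack[4096];
	char tss[3 * 4096];
	char entry_trampoline[4096];
};

int main(void)
{
	struct cpu_entry_area_sketch area;	/* pretend fixmap alias */
	/* What _entry_trampoline(%rip) would yield inside the remapped copy. */
	char *rip_relative_trampoline = area.entry_trampoline;

	/* CPU_ENTRY_AREA = _entry_trampoline(%rip) - offset of the field. */
	struct cpu_entry_area_sketch *base =
		(struct cpu_entry_area_sketch *)(rip_relative_trampoline -
			offsetof(struct cpu_entry_area_sketch, entry_trampoline));

	printf("recovered base ok: %d\n", base == &area);	/* prints 1 */
	return 0;
}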
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
@@ -330,8 +388,24 @@ syscall_return_via_sysret:
 	popq	%rsi	/* skip rcx */
 	popq	%rdx
 	popq	%rsi
+
+	/*
+	 * Now all regs are restored except RSP and RDI.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+	pushq	RSP-RDI(%rdi)	/* RSP */
+	pushq	(%rdi)		/* RDI */
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+
 	popq	%rdi
-	movq	RSP-ORIG_RAX(%rsp), %rsp
+	popq	%rsp
 	USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
@@ -466,12 +540,13 @@ END(irq_entries_start)
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 #ifdef CONFIG_DEBUG_ENTRY
-	pushfq
-	testl	$X86_EFLAGS_IF, (%rsp)
+	pushq	%rax
+	SAVE_FLAGS(CLBR_RAX)
+	testl	$X86_EFLAGS_IF, %eax
 	jz	.Lokay_\@
 	ud2
 .Lokay_\@:
-	addq	$8, %rsp
+	popq	%rax
 #endif
 .endm
 
@@ -563,6 +638,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 .macro interrupt func
 	cld
+
+	testb	$3, CS-ORIG_RAX(%rsp)
+	jz	1f
+	SWAPGS
+	call	switch_to_thread_stack
+1:
+
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
@@ -572,12 +654,8 @@ END(irq_entries_start)
 	jz	1f
 
 	/*
-	 * IRQ from user mode.  Switch to kernel gsbase and inform context
-	 * tracking that we're in kernel mode.
-	 */
-	SWAPGS
-
-	/*
+	 * IRQ from user mode.
+	 *
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
 	 * (which can take locks).  Since TRACE_IRQS_OFF is idempotent,
@@ -630,10 +708,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	ud2
 1:
 #endif
-	SWAPGS
 	POP_EXTRA_REGS
-	POP_C_REGS
-	addq	$8, %rsp	/* skip regs->orig_ax */
+	popq	%r11
+	popq	%r10
+	popq	%r9
+	popq	%r8
+	popq	%rax
+	popq	%rcx
+	popq	%rdx
+	popq	%rsi
+
+	/*
+	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+	/* Copy the IRET frame to the trampoline stack. */
+	pushq	6*8(%rdi)	/* SS */
+	pushq	5*8(%rdi)	/* RSP */
+	pushq	4*8(%rdi)	/* EFLAGS */
+	pushq	3*8(%rdi)	/* CS */
+	pushq	2*8(%rdi)	/* RIP */
+
+	/* Push user RDI on the trampoline stack. */
+	pushq	(%rdi)
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+
+	/* Restore RDI. */
+	popq	%rdi
+	SWAPGS
 	INTERRUPT_RETURN
 
 
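The exit path above copies only the five-word hardware IRET frame plus the stashed RDI onto the trampoline stack, so the final SWAPGS/IRET never runs on a per-task stack whose address could leak. A C model of that copy; the frame indices mirror the 6*8(%rdi)..(%rdi) operands, everything else is illustrative:

#include <stdio.h>

/* Word layout of the old stack at %rdi, matching the pushq operands:
 * [0]=RDI [1]=orig_ax [2]=RIP [3]=CS [4]=EFLAGS [5]=RSP [6]=SS. */
enum { F_RDI, F_ORIG_AX, F_RIP, F_CS, F_EFLAGS, F_RSP, F_SS, F_WORDS };

static unsigned long *copy_iret_frame(unsigned long *old_rsp,
				      unsigned long *trampoline_top)
{
	unsigned long *sp = trampoline_top;

	*--sp = old_rsp[F_SS];		/* pushq 6*8(%rdi) */
	*--sp = old_rsp[F_RSP];		/* pushq 5*8(%rdi) */
	*--sp = old_rsp[F_EFLAGS];	/* pushq 4*8(%rdi) */
	*--sp = old_rsp[F_CS];		/* pushq 3*8(%rdi) */
	*--sp = old_rsp[F_RIP];		/* pushq 2*8(%rdi) */
	*--sp = old_rsp[F_RDI];		/* pushq (%rdi)    */
	return sp;			/* new %rsp; popq %rdi comes next */
}

int main(void)
{
	unsigned long old[F_WORDS] = { 7, 0, 0x400000, 0x33, 0x202, 0x7ffff000, 0x2b };
	unsigned long tramp[8];
	unsigned long *sp = copy_iret_frame(old, tramp + 8);

	/* orig_ax is deliberately not copied on the exit path. */
	printf("new frame: rdi=%lu rip=%#lx ss=%#lx\n", sp[0], sp[1], sp[5]);
	return 0;
}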
@@ -829,7 +938,33 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 /*
  * Exception entry points.
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack.  This is called with the IRET frame and
+ * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+	UNWIND_HINT_FUNC
+
+	pushq	%rdi
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+	pushq	7*8(%rdi)		/* regs->ss */
+	pushq	6*8(%rdi)		/* regs->rsp */
+	pushq	5*8(%rdi)		/* regs->eflags */
+	pushq	4*8(%rdi)		/* regs->cs */
+	pushq	3*8(%rdi)		/* regs->ip */
+	pushq	2*8(%rdi)		/* regs->orig_ax */
+	pushq	8(%rdi)			/* return address */
+	UNWIND_HINT_FUNC
+
+	movq	(%rdi), %rdi
+	ret
+END(switch_to_thread_stack)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
@@ -848,11 +983,12 @@ ENTRY(\sym)
 
 	ALLOC_PT_GPREGS_ON_STACK
 
-	.if \paranoid
-	.if \paranoid == 1
+	.if \paranoid < 2
 	testb	$3, CS(%rsp)		/* If coming from userspace, switch stacks */
-	jnz	1f
+	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
+
+	.if \paranoid
 	call	paranoid_entry
 	.else
 	call	error_entry
@@ -894,20 +1030,15 @@ ENTRY(\sym)
 	jmp	error_exit
 	.endif
 
-	.if \paranoid == 1
+	.if \paranoid < 2
 	/*
-	 * Paranoid entry from userspace.  Switch stacks and treat it
+	 * Entry from userspace.  Switch stacks and treat it
 	 * as a normal entry.  This means that paranoid handlers
 	 * run in real process context if user_mode(regs).
 	 */
-1:
+.Lfrom_usermode_switch_stack_\@:
 	call	error_entry
 
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-	call	sync_regs
-	movq	%rax, %rsp			/* switch stack */
-
 	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
@@ -1170,6 +1301,14 @@ ENTRY(error_entry)
 	SWAPGS
 
 .Lerror_entry_from_usermode_after_swapgs:
+	/* Put us onto the real thread stack. */
+	popq	%r12				/* save return addr in %r12 */
+	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
+	call	sync_regs
+	movq	%rax, %rsp			/* switch stack */
+	ENCODE_FRAME_POINTER
+	pushq	%r12
+
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 568e130d932c..95ad40eb7eff 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,7 +48,7 @@
  */
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
-	SWAPGS_UNSAFE_STACK
+	SWAPGS
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
 	 */
 	movl	%eax, %eax
 
-	/* Construct struct pt_regs on stack (iret frame is already on stack) */
 	pushq	%rax			/* pt_regs->orig_ax */
+
+	/* switch to thread stack expects orig_ax to be pushed */
+	call	switch_to_thread_stack
+
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rdx			/* pt_regs->dx */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bf6a76202a77..ea9a7dde62e5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4011cb03ef08..aab4fe9f49f8 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
 	return this_cpu_ptr(&gdt_page)->gdt;
 }
 
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
-	return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
 /* Provide the fixmap address of the remapped GDT */
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 {
-	unsigned int idx = get_cpu_gdt_ro_index(cpu);
-	return (struct desc_struct *)__fix_to_virt(idx);
+	return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
 }
 
 /* Provide the current read-only GDT */
@@ -185,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 #endif
 }
 
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
 {
 	struct desc_struct *d = get_cpu_gdt_rw(cpu);
 	tss_desc tss;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b0c505fe9a95..94fc4fa14127 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
 			 PAGE_SIZE)
 #endif
 
+/*
+ * cpu_entry_area is a percpu region in the fixmap that contains things
+ * needed by the CPU and early entry/exit code.  Real types aren't used
+ * for all fields here to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+	char gdt[PAGE_SIZE];
+
+	/*
+	 * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+	 * a read-only guard page.
+	 */
+	struct SYSENTER_stack_page SYSENTER_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
+	 */
+	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Exception stacks used for IST entries.
+	 *
+	 * In the future, this should have a separate slot for each stack
+	 * with guard pages between them.
+	 */
+	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+
+extern void setup_cpu_entry_areas(void);
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -101,8 +140,8 @@ enum fixed_addresses {
 	FIX_LNW_VRTC,
 #endif
 	/* Fixmap entries to remap the GDTs, one per processor. */
-	FIX_GDT_REMAP_BEGIN,
-	FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+	FIX_CPU_ENTRY_AREA_TOP,
+	FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
 
 #ifdef CONFIG_ACPI_APEI_GHES
 	/* Used for GHES mapping from assorted contexts */
@@ -191,5 +230,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
 void __early_set_fixmap(enum fixed_addresses idx,
 			phys_addr_t phys, pgprot_t flags);
 
+static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
+{
+	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
+}
+
+#define __get_cpu_entry_area_offset_index(cpu, offset) ({		\
+	BUILD_BUG_ON(offset % PAGE_SIZE != 0);				\
+	__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);	\
+	})
+
+#define get_cpu_entry_area_index(cpu, field)				\
+	__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
+}
+
+static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+{
+	return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
+}
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
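Since fixmap indices count down in virtual address space, CPU 0's cpu_entry_area occupies the highest indices and each subsequent CPU sits CPU_ENTRY_AREA_PAGES lower; __get_cpu_entry_area_page_index() is plain arithmetic over that layout. A worked example with invented constants (the real values come from the fixmap layout):

#include <stdio.h>

/* Illustrative stand-ins, not kernel values. */
#define CPU_ENTRY_AREA_PAGES		8
#define FIX_CPU_ENTRY_AREA_BOTTOM	1000

static unsigned int page_index(int cpu, int page)
{
	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu * CPU_ENTRY_AREA_PAGES - page;
}

int main(void)
{
	/* CPU 0 starts at index 1000, CPU 1 at 992, CPU 2 at 984, ... */
	printf("cpu0/page0 -> %u\n", page_index(0, 0));	/* 1000 */
	printf("cpu2/page3 -> %u\n", page_index(2, 3));	/*  981 */
	return 0;
}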
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 1b0a5abcd8ae..96aa6b9884dc 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,16 +20,7 @@
 #ifndef _ASM_X86_HYPERVISOR_H
 #define _ASM_X86_HYPERVISOR_H
 
-#ifdef CONFIG_HYPERVISOR_GUEST
-
-#include <asm/kvm_para.h>
-#include <asm/x86_init.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * x86 hypervisor information
- */
-
+/* x86 hypervisor types */
 enum x86_hypervisor_type {
 	X86_HYPER_NATIVE = 0,
 	X86_HYPER_VMWARE,
@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
 	X86_HYPER_KVM,
 };
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
 struct hypervisor_x86 {
 	/* Hypervisor name */
 	const char *name;
@@ -58,7 +55,15 @@ struct hypervisor_x86 {
 
 extern enum x86_hypervisor_type x86_hyper_type;
 extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return x86_hyper_type == type;
+}
 #else
 static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return type == X86_HYPER_NATIVE;
+}
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* _ASM_X86_HYPERVISOR_H */
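hypervisor_is_type() gives callers a single predicate that also works on !CONFIG_HYPERVISOR_GUEST builds, where it can only ever report native. A self-contained model of the helper (the enum is trimmed and the "detected" value is assumed, not probed):

#include <stdbool.h>
#include <stdio.h>

enum x86_hypervisor_type { X86_HYPER_NATIVE = 0, X86_HYPER_VMWARE, X86_HYPER_KVM };

/* In the kernel this is set during hypervisor detection; assumed here. */
static enum x86_hypervisor_type x86_hyper_type = X86_HYPER_KVM;

static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
	return x86_hyper_type == type;
}

int main(void)
{
	if (hypervisor_is_type(X86_HYPER_KVM))
		puts("running as a KVM guest (modeled)");
	return 0;
}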
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c8ef23f2c28f..89f08955fff7 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
 	swapgs;					\
 	sysretl
 
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x)		pushfq; popq %rax
+#endif
 #else
 #define INTERRUPT_RETURN		iret
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f86a8caa561e..395c9631e000 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *, long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 283efcaac8af..892df375b615 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -927,6 +927,15 @@ extern void default_banner(void);
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
 		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers)                                        \
+	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
 #endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cc16fa882e3e..1f2434ee9f80 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -163,9 +163,9 @@ enum cpuid_regs_idx {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
-extern struct tss_struct	doublefault_tss;
-extern __u32			cpu_caps_cleared[NCAPINTS];
-extern __u32			cpu_caps_set[NCAPINTS];
+extern struct x86_hw_tss	doublefault_tss;
+extern __u32			cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32			cpu_caps_set[NCAPINTS + NBUGINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
 	write_cr3(__sme_pa(pgdir));
 }
 
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
 #ifdef CONFIG_X86_32
 /* This is the TSS defined by the hardware. */
 struct x86_hw_tss {
@@ -305,7 +310,13 @@ struct x86_hw_tss {
 struct x86_hw_tss {
 	u32			reserved1;
 	u64			sp0;
+
+	/*
+	 * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+	 * Linux does not use ring 1, so sp1 is not otherwise needed.
+	 */
 	u64			sp1;
+
 	u64			sp2;
 	u64			reserved2;
 	u64			ist[7];
@@ -323,12 +334,22 @@ struct x86_hw_tss {
 #define IO_BITMAP_BITS			65536
 #define IO_BITMAP_BYTES			(IO_BITMAP_BITS/8)
 #define IO_BITMAP_LONGS			(IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET		offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
 #define INVALID_IO_BITMAP_OFFSET	0x8000
 
+struct SYSENTER_stack {
+	unsigned long		words[64];
+};
+
+struct SYSENTER_stack_page {
+	struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
+
 struct tss_struct {
 	/*
-	 * The hardware state:
+	 * The fixed hardware portion.  This must not cross a page boundary
+	 * at risk of violating the SDM's advice and potentially triggering
+	 * errata.
 	 */
 	struct x86_hw_tss	x86_tss;
 
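The IO_BITMAP_OFFSET change matters because io_bitmap_base is interpreted relative to the address in the TSS descriptor, and this series stops assuming that x86_tss sits at offset 0 of its container. A self-contained model of the offset arithmetic; the pad_before field is deliberately nonzero to show the difference (all sizes are invented):

#include <stddef.h>
#include <stdio.h>

struct x86_hw_tss_model { unsigned int reserved1; unsigned long sp0; };

struct tss_struct_model {
	char pad_before[64];		/* stand-in: anything before x86_tss */
	struct x86_hw_tss_model x86_tss; /* what the TSS descriptor points at */
	unsigned long io_bitmap[3];
};

int main(void)
{
	/* Old definition: offset from the start of the whole struct. */
	size_t old_off = offsetof(struct tss_struct_model, io_bitmap);
	/* New definition: offset from x86_tss, the hardware's base address. */
	size_t new_off = old_off - offsetof(struct tss_struct_model, x86_tss);

	printf("old=%zu new=%zu\n", old_off, new_off);
	return 0;
}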
@@ -339,18 +360,9 @@ struct tss_struct {
 	 * be within the limit.
 	 */
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
 
-#ifdef CONFIG_X86_32
-	/*
-	 * Space for the temporary SYSENTER stack.
-	 */
-	unsigned long		SYSENTER_stack_canary;
-	unsigned long		SYSENTER_stack[64];
-#endif
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
@@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
 #endif
 
 /*
@@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
 static inline void
 native_load_sp0(unsigned long sp0)
 {
-	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
@@ -535,12 +550,12 @@ static inline void native_swapgs(void)
 
 static inline unsigned long current_top_of_stack(void)
 {
-#ifdef CONFIG_X86_64
-	return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
-	/* sp0 on x86_32 is special in and around vm86 mode. */
+	/*
+	 * We can't read directly from tss.sp0: sp0 on x86_32 is special in
+	 * and around vm86 mode and sp0 on x86_64 is special because of the
+	 * entry trampoline.
+	 */
 	return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
 }
 
 static inline bool on_thread_stack(void)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 8da111b3c342..f8062bfd43a0 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -16,6 +16,7 @@ enum stack_type {
 	STACK_TYPE_TASK,
 	STACK_TYPE_IRQ,
 	STACK_TYPE_SOFTIRQ,
+	STACK_TYPE_SYSENTER,
 	STACK_TYPE_EXCEPTION,
 	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
 };
@@ -28,6 +29,8 @@ struct stack_info {
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info);
 
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask);
 
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8c6bd6863db9..9b6df68d8fd1 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -79,10 +79,10 @@ do {									\
 static inline void refresh_sysenter_cs(struct thread_struct *thread)
 {
 	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
-	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+	if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
 		return;
 
-	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+	this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
 	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 }
 #endif
@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
 /* This is used when switching tasks or entering/exiting vm86 mode. */
 static inline void update_sp0(struct task_struct *task)
 {
+	/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
 #ifdef CONFIG_X86_32
 	load_sp0(task->thread.sp0);
 #else
-	load_sp0(task_top_of_stack(task));
+	if (static_cpu_has(X86_FEATURE_XENPV))
+		load_sp0(task_top_of_stack(task));
 #endif
 }
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 70f425947dc5..00223333821a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
 #endif
 
 #endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 1fadd310ff68..31051f35cbb7 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
 dotraplinkage void do_stack_segment(struct pt_regs *, long);
 #ifdef CONFIG_X86_64
 dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e9cc6fe1fc6f..c1688c2d0a12 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -7,6 +7,9 @@
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
 struct unwind_state {
 	struct stack_info stack_info;
 	unsigned long stack_mask;
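IRET_FRAME_OFFSET/IRET_FRAME_SIZE carve the hardware-pushed tail (ip, cs, flags, sp, ss) out of struct pt_regs: on x86_64 that is the last five 8-byte words. A quick check of the arithmetic against a pt_regs stand-in (field names match, the struct itself is a model):

#include <stddef.h>
#include <stdio.h>

/* Stand-in with the same tail layout as x86_64 struct pt_regs:
 * 15 GP registers + orig_ax, then the hardware IRET frame. */
struct pt_regs_model {
	unsigned long gpregs[15];
	unsigned long orig_ax;
	unsigned long ip, cs, flags, sp, ss;
};

#define IRET_FRAME_OFFSET (offsetof(struct pt_regs_model, ip))
#define IRET_FRAME_SIZE   (sizeof(struct pt_regs_model) - IRET_FRAME_OFFSET)

int main(void)
{
	/* offset = 16*8 = 128 bytes; size = 5*8 = 40 bytes */
	printf("offset=%zu size=%zu\n", IRET_FRAME_OFFSET, IRET_FRAME_SIZE);
	return 0;
}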
@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 }
 
 #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * WARNING: The entire pt_regs may not be safe to dereference.  In some cases,
+ * only the iret frame registers are accessible.  Use with caution!
+ */
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	if (unwind_done(state))
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 8ea78275480d..cd360a5e0dca 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -93,4 +93,10 @@ void common(void) {
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+	/* Layout info for cpu_entry_area */
+	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+	OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
 }
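OFFSET()/DEFINE() work by compiling asm-offsets.c and scraping marker strings out of the generated assembly; that is how .S code such as the trampoline can use CPU_ENTRY_AREA_entry_trampoline as a plain constant. A rough model of the mechanism (the marker encoding is simplified; the kernel's macros and post-processing differ in detail):

#include <stddef.h>

struct cpu_entry_area_model {
	char gdt[4096];
	char sysenter_stack[4096];
	char entry_trampoline[4096];
};

/* Emit "->name value" markers into the compiler's assembly output; a
 * build-time script (sed, in the kernel) turns them into #defines that
 * assembly files can include. */
#define DEFINE(sym, val) \
	asm volatile("\n.ascii \"->" #sym " %0\"" : : "i" (val))
#define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

int main(void)
{
	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area_model,
	       entry_trampoline);
	DEFINE(SIZEOF_area, sizeof(struct cpu_entry_area_model));
	return 0;
}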
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dedf428b20b6..7d20d9c0b3d6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -47,13 +47,8 @@ void foo(void)
 	BLANK();
 
 	/* Offset from the sysenter stack to tss.sp0 */
-	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-	       offsetofend(struct tss_struct, SYSENTER_stack));
-
-	/* Offset from cpu_tss to SYSENTER_stack */
-	OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
-	/* Size of SYSENTER_stack */
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+	       offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 630212fa9b9d..bf51e51d808d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -23,6 +23,9 @@ int main(void)
 #ifdef CONFIG_PARAVIRT
 	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+	OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
 	BLANK();
 #endif
 
@@ -63,6 +66,7 @@ int main(void)
 
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
 	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	BLANK();
 
 #ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fa998ca8aa5a..7416da3ec4df 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
 	return NULL;		/* Not found */
 }
 
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
 
 void load_percpu_segment(int cpu)
 {
@@ -490,27 +490,116 @@ void load_percpu_segment(int cpu)
 	load_stack_canary_segment();
 }
 
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
+	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+				   SYSENTER_stack_storage);
+
+static void __init
+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+{
+	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
+		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
-	/* On 64-bit systems, we use a read-only fixmap GDT. */
-	pgprot_t prot = PAGE_KERNEL_RO;
+	extern char _entry_trampoline[];
+
+	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+	pgprot_t gdt_prot = PAGE_KERNEL_RO;
+	pgprot_t tss_prot = PAGE_KERNEL_RO;
 #else
 	/*
 	 * On native 32-bit systems, the GDT cannot be read-only because
 	 * our double fault handler uses a task gate, and entering through
-	 * a task gate needs to change an available TSS to busy. If the GDT
-	 * is read-only, that will triple fault.
+	 * a task gate needs to change an available TSS to busy.  If the
+	 * GDT is read-only, that will triple fault.  The TSS cannot be
+	 * read-only because the CPU writes to it on task switches.
 	 *
-	 * On Xen PV, the GDT must be read-only because the hypervisor requires
-	 * it.
+	 * On Xen PV, the GDT must be read-only because the hypervisor
+	 * requires it.
 	 */
-	pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
 		PAGE_KERNEL_RO : PAGE_KERNEL;
+	pgprot_t tss_prot = PAGE_KERNEL;
 #endif
 
-	__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+				per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+				PAGE_KERNEL);
+
+	/*
+	 * The Intel SDM says (Volume 3, 7.2.1):
+	 *
+	 *  Avoid placing a page boundary in the part of the TSS that the
+	 *  processor reads during a task switch (the first 104 bytes). The
+	 *  processor may not correctly perform address translations if a
+	 *  boundary occurs in this area. During a task switch, the processor
+	 *  reads and writes into the first 104 bytes of each TSS (using
+	 *  contiguous physical addresses beginning with the physical address
+	 *  of the first byte of the TSS). So, after TSS access begins, if
+	 *  part of the 104 bytes is not physically contiguous, the processor
+	 *  will access incorrect information without generating a page-fault
+	 *  exception.
+	 *
+	 * There are also a lot of errata involving the TSS spanning a page
+	 * boundary.  Assert that we're not doing that.
+	 */
+	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+				&per_cpu(cpu_tss_rw, cpu),
+				sizeof(struct tss_struct) / PAGE_SIZE,
+				tss_prot);
+
+#ifdef CONFIG_X86_32
+	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+	BUILD_BUG_ON(sizeof(exception_stacks) !=
+		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
+				&per_cpu(exception_stacks, cpu),
+				sizeof(exception_stacks) / PAGE_SIZE,
+				PAGE_KERNEL);
+
+	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		setup_cpu_entry_area(cpu);
 }
 
 /* Load the original GDT from the per-cpu structure */
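The first BUILD_BUG_ON above uses an XOR trick: if the offsets of the first byte and of the end of x86_tss have the same page number, XOR cancels the page-number bits and the AND with PAGE_MASK yields zero. A standalone illustration (note the check is conservative when a region ends exactly on a page boundary):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Nonzero iff start and end differ in their page-number bits: XOR keeps
 * only the differing bits, PAGE_MASK discards the within-page bits. */
static unsigned long crosses_page(unsigned long start, unsigned long end)
{
	return (start ^ end) & PAGE_MASK;
}

int main(void)
{
	printf("%lu\n", crosses_page(100, 204));	/* 0: same page     */
	printf("%lu\n", crosses_page(4000, 4104));	/* nonzero: crosses */
	return 0;
}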
@@ -747,7 +836,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
 {
 	int i;
 
-	for (i = 0; i < NCAPINTS; i++) {
+	for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
 		c->x86_capability[i] &= ~cpu_caps_cleared[i];
 		c->x86_capability[i] |= cpu_caps_set[i];
 	}
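Widening this loop to NCAPINTS + NBUGINTS is what makes bug bits sticky: apply_forced_caps() re-runs after every CPUID re-probe, so a bit forced once via setup_force_cpu_bug() can no longer be washed away. A toy model of just that mechanism (the cleared-bits half and the real word/bit encoding are omitted):

#include <stdio.h>

#define NCAPINTS 19
#define NBUGINTS 1

static unsigned int cpu_caps_set[NCAPINTS + NBUGINTS];
static unsigned int x86_capability[NCAPINTS + NBUGINTS];

static void setup_force_cpu_bug(int word, unsigned int mask)
{
	cpu_caps_set[word] |= mask;
}

static void apply_forced_caps(void)
{
	/* Before the patch the loop stopped at NCAPINTS, so forced bits
	 * in the bug words (>= NCAPINTS) were lost on CPUID re-reads. */
	for (int i = 0; i < NCAPINTS + NBUGINTS; i++)
		x86_capability[i] |= cpu_caps_set[i];
}

int main(void)
{
	setup_force_cpu_bug(NCAPINTS, 0x1);	/* hypothetical bug bit */
	x86_capability[NCAPINTS] = 0;		/* simulated re-probe */
	apply_forced_caps();
	printf("bug bit survives: %u\n", x86_capability[NCAPINTS] & 0x1);
	return 0;
}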
@@ -1250,7 +1339,7 @@ void enable_sep_cpu(void)
 		return;
 
 	cpu = get_cpu();
-	tss = &per_cpu(cpu_tss, cpu);
+	tss = &per_cpu(cpu_tss_rw, cpu);
 
 	/*
 	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1259,11 +1348,7 @@ void enable_sep_cpu(void)
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
 	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
-	wrmsr(MSR_IA32_SYSENTER_ESP,
-	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
-	      0);
-
+	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
 	put_cpu();
@@ -1357,25 +1442,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
-	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
-	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
+	extern char _entry_trampoline[];
+	extern char entry_SYSCALL_64_trampoline[];
+
+	int cpu = smp_processor_id();
+	unsigned long SYSCALL64_entry_trampoline =
+		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+		(entry_SYSCALL_64_trampoline - _entry_trampoline);
+
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1386,7 +1465,7 @@ void syscall_init(void)
 	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
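The MSR_LSTAR computation above is ordinary symbol relocation: the entry point's offset within the trampoline section survives the remap, so the aliased address is alias_base + (sym - section_start). A pointer-arithmetic sketch with invented offsets:

#include <stdio.h>

int main(void)
{
	/* Stand-ins for the linker symbols and the per-CPU alias. */
	char section[4096];			/* _entry_trampoline..+4K */
	char *_entry_trampoline = section;
	char *entry_SYSCALL_64_trampoline = section + 0x40;	/* made-up */
	char alias[4096];			/* cpu_entry_area mapping */

	char *msr_target = alias +
		(entry_SYSCALL_64_trampoline - _entry_trampoline);

	printf("offset preserved: %d\n",
	       (msr_target - alias) ==
	       (entry_SYSCALL_64_trampoline - _entry_trampoline));
	return 0;
}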
@@ -1530,7 +1609,7 @@ void cpu_init(void)
 	if (cpu)
 		load_ucode_ap();
 
-	t = &per_cpu(cpu_tss, cpu);
+	t = &per_cpu(cpu_tss_rw, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1569,7 +1648,7 @@ void cpu_init(void)
 	 * set up and load the per-CPU TSS
 	 */
 	if (!oist->ist[0]) {
-		char *estacks = per_cpu(exception_stacks, cpu);
+		char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
 
 		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
 			estacks += exception_stack_sizes[v];
@@ -1580,7 +1659,7 @@ void cpu_init(void)
 		}
 	}
 
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
 	/*
 	 * <= is required because the CPU will access up to
@@ -1596,11 +1675,12 @@ void cpu_init(void)
 	enter_lazy_tlb(&init_mm, me);
 
 	/*
-	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
-	 * task never enters user mode.
+	 * Initialize the TSS.  sp0 points to the entry trampoline stack
+	 * regardless of what task is running.
 	 */
-	set_tss_desc(cpu, t);
+	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
+	load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
 
 	load_mm_ldt(&init_mm);
 
@@ -1612,7 +1692,6 @@ void cpu_init(void)
 	if (is_uv_system())
 		uv_cpu_init();
 
-	setup_fixmap_gdt(cpu);
 	load_fixmap_gdt(cpu);
 }
 
@@ -1622,7 +1701,7 @@ void cpu_init(void) | |||
1622 | { | 1701 | { |
1623 | int cpu = smp_processor_id(); | 1702 | int cpu = smp_processor_id(); |
1624 | struct task_struct *curr = current; | 1703 | struct task_struct *curr = current; |
1625 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); | 1704 | struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); |
1626 | 1705 | ||
1627 | wait_for_master_cpu(cpu); | 1706 | wait_for_master_cpu(cpu); |
1628 | 1707 | ||
@@ -1657,12 +1736,12 @@ void cpu_init(void) | |||
1657 | * Initialize the TSS. Don't bother initializing sp0, as the initial | 1736 | * Initialize the TSS. Don't bother initializing sp0, as the initial |
1658 | * task never enters user mode. | 1737 | * task never enters user mode. |
1659 | */ | 1738 | */ |
1660 | set_tss_desc(cpu, t); | 1739 | set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); |
1661 | load_TR_desc(); | 1740 | load_TR_desc(); |
1662 | 1741 | ||
1663 | load_mm_ldt(&init_mm); | 1742 | load_mm_ldt(&init_mm); |
1664 | 1743 | ||
1665 | t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | 1744 | t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; |
1666 | 1745 | ||
1667 | #ifdef CONFIG_DOUBLEFAULT | 1746 | #ifdef CONFIG_DOUBLEFAULT |
1668 | /* Set up doublefault TSS pointer in the GDT */ | 1747 | /* Set up doublefault TSS pointer in the GDT */ |
@@ -1674,7 +1753,6 @@ void cpu_init(void) | |||
1674 | 1753 | ||
1675 | fpu__init_cpu(); | 1754 | fpu__init_cpu(); |
1676 | 1755 | ||
1677 | setup_fixmap_gdt(cpu); | ||
1678 | load_fixmap_gdt(cpu); | 1756 | load_fixmap_gdt(cpu); |
1679 | } | 1757 | } |
1680 | #endif | 1758 | #endif |
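Both flavours of cpu_init() now set io_bitmap_base to the constant IO_BITMAP_OFFSET rather than offsetof(struct tss_struct, io_bitmap). The hardware interprets that field as an offset from the base of the hardware TSS, and with the SYSENTER stack moved to the front of struct tss_struct the hardware TSS no longer sits at offset 0 of the container, so measuring from the container would be wrong. A toy sketch of the distinction (the layout and field names below are hypothetical):

#include <stddef.h>
#include <stdio.h>

struct hw_tss_demo {			/* stand-in for x86_hw_tss */
	unsigned long sp0;
	unsigned short io_bitmap_base;
};

struct tss_demo {			/* stand-in for tss_struct */
	unsigned long entry_stack[8];	/* e.g. the SYSENTER stack, now first */
	struct hw_tss_demo x86_tss;
	unsigned long io_bitmap[4];
};

/* Offset of the bitmap from the hw TSS base, not from the container. */
#define DEMO_IO_BITMAP_OFFSET \
	(offsetof(struct tss_demo, io_bitmap) - offsetof(struct tss_demo, x86_tss))

int main(void)
{
	printf("io bitmap offset from hw TSS: %zu\n",
	       (size_t)DEMO_IO_BITMAP_OFFSET);
	return 0;
}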
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0e662c55ae90..0b8cedb20d6d 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c | |||
@@ -50,25 +50,23 @@ static void doublefault_fn(void) | |||
50 | cpu_relax(); | 50 | cpu_relax(); |
51 | } | 51 | } |
52 | 52 | ||
53 | struct tss_struct doublefault_tss __cacheline_aligned = { | 53 | struct x86_hw_tss doublefault_tss __cacheline_aligned = { |
54 | .x86_tss = { | 54 | .sp0 = STACK_START, |
55 | .sp0 = STACK_START, | 55 | .ss0 = __KERNEL_DS, |
56 | .ss0 = __KERNEL_DS, | 56 | .ldt = 0, |
57 | .ldt = 0, | 57 | .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, |
58 | .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, | 58 | |
59 | 59 | .ip = (unsigned long) doublefault_fn, | |
60 | .ip = (unsigned long) doublefault_fn, | 60 | /* 0x2 bit is always set */ |
61 | /* 0x2 bit is always set */ | 61 | .flags = X86_EFLAGS_SF | 0x2, |
62 | .flags = X86_EFLAGS_SF | 0x2, | 62 | .sp = STACK_START, |
63 | .sp = STACK_START, | 63 | .es = __USER_DS, |
64 | .es = __USER_DS, | 64 | .cs = __KERNEL_CS, |
65 | .cs = __KERNEL_CS, | 65 | .ss = __KERNEL_DS, |
66 | .ss = __KERNEL_DS, | 66 | .ds = __USER_DS, |
67 | .ds = __USER_DS, | 67 | .fs = __KERNEL_PERCPU, |
68 | .fs = __KERNEL_PERCPU, | 68 | |
69 | 69 | .__cr3 = __pa_nodebug(swapper_pg_dir), | |
70 | .__cr3 = __pa_nodebug(swapper_pg_dir), | ||
71 | } | ||
72 | }; | 70 | }; |
73 | 71 | ||
74 | /* dummy for do_double_fault() call */ | 72 | /* dummy for do_double_fault() call */ |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f13b4c00a5de..bbd6d986e2d0 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, | |||
43 | return true; | 43 | return true; |
44 | } | 44 | } |
45 | 45 | ||
46 | bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) | ||
47 | { | ||
48 | struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); | ||
49 | |||
50 | void *begin = ss; | ||
51 | void *end = ss + 1; | ||
52 | |||
53 | if ((void *)stack < begin || (void *)stack >= end) | ||
54 | return false; | ||
55 | |||
56 | info->type = STACK_TYPE_SYSENTER; | ||
57 | info->begin = begin; | ||
58 | info->end = end; | ||
59 | info->next_sp = NULL; | ||
60 | |||
61 | return true; | ||
62 | } | ||
63 | |||
46 | static void printk_stack_address(unsigned long address, int reliable, | 64 | static void printk_stack_address(unsigned long address, int reliable, |
47 | char *log_lvl) | 65 | char *log_lvl) |
48 | { | 66 | { |
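The new in_sysenter_stack() is a textbook half-open range test: begin is the base of the per-cpu SYSENTER stack, end is one past it, and membership is begin <= p < end, so a boundary address can belong to at most one region. A compilable sketch of the same test (the helper names are made up):

#include <stdbool.h>

struct stack_region_demo {
	void *begin;			/* inclusive base */
	void *end;			/* exclusive top */
};

static bool in_region_demo(const struct stack_region_demo *r, const void *p)
{
	/* Half-open: the exclusive top is never a member. */
	return (const char *)p >= (const char *)r->begin &&
	       (const char *)p <  (const char *)r->end;
}

int main(void)
{
	unsigned long buf[16];
	struct stack_region_demo r = { buf, buf + 16 };

	return (in_region_demo(&r, &buf[15]) &&
		!in_region_demo(&r, buf + 16)) ? 0 : 1;
}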
@@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable, | |||
50 | printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); | 68 | printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); |
51 | } | 69 | } |
52 | 70 | ||
71 | void show_iret_regs(struct pt_regs *regs) | ||
72 | { | ||
73 | printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); | ||
74 | printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, | ||
75 | regs->sp, regs->flags); | ||
76 | } | ||
77 | |||
78 | static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) | ||
79 | { | ||
80 | if (on_stack(info, regs, sizeof(*regs))) | ||
81 | __show_regs(regs, 0); | ||
82 | else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, | ||
83 | IRET_FRAME_SIZE)) { | ||
84 | /* | ||
85 | * When an interrupt or exception occurs in entry code, the | ||
86 | * full pt_regs might not have been saved yet. In that case | ||
87 | * just print the iret frame. | ||
88 | */ | ||
89 | show_iret_regs(regs); | ||
90 | } | ||
91 | } | ||
92 | |||
53 | void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | 93 | void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, |
54 | unsigned long *stack, char *log_lvl) | 94 | unsigned long *stack, char *log_lvl) |
55 | { | 95 | { |
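show_regs_safe() distinguishes a full pt_regs from the bare hardware IRET frame: when an interrupt hits entry code, the CPU has pushed only ip, cs, flags, sp and ss, and nothing else has been saved yet. The IRET_FRAME_OFFSET/IRET_FRAME_SIZE constants it relies on (added elsewhere in this series) plausibly reduce to offsetof arithmetic like the sketch below; the demo struct is abbreviated, the real pt_regs carries all the general-purpose registers in front:

#include <stddef.h>
#include <stdio.h>

struct ptregs_demo {
	unsigned long bx, bp, ax, orig_ax;	/* ...abbreviated... */
	unsigned long ip, cs, flags, sp, ss;	/* hardware IRET frame, last */
};

#define DEMO_IRET_FRAME_OFFSET	offsetof(struct ptregs_demo, ip)
#define DEMO_IRET_FRAME_SIZE	(sizeof(struct ptregs_demo) - DEMO_IRET_FRAME_OFFSET)

int main(void)
{
	printf("iret frame: offset %zu, size %zu (five words)\n",
	       (size_t)DEMO_IRET_FRAME_OFFSET, (size_t)DEMO_IRET_FRAME_SIZE);
	return 0;
}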
@@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
71 | * - task stack | 111 | * - task stack |
72 | * - interrupt stack | 112 | * - interrupt stack |
73 | * - HW exception stacks (double fault, nmi, debug, mce) | 113 | * - HW exception stacks (double fault, nmi, debug, mce) |
114 | * - SYSENTER stack | ||
74 | * | 115 | * |
75 | * x86-32 can have up to three stacks: | 116 | * x86-32 can have up to four stacks: |
76 | * - task stack | 117 | * - task stack |
77 | * - softirq stack | 118 | * - softirq stack |
78 | * - hardirq stack | 119 | * - hardirq stack |
120 | * - SYSENTER stack | ||
79 | */ | 121 | */ |
80 | for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { | 122 | for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { |
81 | const char *stack_name; | 123 | const char *stack_name; |
82 | 124 | ||
83 | /* | 125 | if (get_stack_info(stack, task, &stack_info, &visit_mask)) { |
84 | * If we overflowed the task stack into a guard page, jump back | 126 | /* |
85 | * to the bottom of the usable stack. | 127 | * We weren't on a valid stack. It's possible that |
86 | */ | 128 | * we overflowed a valid stack into a guard page. |
87 | if (task_stack_page(task) - (void *)stack < PAGE_SIZE) | 129 | * See if the next page up is valid so that we can |
88 | stack = task_stack_page(task); | 130 | * generate some kind of backtrace if this happens. |
89 | 131 | */ | |
90 | if (get_stack_info(stack, task, &stack_info, &visit_mask)) | 132 | stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); |
91 | break; | 133 | if (get_stack_info(stack, task, &stack_info, &visit_mask)) |
134 | break; | ||
135 | } | ||
92 | 136 | ||
93 | stack_name = stack_type_name(stack_info.type); | 137 | stack_name = stack_type_name(stack_info.type); |
94 | if (stack_name) | 138 | if (stack_name) |
95 | printk("%s <%s>\n", log_lvl, stack_name); | 139 | printk("%s <%s>\n", log_lvl, stack_name); |
96 | 140 | ||
97 | if (regs && on_stack(&stack_info, regs, sizeof(*regs))) | 141 | if (regs) |
98 | __show_regs(regs, 0); | 142 | show_regs_safe(&stack_info, regs); |
99 | 143 | ||
100 | /* | 144 | /* |
101 | * Scan the stack, printing any text addresses we find. At the | 145 | * Scan the stack, printing any text addresses we find. At the |
@@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
119 | 163 | ||
120 | /* | 164 | /* |
121 | * Don't print regs->ip again if it was already printed | 165 | * Don't print regs->ip again if it was already printed |
122 | * by __show_regs() below. | 166 | * by show_regs_safe() below. |
123 | */ | 167 | */ |
124 | if (regs && stack == ®s->ip) | 168 | if (regs && stack == ®s->ip) |
125 | goto next; | 169 | goto next; |
@@ -155,8 +199,8 @@ next: | |||
155 | 199 | ||
156 | /* if the frame has entry regs, print them */ | 200 | /* if the frame has entry regs, print them */ |
157 | regs = unwind_get_entry_regs(&state); | 201 | regs = unwind_get_entry_regs(&state); |
158 | if (regs && on_stack(&stack_info, regs, sizeof(*regs))) | 202 | if (regs) |
159 | __show_regs(regs, 0); | 203 | show_regs_safe(&stack_info, regs); |
160 | } | 204 | } |
161 | 205 | ||
162 | if (stack_name) | 206 | if (stack_name) |
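The overflow fallback above rounds the failing stack pointer up to the next page boundary: if it landed in a guard page, the page above it is the bottom of the stack that overflowed, and a partial backtrace beats none. PAGE_ALIGN is the usual round-up-and-mask; a sketch:

#include <stdio.h>

#define DEMO_PAGE_SIZE		4096UL
#define DEMO_PAGE_ALIGN(x)	(((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

int main(void)
{
	unsigned long in_guard = 0x707bUL;	/* hypothetical address */

	/* 0x707b rounds up to 0x8000, the next page boundary. */
	printf("%#lx -> %#lx\n", in_guard, DEMO_PAGE_ALIGN(in_guard));
	return 0;
}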
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index daefae83a3aa..5ff13a6b3680 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type) | |||
26 | if (type == STACK_TYPE_SOFTIRQ) | 26 | if (type == STACK_TYPE_SOFTIRQ) |
27 | return "SOFTIRQ"; | 27 | return "SOFTIRQ"; |
28 | 28 | ||
29 | if (type == STACK_TYPE_SYSENTER) | ||
30 | return "SYSENTER"; | ||
31 | |||
29 | return NULL; | 32 | return NULL; |
30 | } | 33 | } |
31 | 34 | ||
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, | |||
93 | if (task != current) | 96 | if (task != current) |
94 | goto unknown; | 97 | goto unknown; |
95 | 98 | ||
99 | if (in_sysenter_stack(stack, info)) | ||
100 | goto recursion_check; | ||
101 | |||
96 | if (in_hardirq_stack(stack, info)) | 102 | if (in_hardirq_stack(stack, info)) |
97 | goto recursion_check; | 103 | goto recursion_check; |
98 | 104 | ||
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 88ce2ffdb110..abc828f8c297 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type) | |||
37 | if (type == STACK_TYPE_IRQ) | 37 | if (type == STACK_TYPE_IRQ) |
38 | return "IRQ"; | 38 | return "IRQ"; |
39 | 39 | ||
40 | if (type == STACK_TYPE_SYSENTER) | ||
41 | return "SYSENTER"; | ||
42 | |||
40 | if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) | 43 | if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) |
41 | return exception_stack_names[type - STACK_TYPE_EXCEPTION]; | 44 | return exception_stack_names[type - STACK_TYPE_EXCEPTION]; |
42 | 45 | ||
@@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, | |||
115 | if (in_irq_stack(stack, info)) | 118 | if (in_irq_stack(stack, info)) |
116 | goto recursion_check; | 119 | goto recursion_check; |
117 | 120 | ||
121 | if (in_sysenter_stack(stack, info)) | ||
122 | goto recursion_check; | ||
123 | |||
118 | goto unknown; | 124 | goto unknown; |
119 | 125 | ||
120 | recursion_check: | 126 | recursion_check: |
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c4..2f723301eb58 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c | |||
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |||
67 | * because the ->io_bitmap_max value must match the bitmap | 67 | * because the ->io_bitmap_max value must match the bitmap |
68 | * contents: | 68 | * contents: |
69 | */ | 69 | */ |
70 | tss = &per_cpu(cpu_tss, get_cpu()); | 70 | tss = &per_cpu(cpu_tss_rw, get_cpu()); |
71 | 71 | ||
72 | if (turn_on) | 72 | if (turn_on) |
73 | bitmap_clear(t->io_bitmap_ptr, from, num); | 73 | bitmap_clear(t->io_bitmap_ptr, from, num); |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 49cfd9fe7589..68e1867cca80 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) | |||
219 | /* high bit used in ret_from_ code */ | 219 | /* high bit used in ret_from_ code */ |
220 | unsigned vector = ~regs->orig_ax; | 220 | unsigned vector = ~regs->orig_ax; |
221 | 221 | ||
222 | /* | ||
223 | * NB: Unlike exception entries, IRQ entries do not reliably | ||
224 | * handle context tracking in the low-level entry code. This is | ||
225 | * because syscall entries execute briefly with IRQs on before | ||
226 | * updating context tracking state, so we can take an IRQ from | ||
227 | * kernel mode with CONTEXT_USER. The low-level entry code only | ||
228 | * updates the context if we came from user mode, so we won't | ||
229 | * switch to CONTEXT_KERNEL. We'll fix that once the syscall | ||
230 | * code is cleaned up enough that we can cleanly defer enabling | ||
231 | * IRQs. | ||
232 | */ | ||
233 | |||
234 | entering_irq(); | 222 | entering_irq(); |
235 | 223 | ||
236 | /* entering_irq() tells RCU that we're not quiescent. Check it. */ | 224 | /* entering_irq() tells RCU that we're not quiescent. Check it. */ |
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 020efbf5786b..d86e344f5b3d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c | |||
@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) | |||
57 | if (regs->sp >= estack_top && regs->sp <= estack_bottom) | 57 | if (regs->sp >= estack_top && regs->sp <= estack_bottom) |
58 | return; | 58 | return; |
59 | 59 | ||
60 | WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", | 60 | WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", |
61 | current->comm, curbase, regs->sp, | 61 | current->comm, curbase, regs->sp, |
62 | irq_stack_top, irq_stack_bottom, | 62 | irq_stack_top, irq_stack_bottom, |
63 | estack_top, estack_bottom); | 63 | estack_top, estack_bottom, (void *)regs->ip); |
64 | 64 | ||
65 | if (sysctl_panic_on_stackoverflow) | 65 | if (sysctl_panic_on_stackoverflow) |
66 | panic("low stack detected by irq handler - check messages\n"); | 66 | panic("low stack detected by irq handler - check messages\n"); |
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index ac0be8283325..9edadabf04f6 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c | |||
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); | |||
10 | DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); | 10 | DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); |
11 | DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); | 11 | DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); |
12 | DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); | 12 | DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); |
13 | DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); | ||
14 | DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); | 13 | DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); |
15 | 14 | ||
16 | DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); | 15 | DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); |
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | |||
60 | PATCH_SITE(pv_mmu_ops, read_cr2); | 59 | PATCH_SITE(pv_mmu_ops, read_cr2); |
61 | PATCH_SITE(pv_mmu_ops, read_cr3); | 60 | PATCH_SITE(pv_mmu_ops, read_cr3); |
62 | PATCH_SITE(pv_mmu_ops, write_cr3); | 61 | PATCH_SITE(pv_mmu_ops, write_cr3); |
63 | PATCH_SITE(pv_mmu_ops, flush_tlb_single); | ||
64 | PATCH_SITE(pv_cpu_ops, wbinvd); | 62 | PATCH_SITE(pv_cpu_ops, wbinvd); |
65 | #if defined(CONFIG_PARAVIRT_SPINLOCKS) | 63 | #if defined(CONFIG_PARAVIRT_SPINLOCKS) |
66 | case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): | 64 | case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bb988a24db92..aed9d94bd46f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -47,7 +47,7 @@ | |||
47 | * section. Since TSS's are completely CPU-local, we want them | 47 | * section. Since TSS's are completely CPU-local, we want them |
48 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | 48 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
49 | */ | 49 | */ |
50 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { | 50 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { |
51 | .x86_tss = { | 51 | .x86_tss = { |
52 | /* | 52 | /* |
53 | * .sp0 is only used when entering ring 0 from a lower | 53 | * .sp0 is only used when entering ring 0 from a lower |
@@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { | |||
56 | * Poison it. | 56 | * Poison it. |
57 | */ | 57 | */ |
58 | .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, | 58 | .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, |
59 | |||
60 | #ifdef CONFIG_X86_64 | ||
61 | /* | ||
62 | * .sp1 is cpu_current_top_of_stack. The init task never | ||
63 | * runs user code, but cpu_current_top_of_stack should still | ||
64 | * be well defined before the first context switch. | ||
65 | */ | ||
66 | .sp1 = TOP_OF_INIT_STACK, | ||
67 | #endif | ||
68 | |||
59 | #ifdef CONFIG_X86_32 | 69 | #ifdef CONFIG_X86_32 |
60 | .ss0 = __KERNEL_DS, | 70 | .ss0 = __KERNEL_DS, |
61 | .ss1 = __KERNEL_CS, | 71 | .ss1 = __KERNEL_CS, |
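The new .sp1 initializer matters because, as the comment says, sp1 doubles as cpu_current_top_of_stack on x86-64: the slot is unused by hardware in long mode, so the series aliases the software top-of-stack cache onto it, and it must hold a sane value before the first context switch. The aliasing idea in miniature (names and values hypothetical):

#include <stdio.h>

struct hw_tss_demo {
	unsigned long sp0;	/* used by hardware on ring transitions */
	unsigned long sp1;	/* unused by hardware in long mode */
};

/* Software borrows the dead slot as its top-of-stack cache. */
#define demo_current_top_of_stack(t)	((t)->sp1)

int main(void)
{
	struct hw_tss_demo tss = { .sp0 = 0x9000UL, .sp1 = 0x7000UL };

	printf("top of stack: %#lx\n", demo_current_top_of_stack(&tss));
	return 0;
}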
@@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { | |||
71 | */ | 81 | */ |
72 | .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, | 82 | .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, |
73 | #endif | 83 | #endif |
74 | #ifdef CONFIG_X86_32 | ||
75 | .SYSENTER_stack_canary = STACK_END_MAGIC, | ||
76 | #endif | ||
77 | }; | 84 | }; |
78 | EXPORT_PER_CPU_SYMBOL(cpu_tss); | 85 | EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); |
79 | 86 | ||
80 | DEFINE_PER_CPU(bool, __tss_limit_invalid); | 87 | DEFINE_PER_CPU(bool, __tss_limit_invalid); |
81 | EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); | 88 | EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); |
@@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk) | |||
104 | struct fpu *fpu = &t->fpu; | 111 | struct fpu *fpu = &t->fpu; |
105 | 112 | ||
106 | if (bp) { | 113 | if (bp) { |
107 | struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); | 114 | struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); |
108 | 115 | ||
109 | t->io_bitmap_ptr = NULL; | 116 | t->io_bitmap_ptr = NULL; |
110 | clear_thread_flag(TIF_IO_BITMAP); | 117 | clear_thread_flag(TIF_IO_BITMAP); |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 45bf0c5f93e1..5224c6099184 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
234 | struct fpu *prev_fpu = &prev->fpu; | 234 | struct fpu *prev_fpu = &prev->fpu; |
235 | struct fpu *next_fpu = &next->fpu; | 235 | struct fpu *next_fpu = &next->fpu; |
236 | int cpu = smp_processor_id(); | 236 | int cpu = smp_processor_id(); |
237 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); | 237 | struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); |
238 | 238 | ||
239 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 239 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
240 | 240 | ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eeeb34f85c25..c75466232016 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) | |||
69 | unsigned int fsindex, gsindex; | 69 | unsigned int fsindex, gsindex; |
70 | unsigned int ds, cs, es; | 70 | unsigned int ds, cs, es; |
71 | 71 | ||
72 | printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); | 72 | show_iret_regs(regs); |
73 | printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, | 73 | |
74 | regs->sp, regs->flags); | ||
75 | if (regs->orig_ax != -1) | 74 | if (regs->orig_ax != -1) |
76 | pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); | 75 | pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); |
77 | else | 76 | else |
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) | |||
88 | printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", | 87 | printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", |
89 | regs->r13, regs->r14, regs->r15); | 88 | regs->r13, regs->r14, regs->r15); |
90 | 89 | ||
90 | if (!all) | ||
91 | return; | ||
92 | |||
91 | asm("movl %%ds,%0" : "=r" (ds)); | 93 | asm("movl %%ds,%0" : "=r" (ds)); |
92 | asm("movl %%cs,%0" : "=r" (cs)); | 94 | asm("movl %%cs,%0" : "=r" (cs)); |
93 | asm("movl %%es,%0" : "=r" (es)); | 95 | asm("movl %%es,%0" : "=r" (es)); |
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) | |||
98 | rdmsrl(MSR_GS_BASE, gs); | 100 | rdmsrl(MSR_GS_BASE, gs); |
99 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | 101 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); |
100 | 102 | ||
101 | if (!all) | ||
102 | return; | ||
103 | |||
104 | cr0 = read_cr0(); | 103 | cr0 = read_cr0(); |
105 | cr2 = read_cr2(); | 104 | cr2 = read_cr2(); |
106 | cr3 = __read_cr3(); | 105 | cr3 = __read_cr3(); |
@@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
400 | struct fpu *prev_fpu = &prev->fpu; | 399 | struct fpu *prev_fpu = &prev->fpu; |
401 | struct fpu *next_fpu = &next->fpu; | 400 | struct fpu *next_fpu = &next->fpu; |
402 | int cpu = smp_processor_id(); | 401 | int cpu = smp_processor_id(); |
403 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); | 402 | struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); |
404 | 403 | ||
405 | WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && | 404 | WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && |
406 | this_cpu_read(irq_count) != -1); | 405 | this_cpu_read(irq_count) != -1); |
@@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
462 | * Switch the PDA and FPU contexts. | 461 | * Switch the PDA and FPU contexts. |
463 | */ | 462 | */ |
464 | this_cpu_write(current_task, next_p); | 463 | this_cpu_write(current_task, next_p); |
464 | this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); | ||
465 | 465 | ||
466 | /* Reload sp0. */ | 466 | /* Reload sp0. */ |
467 | update_sp0(next_p); | 467 | update_sp0(next_p); |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 989514c94a55..e98f8b66a460 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
348 | 348 | ||
349 | /* | 349 | /* |
350 | * If IRET takes a non-IST fault on the espfix64 stack, then we | 350 | * If IRET takes a non-IST fault on the espfix64 stack, then we |
351 | * end up promoting it to a doublefault. In that case, modify | 351 | * end up promoting it to a doublefault. In that case, take |
352 | * the stack to make it look like we just entered the #GP | 352 | * advantage of the fact that we're not using the normal (TSS.sp0) |
353 | * handler from user space, similar to bad_iret. | 353 | * stack right now. We can write a fake #GP(0) frame at TSS.sp0 |
354 | * and then modify our own IRET frame so that, when we return, | ||
355 | * we land directly at the #GP(0) vector with the stack already | ||
356 | * set up according to its expectations. | ||
357 | * | ||
358 | * The net result is that our #GP handler will think that we | ||
359 | * entered from usermode with the bad user context. | ||
354 | * | 360 | * |
355 | * No need for ist_enter here because we don't use RCU. | 361 | * No need for ist_enter here because we don't use RCU. |
356 | */ | 362 | */ |
@@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
358 | regs->cs == __KERNEL_CS && | 364 | regs->cs == __KERNEL_CS && |
359 | regs->ip == (unsigned long)native_irq_return_iret) | 365 | regs->ip == (unsigned long)native_irq_return_iret) |
360 | { | 366 | { |
361 | struct pt_regs *normal_regs = task_pt_regs(current); | 367 | struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; |
362 | 368 | ||
363 | /* Fake a #GP(0) from userspace. */ | 369 | /* |
364 | memmove(&normal_regs->ip, (void *)regs->sp, 5*8); | 370 | * regs->sp points to the failing IRET frame on the |
365 | normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ | 371 | * ESPFIX64 stack. Copy it to the entry stack. This fills |
372 | * in gpregs->ss through gpregs->ip. | ||
373 | * | ||
374 | */ | ||
375 | memmove(&gpregs->ip, (void *)regs->sp, 5*8); | ||
376 | gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ | ||
377 | |||
378 | /* | ||
379 | * Adjust our frame so that we return straight to the #GP | ||
380 | * vector with the expected RSP value. This is safe because | ||
381 | * we won't enable interrupts or schedule before we invoke ||
382 | * general_protection, so nothing will clobber the stack | ||
383 | * frame we just set up. | ||
384 | */ | ||
366 | regs->ip = (unsigned long)general_protection; | 385 | regs->ip = (unsigned long)general_protection; |
367 | regs->sp = (unsigned long)&normal_regs->orig_ax; | 386 | regs->sp = (unsigned long)&gpregs->orig_ax; |
368 | 387 | ||
369 | return; | 388 | return; |
370 | } | 389 | } |
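The 5*8 in the memmove is the size of the hardware IRET frame: the five 8-byte words (ip, cs, flags, sp, ss) that the failing IRET left behind on the ESPFIX64 stack. gpregs is carved out just below TSS.sp0 by (struct pt_regs *)sp0 - 1, so copying those five words fills exactly the ss-through-ip tail of a would-be pt_regs. A userspace sketch of the carve-and-copy (the structs are abbreviated stand-ins):

#include <stdio.h>
#include <string.h>

struct iret_demo {			/* what the hardware pushed */
	unsigned long ip, cs, flags, sp, ss;
};

struct ptregs_demo {			/* the fake frame being built */
	unsigned long bx, bp, ax;	/* ...abbreviated... */
	unsigned long orig_ax;
	struct iret_demo iret;		/* ip..ss live at the tail */
};

int main(void)
{
	unsigned long stack[64];
	void *sp0 = &stack[64];		/* exclusive top of the entry stack */
	struct ptregs_demo *gpregs = (struct ptregs_demo *)sp0 - 1;
	struct iret_demo failing = { 0x1234, 0x10, 0x2, 0xdead, 0x18 };

	memmove(&gpregs->iret, &failing, 5 * sizeof(unsigned long));
	gpregs->orig_ax = 0;		/* the lost #GP error code */

	printf("fake frame at %p, ip=%#lx\n", (void *)gpregs, gpregs->iret.ip);
	return 0;
}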
@@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
389 | * | 408 | * |
390 | * Processors update CR2 whenever a page fault is detected. If a | 409 | * Processors update CR2 whenever a page fault is detected. If a |
391 | * second page fault occurs while an earlier page fault is being | 410 | * second page fault occurs while an earlier page fault is being |
392 | * deliv- ered, the faulting linear address of the second fault will | 411 | * delivered, the faulting linear address of the second fault will |
393 | * overwrite the contents of CR2 (replacing the previous | 412 | * overwrite the contents of CR2 (replacing the previous |
394 | * address). These updates to CR2 occur even if the page fault | 413 | * address). These updates to CR2 occur even if the page fault |
395 | * results in a double fault or occurs during the delivery of a | 414 | * results in a double fault or occurs during the delivery of a |
@@ -605,14 +624,15 @@ NOKPROBE_SYMBOL(do_int3); | |||
605 | 624 | ||
606 | #ifdef CONFIG_X86_64 | 625 | #ifdef CONFIG_X86_64 |
607 | /* | 626 | /* |
608 | * Help handler running on IST stack to switch off the IST stack if the | 627 | * Help handler running on a per-cpu (IST or entry trampoline) stack |
609 | * interrupted code was in user mode. The actual stack switch is done in | 628 | * to switch to the normal thread stack if the interrupted code was in |
610 | * entry_64.S | 629 | * user mode. The actual stack switch is done in entry_64.S |
611 | */ | 630 | */ |
612 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) | 631 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) |
613 | { | 632 | { |
614 | struct pt_regs *regs = task_pt_regs(current); | 633 | struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; |
615 | *regs = *eregs; | 634 | if (regs != eregs) |
635 | *regs = *eregs; | ||
616 | return regs; | 636 | return regs; |
617 | } | 637 | } |
618 | NOKPROBE_SYMBOL(sync_regs); | 638 | NOKPROBE_SYMBOL(sync_regs); |
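sync_regs() now derives its destination from cpu_current_top_of_stack and copies only when source and destination differ; an entry whose registers already sit in the canonical slot just below the thread stack's top needs no move. A userspace analogue of the shape (names made up):

struct frame_demo { unsigned long ip, cs, flags, sp, ss; };

/* Relocate an entry frame to the slot just below the stack top,
 * skipping the copy when it is already there. */
static struct frame_demo *sync_frame_demo(struct frame_demo *eregs,
					  void *stack_top)
{
	struct frame_demo *regs = (struct frame_demo *)stack_top - 1;

	if (regs != eregs)
		*regs = *eregs;
	return regs;
}

int main(void)
{
	unsigned long stack[32];
	struct frame_demo src = { 1, 2, 3, 4, 5 };
	struct frame_demo *dst = sync_frame_demo(&src, &stack[32]);

	return dst->ip == 1 ? 0 : 1;
}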
@@ -628,13 +648,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) | |||
628 | /* | 648 | /* |
629 | * This is called from entry_64.S early in handling a fault | 649 | * This is called from entry_64.S early in handling a fault |
630 | * caused by a bad iret to user mode. To handle the fault | 650 | * caused by a bad iret to user mode. To handle the fault |
631 | * correctly, we want move our stack frame to task_pt_regs | 651 | * correctly, we want to move our stack frame to where it would |
632 | * and we want to pretend that the exception came from the | 652 | * be had we entered directly on the entry stack (rather than |
633 | * iret target. | 653 | * just below the IRET frame) and we want to pretend that the |
654 | * exception came from the IRET target. | ||
634 | */ | 655 | */ |
635 | struct bad_iret_stack *new_stack = | 656 | struct bad_iret_stack *new_stack = |
636 | container_of(task_pt_regs(current), | 657 | (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; |
637 | struct bad_iret_stack, regs); | ||
638 | 658 | ||
639 | /* Copy the IRET target to the new stack. */ | 659 | /* Copy the IRET target to the new stack. */ |
640 | memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); | 660 | memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); |
@@ -795,14 +815,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
795 | debug_stack_usage_dec(); | 815 | debug_stack_usage_dec(); |
796 | 816 | ||
797 | exit: | 817 | exit: |
798 | #if defined(CONFIG_X86_32) | ||
799 | /* | ||
800 | * This is the most likely code path that involves non-trivial use | ||
801 | * of the SYSENTER stack. Check that we haven't overrun it. | ||
802 | */ | ||
803 | WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, | ||
804 | "Overran or corrupted SYSENTER stack\n"); | ||
805 | #endif | ||
806 | ist_exit(regs); | 818 | ist_exit(regs); |
807 | } | 819 | } |
808 | NOKPROBE_SYMBOL(do_debug); | 820 | NOKPROBE_SYMBOL(do_debug); |
@@ -929,6 +941,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | |||
929 | 941 | ||
930 | void __init trap_init(void) | 942 | void __init trap_init(void) |
931 | { | 943 | { |
944 | /* Init cpu_entry_area before IST entries are set up */ | ||
945 | setup_cpu_entry_areas(); | ||
946 | |||
932 | idt_setup_traps(); | 947 | idt_setup_traps(); |
933 | 948 | ||
934 | /* | 949 | /* |
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a3f973b2c97a..be86a865087a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c | |||
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) | |||
253 | return NULL; | 253 | return NULL; |
254 | } | 254 | } |
255 | 255 | ||
256 | static bool stack_access_ok(struct unwind_state *state, unsigned long addr, | 256 | static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, |
257 | size_t len) | 257 | size_t len) |
258 | { | 258 | { |
259 | struct stack_info *info = &state->stack_info; | 259 | struct stack_info *info = &state->stack_info; |
260 | void *addr = (void *)_addr; | ||
260 | 261 | ||
261 | /* | 262 | if (!on_stack(info, addr, len) && |
262 | * If the address isn't on the current stack, switch to the next one. | 263 | (get_stack_info(addr, state->task, info, &state->stack_mask))) |
263 | * | 264 | return false; |
264 | * We may have to traverse multiple stacks to deal with the possibility | ||
265 | * that info->next_sp could point to an empty stack and the address | ||
266 | * could be on a subsequent stack. | ||
267 | */ | ||
268 | while (!on_stack(info, (void *)addr, len)) | ||
269 | if (get_stack_info(info->next_sp, state->task, info, | ||
270 | &state->stack_mask)) | ||
271 | return false; | ||
272 | 265 | ||
273 | return true; | 266 | return true; |
274 | } | 267 | } |
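The rewritten stack_access_ok() collapses the old next_sp walk into one step: either the range lies on the stack the unwinder is already traversing, or a single get_stack_info() lookup classifies the address directly (and fails cleanly if it is on no valid stack). Roughly, with stubbed helpers whose signatures are invented for the sketch:

#include <stdbool.h>
#include <stddef.h>

struct stack_info_demo { void *begin, *end; };

static bool on_stack_demo(struct stack_info_demo *info, void *addr, size_t len)
{
	return (char *)addr >= (char *)info->begin &&
	       (char *)addr + len <= (char *)info->end;
}

/* Stand-in for get_stack_info(): 0 if addr is on some known stack
 * (updating *info), nonzero otherwise.  Stubbed to always fail here. */
static int classify_stack_demo(void *addr, struct stack_info_demo *info)
{
	(void)addr; (void)info;
	return -1;
}

static bool stack_access_ok_demo(struct stack_info_demo *info,
				 void *addr, size_t len)
{
	/* One direct lookup replaces the loop over info->next_sp. */
	if (!on_stack_demo(info, addr, len) &&
	    classify_stack_demo(addr, info) != 0)
		return false;
	return true;
}

int main(void)
{
	unsigned long stack[16];
	struct stack_info_demo info = { stack, stack + 16 };

	return (stack_access_ok_demo(&info, &stack[4], 8) &&
		!stack_access_ok_demo(&info, stack + 16, 8)) ? 0 : 1;
}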
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, | |||
283 | return true; | 276 | return true; |
284 | } | 277 | } |
285 | 278 | ||
286 | #define REGS_SIZE (sizeof(struct pt_regs)) | ||
287 | #define SP_OFFSET (offsetof(struct pt_regs, sp)) | ||
288 | #define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) | ||
289 | #define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) | ||
290 | |||
291 | static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, | 279 | static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, |
292 | unsigned long *ip, unsigned long *sp, bool full) | 280 | unsigned long *ip, unsigned long *sp) |
293 | { | 281 | { |
294 | size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; | 282 | struct pt_regs *regs = (struct pt_regs *)addr; |
295 | size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; | ||
296 | struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); | ||
297 | |||
298 | if (IS_ENABLED(CONFIG_X86_64)) { | ||
299 | if (!stack_access_ok(state, addr, regs_size)) | ||
300 | return false; | ||
301 | 283 | ||
302 | *ip = regs->ip; | 284 | /* x86-32 support will be more complicated due to the ®s->sp hack */ |
303 | *sp = regs->sp; | 285 | BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); |
304 | 286 | ||
305 | return true; | 287 | if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) |
306 | } | ||
307 | |||
308 | if (!stack_access_ok(state, addr, sp_offset)) | ||
309 | return false; | 288 | return false; |
310 | 289 | ||
311 | *ip = regs->ip; | 290 | *ip = regs->ip; |
291 | *sp = regs->sp; | ||
292 | return true; | ||
293 | } | ||
312 | 294 | ||
313 | if (user_mode(regs)) { | 295 | static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, |
314 | if (!stack_access_ok(state, addr + sp_offset, | 296 | unsigned long *ip, unsigned long *sp) |
315 | REGS_SIZE - SP_OFFSET)) | 297 | { |
316 | return false; | 298 | struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; |
317 | 299 | ||
318 | *sp = regs->sp; | 300 | if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) |
319 | } else | 301 | return false; |
320 | *sp = (unsigned long)®s->sp; | ||
321 | 302 | ||
303 | *ip = regs->ip; | ||
304 | *sp = regs->sp; | ||
322 | return true; | 305 | return true; |
323 | } | 306 | } |
324 | 307 | ||
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) | |||
327 | unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; | 310 | unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; |
328 | enum stack_type prev_type = state->stack_info.type; | 311 | enum stack_type prev_type = state->stack_info.type; |
329 | struct orc_entry *orc; | 312 | struct orc_entry *orc; |
330 | struct pt_regs *ptregs; | ||
331 | bool indirect = false; | 313 | bool indirect = false; |
332 | 314 | ||
333 | if (unwind_done(state)) | 315 | if (unwind_done(state)) |
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state) | |||
435 | break; | 417 | break; |
436 | 418 | ||
437 | case ORC_TYPE_REGS: | 419 | case ORC_TYPE_REGS: |
438 | if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { | 420 | if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { |
439 | orc_warn("can't dereference registers at %p for ip %pB\n", | 421 | orc_warn("can't dereference registers at %p for ip %pB\n", |
440 | (void *)sp, (void *)orig_ip); | 422 | (void *)sp, (void *)orig_ip); |
441 | goto done; | 423 | goto done; |
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) | |||
447 | break; | 429 | break; |
448 | 430 | ||
449 | case ORC_TYPE_REGS_IRET: | 431 | case ORC_TYPE_REGS_IRET: |
450 | if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { | 432 | if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { |
451 | orc_warn("can't dereference iret registers at %p for ip %pB\n", | 433 | orc_warn("can't dereference iret registers at %p for ip %pB\n", |
452 | (void *)sp, (void *)orig_ip); | 434 | (void *)sp, (void *)orig_ip); |
453 | goto done; | 435 | goto done; |
454 | } | 436 | } |
455 | 437 | ||
456 | ptregs = container_of((void *)sp, struct pt_regs, ip); | 438 | state->regs = (void *)sp - IRET_FRAME_OFFSET; |
457 | if ((unsigned long)ptregs >= prev_sp && | 439 | state->full_regs = false; |
458 | on_stack(&state->stack_info, ptregs, REGS_SIZE)) { | ||
459 | state->regs = ptregs; | ||
460 | state->full_regs = false; | ||
461 | } else | ||
462 | state->regs = NULL; | ||
463 | |||
464 | state->signal = true; | 440 | state->signal = true; |
465 | break; | 441 | break; |
466 | 442 | ||
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, | |||
553 | } | 529 | } |
554 | 530 | ||
555 | if (get_stack_info((unsigned long *)state->sp, state->task, | 531 | if (get_stack_info((unsigned long *)state->sp, state->task, |
556 | &state->stack_info, &state->stack_mask)) | 532 | &state->stack_info, &state->stack_mask)) { |
557 | return; | 533 | /* |
534 | * We weren't on a valid stack. It's possible that | ||
535 | * we overflowed a valid stack into a guard page. | ||
536 | * See if the next page up is valid so that we can | ||
537 | * generate some kind of backtrace if this happens. | ||
538 | */ | ||
539 | void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); | ||
540 | if (get_stack_info(next_page, state->task, &state->stack_info, | ||
541 | &state->stack_mask)) | ||
542 | return; | ||
543 | } | ||
558 | 544 | ||
559 | /* | 545 | /* |
560 | * The caller can provide the address of the first frame directly | 546 | * The caller can provide the address of the first frame directly |
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be87..d2a8b5a24a44 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -107,6 +107,15 @@ SECTIONS | |||
107 | SOFTIRQENTRY_TEXT | 107 | SOFTIRQENTRY_TEXT |
108 | *(.fixup) | 108 | *(.fixup) |
109 | *(.gnu.warning) | 109 | *(.gnu.warning) |
110 | |||
111 | #ifdef CONFIG_X86_64 | ||
112 | . = ALIGN(PAGE_SIZE); | ||
113 | _entry_trampoline = .; | ||
114 | *(.entry_trampoline) | ||
115 | . = ALIGN(PAGE_SIZE); | ||
116 | ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); | ||
117 | #endif | ||
118 | |||
110 | /* End of text section */ | 119 | /* End of text section */ |
111 | _etext = .; | 120 | _etext = .; |
112 | } :text = 0x9090 | 121 | } :text = 0x9090 |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eba631c4dbd..023afa0c8887 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2302 | * processors. See 22.2.4. | 2302 | * processors. See 22.2.4. |
2303 | */ | 2303 | */ |
2304 | vmcs_writel(HOST_TR_BASE, | 2304 | vmcs_writel(HOST_TR_BASE, |
2305 | (unsigned long)this_cpu_ptr(&cpu_tss)); | 2305 | (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); |
2306 | vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ | 2306 | vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ |
2307 | 2307 | ||
2308 | /* | 2308 | /* |
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 553f8fd23cc4..4846eff7e4c8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c | |||
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops) | |||
107 | delay = min_t(u64, MWAITX_MAX_LOOPS, loops); | 107 | delay = min_t(u64, MWAITX_MAX_LOOPS, loops); |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * Use cpu_tss as a cacheline-aligned, seldom | 110 | * Use cpu_tss_rw as a cacheline-aligned, seldom |
111 | * accessed per-cpu variable as the monitor target. | 111 | * accessed per-cpu variable as the monitor target. |
112 | */ | 112 | */ |
113 | __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); | 113 | __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); |
114 | 114 | ||
115 | /* | 115 | /* |
116 | * AMD, like Intel, supports the EAX hint and EAX=0xf | 116 | * AMD, like Intel, supports the EAX hint and EAX=0xf |
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 99dfed6dfef8..9ec70d780f1f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c | |||
@@ -277,6 +277,7 @@ void __init kasan_early_init(void) | |||
277 | void __init kasan_init(void) | 277 | void __init kasan_init(void) |
278 | { | 278 | { |
279 | int i; | 279 | int i; |
280 | void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; | ||
280 | 281 | ||
281 | #ifdef CONFIG_KASAN_INLINE | 282 | #ifdef CONFIG_KASAN_INLINE |
282 | register_die_notifier(&kasan_die_notifier); | 283 | register_die_notifier(&kasan_die_notifier); |
@@ -329,8 +330,23 @@ void __init kasan_init(void) | |||
329 | (unsigned long)kasan_mem_to_shadow(_end), | 330 | (unsigned long)kasan_mem_to_shadow(_end), |
330 | early_pfn_to_nid(__pa(_stext))); | 331 | early_pfn_to_nid(__pa(_stext))); |
331 | 332 | ||
333 | shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); | ||
334 | shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); | ||
335 | shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, | ||
336 | PAGE_SIZE); | ||
337 | |||
338 | shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); | ||
339 | shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); | ||
340 | shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, | ||
341 | PAGE_SIZE); | ||
342 | |||
332 | kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), | 343 | kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), |
333 | (void *)KASAN_SHADOW_END); | 344 | shadow_cpu_entry_begin); |
345 | |||
346 | kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, | ||
347 | (unsigned long)shadow_cpu_entry_end, 0); | ||
348 | |||
349 | kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); | ||
334 | 350 | ||
335 | load_cr3(init_top_pgt); | 351 | load_cr3(init_top_pgt); |
336 | __flush_tlb_all(); | 352 | __flush_tlb_all(); |
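The kasan_init() change punches the cpu_entry_area fixmap range out of the read-only zero shadow and backs it with real shadow pages, since KASAN must now track accesses to the remapped TSS and entry stacks. The explicit round_down()/round_up() calls exist because the shadow mapping compresses addresses 8-to-1, so a page-aligned region does not generally have page-aligned shadow. The mapping itself is KASAN's standard formula (the offset below is made up):

#include <stdio.h>

#define DEMO_SHADOW_SCALE_SHIFT	3		/* 8 bytes per shadow byte */
#define DEMO_SHADOW_OFFSET	0x100000UL	/* hypothetical offset */

static unsigned long mem_to_shadow_demo(unsigned long addr)
{
	return (addr >> DEMO_SHADOW_SCALE_SHIFT) + DEMO_SHADOW_OFFSET;
}

int main(void)
{
	unsigned long begin = 0x200000UL;	/* page aligned */
	unsigned long end   = begin + 0x3000UL;	/* three pages */

	/* 12K of memory shrinks to 1.5K of shadow, no longer page
	 * aligned, hence the rounding in kasan_init(). */
	printf("shadow: %#lx..%#lx\n",
	       mem_to_shadow_demo(begin), mem_to_shadow_demo(end));
	return 0;
}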
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 36a28eddb435..a7d966964c6f 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c | |||
@@ -152,17 +152,19 @@ static void do_fpu_end(void) | |||
152 | static void fix_processor_context(void) | 152 | static void fix_processor_context(void) |
153 | { | 153 | { |
154 | int cpu = smp_processor_id(); | 154 | int cpu = smp_processor_id(); |
155 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); | ||
156 | #ifdef CONFIG_X86_64 | 155 | #ifdef CONFIG_X86_64 |
157 | struct desc_struct *desc = get_cpu_gdt_rw(cpu); | 156 | struct desc_struct *desc = get_cpu_gdt_rw(cpu); |
158 | tss_desc tss; | 157 | tss_desc tss; |
159 | #endif | 158 | #endif |
160 | set_tss_desc(cpu, t); /* | 159 | |
161 | * This just modifies memory; should not be | 160 | /* |
162 | * necessary. But... This is necessary, because | 161 | * We need to reload TR, which requires that we change the |
163 | * 386 hardware has concept of busy TSS or some | 162 | * GDT entry to indicate "available" first. |
164 | * similar stupidity. | 163 | * |
165 | */ | 164 | * XXX: This could probably all be replaced by a call to |
165 | * force_reload_TR(). | ||
166 | */ | ||
167 | set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); | ||
166 | 168 | ||
167 | #ifdef CONFIG_X86_64 | 169 | #ifdef CONFIG_X86_64 |
168 | memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); | 170 | memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); |
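The replaced comment in fix_processor_context() alludes to a real x86 quirk: LTR marks the referenced TSS descriptor busy in the GDT, and loading TR while the descriptor is still marked busy raises #GP. So after resume the descriptor's type nibble must go back from busy (0xB) to available (0x9) before TR can be reloaded; set_tss_desc() rewrites the whole descriptor, which has the same effect. The bit-flip half in miniature (descriptor reduced to its type field):

#include <stdio.h>

#define TSS_TYPE_AVAILABLE	0x9	/* 64-bit TSS, available */
#define TSS_TYPE_BUSY		0xB	/* 64-bit TSS, set busy by LTR */

struct tss_desc_demo {
	unsigned char type;		/* just the 4-bit type, isolated */
};

/* What a force_reload_TR()-style helper must do before LTR. */
static void mark_tss_available_demo(struct tss_desc_demo *d)
{
	if (d->type == TSS_TYPE_BUSY)
		d->type = TSS_TYPE_AVAILABLE;
}

int main(void)
{
	struct tss_desc_demo d = { .type = TSS_TYPE_BUSY };

	mark_tss_available_demo(&d);
	printf("type=%#x (TR can be reloaded)\n", d.type);
	return 0;
}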
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f2414c6c5e7c..7beeee1443b3 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c | |||
@@ -826,7 +826,7 @@ static void xen_load_sp0(unsigned long sp0) | |||
826 | mcs = xen_mc_entry(0); | 826 | mcs = xen_mc_entry(0); |
827 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); | 827 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); |
828 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 828 | xen_mc_issue(PARAVIRT_LAZY_CPU); |
829 | this_cpu_write(cpu_tss.x86_tss.sp0, sp0); | 829 | this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); |
830 | } | 830 | } |
831 | 831 | ||
832 | void xen_set_iopl_mask(unsigned mask) | 832 | void xen_set_iopl_mask(unsigned mask) |
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index fc048ec686e7..6cf801ca1142 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c | |||
@@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
2272 | #endif | 2272 | #endif |
2273 | case FIX_TEXT_POKE0: | 2273 | case FIX_TEXT_POKE0: |
2274 | case FIX_TEXT_POKE1: | 2274 | case FIX_TEXT_POKE1: |
2275 | case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: | 2275 | case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: |
2276 | /* All local page mappings */ | 2276 | /* All local page mappings */ |
2277 | pte = pfn_pte(phys, prot); | 2277 | pte = pfn_pte(phys, prot); |
2278 | break; | 2278 | break; |