author     Linus Torvalds <torvalds@linux-foundation.org>  2017-12-29 20:02:49 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-12-29 20:02:49 -0500
commit     5aa90a84589282b87666f92b6c3c917c8080a9bf (patch)
tree       b03c3c5879240496fda0c43e070a89b327a894de
parent     61233580f1f33c50e159c50e24d80ffd2ba2e06b (diff)
parent     9f5cb6b32d9e0a3a7453222baaf15664d92adbf2 (diff)
Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required to
     get in and out of user space into the user space visible page tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching along with documentation how the
     ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans and extra debugfs files to analyze both the kernel and the
     user space visible page tables

  The whole functionality is compile time controlled via a config switch
  and can be turned on/off on the command line as well"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
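The CR3 switching summarized above comes down to toggling two bits in the CR3 value on every kernel entry and exit. A minimal illustrative sketch follows (ordinary user-space C, not kernel code; the helpers to_user_cr3()/to_kernel_cr3() and the sample CR3 value are invented for illustration, while the bit positions and mask mirror the arch/x86/entry/calling.h and processor-flags.h hunks below):

#include <stdint.h>
#include <stdio.h>

/* Values matching this merge: bit 12 selects the user half of the 8k PGD,
 * bit 11 selects the user PCID. */
#define PAGE_SHIFT               12
#define X86_CR3_PTI_SWITCH_BIT   11
#define PTI_SWITCH_PGTABLES_MASK (1UL << PAGE_SHIFT)
#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK | (1UL << X86_CR3_PTI_SWITCH_BIT))

/* Illustrative helpers (not kernel functions): model what
 * SWITCH_TO_USER_CR3 / ADJUST_KERNEL_CR3 do to the CR3 bits. */
static uint64_t to_user_cr3(uint64_t kernel_cr3)
{
	/* Point CR3 at the user half of the 8k PGD and at the user PCID */
	return kernel_cr3 | PTI_SWITCH_MASK;
}

static uint64_t to_kernel_cr3(uint64_t any_cr3)
{
	/* Clear both bits: back to the kernel half and the kernel PCID */
	return any_cr3 & ~PTI_SWITCH_MASK;
}

int main(void)
{
	uint64_t kernel_cr3 = 0x000000012a5b8001ULL; /* made-up PGD + kPCID 1 */
	uint64_t user_cr3 = to_user_cr3(kernel_cr3);

	printf("kernel CR3: %#llx\n", (unsigned long long)kernel_cr3);
	printf("user   CR3: %#llx\n", (unsigned long long)user_cr3);
	printf("back      : %#llx\n", (unsigned long long)to_kernel_cr3(user_cr3));
	return 0;
}

Note how OR-ing the mask turns PCID 1 into PCID 0x801 (2049), which is the uPCID = kPCID + 2048 convention documented in the tlbflush.h changes further down.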
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt |   8
-rw-r--r--  Documentation/x86/x86_64/mm.txt                 |   5
-rw-r--r--  arch/x86/boot/compressed/pagetable.c            |   3
-rw-r--r--  arch/x86/entry/calling.h                        | 145
-rw-r--r--  arch/x86/entry/entry_64.S                       |  48
-rw-r--r--  arch/x86/entry/entry_64_compat.S                |  24
-rw-r--r--  arch/x86/entry/vsyscall/vsyscall_64.c           |   6
-rw-r--r--  arch/x86/events/intel/ds.c                      | 130
-rw-r--r--  arch/x86/events/perf_event.h                    |  23
-rw-r--r--  arch/x86/include/asm/cpu_entry_area.h           |  13
-rw-r--r--  arch/x86/include/asm/cpufeatures.h              |   4
-rw-r--r--  arch/x86/include/asm/desc.h                     |   2
-rw-r--r--  arch/x86/include/asm/disabled-features.h        |   8
-rw-r--r--  arch/x86/include/asm/intel_ds.h                 |  36
-rw-r--r--  arch/x86/include/asm/mmu_context.h              |  59
-rw-r--r--  arch/x86/include/asm/pgalloc.h                  |  11
-rw-r--r--  arch/x86/include/asm/pgtable.h                  |  30
-rw-r--r--  arch/x86/include/asm/pgtable_64.h               |  92
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h         |   8
-rw-r--r--  arch/x86/include/asm/processor-flags.h          |   5
-rw-r--r--  arch/x86/include/asm/processor.h                |  23
-rw-r--r--  arch/x86/include/asm/pti.h                      |  14
-rw-r--r--  arch/x86/include/asm/tlbflush.h                 | 202
-rw-r--r--  arch/x86/include/asm/vsyscall.h                 |   1
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h     |   7
-rw-r--r--  arch/x86/kernel/asm-offsets.c                   |   4
-rw-r--r--  arch/x86/kernel/cpu/common.c                    |   9
-rw-r--r--  arch/x86/kernel/dumpstack.c                     |   6
-rw-r--r--  arch/x86/kernel/head_64.S                       |  30
-rw-r--r--  arch/x86/kernel/ldt.c                           | 144
-rw-r--r--  arch/x86/kernel/tls.c                           |  11
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S                   |   8
-rw-r--r--  arch/x86/mm/Makefile                            |   7
-rw-r--r--  arch/x86/mm/cpu_entry_area.c                    |  27
-rw-r--r--  arch/x86/mm/debug_pagetables.c                  |  80
-rw-r--r--  arch/x86/mm/dump_pagetables.c                   |  43
-rw-r--r--  arch/x86/mm/init.c                              |  80
-rw-r--r--  arch/x86/mm/pgtable.c                           |   5
-rw-r--r--  arch/x86/mm/pti.c                               | 387
-rw-r--r--  arch/x86/mm/tlb.c                               |  58
-rw-r--r--  arch/x86/platform/efi/efi_64.c                  |   5
-rw-r--r--  include/linux/pti.h                             |  11
-rw-r--r--  init/main.c                                     |   3
-rw-r--r--  security/Kconfig                                |  10
-rw-r--r--  tools/testing/selftests/x86/ldt_gdt.c           |   3
45 files changed, 1636 insertions, 202 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6571fbfdb2a1..e49311d53504 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2708,6 +2708,8 @@
2708 steal time is computed, but won't influence scheduler 2708 steal time is computed, but won't influence scheduler
2709 behaviour 2709 behaviour
2710 2710
2711 nopti [X86-64] Disable kernel page table isolation
2712
2711 nolapic [X86-32,APIC] Do not enable or use the local APIC. 2713 nolapic [X86-32,APIC] Do not enable or use the local APIC.
2712 2714
2713 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 2715 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
3282 pt. [PARIDE] 3284 pt. [PARIDE]
3283 See Documentation/blockdev/paride.txt. 3285 See Documentation/blockdev/paride.txt.
3284 3286
3287 pti= [X86_64]
3288 Control user/kernel address space isolation:
3289 on - enable
3290 off - disable
3291 auto - default setting
3292
3285 pty.legacy_count= 3293 pty.legacy_count=
3286 [KNL] Number of legacy pty's. Overwrites compiled-in 3294 [KNL] Number of legacy pty's. Overwrites compiled-in
3287 default number. 3295 default number.
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 51101708a03a..ad41b3813f0a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
12... unused hole ... 12... unused hole ...
13ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) 13ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
14... unused hole ... 14... unused hole ...
15fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
15fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping 16fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
16ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks 17ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
17... unused hole ... 18... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
29hole caused by [56:63] sign extension 30hole caused by [56:63] sign extension
30ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor 31ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
31ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory 32ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
32ff90000000000000 - ff91ffffffffffff (=49 bits) hole 33ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
33ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space 34ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
34ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole 35ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
35ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) 36ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
36... unused hole ... 37... unused hole ...
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index d5364ca2e3f9..b5e5e02f8cde 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -23,6 +23,9 @@
23 */ 23 */
24#undef CONFIG_AMD_MEM_ENCRYPT 24#undef CONFIG_AMD_MEM_ENCRYPT
25 25
26/* No PAGE_TABLE_ISOLATION support needed either: */
27#undef CONFIG_PAGE_TABLE_ISOLATION
28
26#include "misc.h" 29#include "misc.h"
27 30
28/* These actually do the work of building the kernel identity maps. */ 31/* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3fd8bc560fae..45a63e00a6af 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,6 +1,11 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/jump_label.h> 2#include <linux/jump_label.h>
3#include <asm/unwind_hints.h> 3#include <asm/unwind_hints.h>
4#include <asm/cpufeatures.h>
5#include <asm/page_types.h>
6#include <asm/percpu.h>
7#include <asm/asm-offsets.h>
8#include <asm/processor-flags.h>
4 9
5/* 10/*
6 11
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
187#endif 192#endif
188.endm 193.endm
189 194
195#ifdef CONFIG_PAGE_TABLE_ISOLATION
196
197/*
198 * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
199 * halves:
200 */
201#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
202#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
203
204.macro SET_NOFLUSH_BIT reg:req
205 bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
206.endm
207
208.macro ADJUST_KERNEL_CR3 reg:req
209 ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
210 /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
211 andq $(~PTI_SWITCH_MASK), \reg
212.endm
213
214.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
215 ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
216 mov %cr3, \scratch_reg
217 ADJUST_KERNEL_CR3 \scratch_reg
218 mov \scratch_reg, %cr3
219.Lend_\@:
220.endm
221
222#define THIS_CPU_user_pcid_flush_mask \
223 PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
224
225.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
226 ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
227 mov %cr3, \scratch_reg
228
229 ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
230
231 /*
232 * Test if the ASID needs a flush.
233 */
234 movq \scratch_reg, \scratch_reg2
235 andq $(0x7FF), \scratch_reg /* mask ASID */
236 bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
237 jnc .Lnoflush_\@
238
239 /* Flush needed, clear the bit */
240 btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
241 movq \scratch_reg2, \scratch_reg
242 jmp .Lwrcr3_\@
243
244.Lnoflush_\@:
245 movq \scratch_reg2, \scratch_reg
246 SET_NOFLUSH_BIT \scratch_reg
247
248.Lwrcr3_\@:
249 /* Flip the PGD and ASID to the user version */
250 orq $(PTI_SWITCH_MASK), \scratch_reg
251 mov \scratch_reg, %cr3
252.Lend_\@:
253.endm
254
255.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
256 pushq %rax
257 SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
258 popq %rax
259.endm
260
261.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
262 ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
263 movq %cr3, \scratch_reg
264 movq \scratch_reg, \save_reg
265 /*
266 * Is the "switch mask" all zero? That means that both of
267 * these are zero:
268 *
269 * 1. The user/kernel PCID bit, and
270 * 2. The user/kernel "bit" that points CR3 to the
271 * bottom half of the 8k PGD
272 *
273 * That indicates a kernel CR3 value, not a user CR3.
274 */
275 testq $(PTI_SWITCH_MASK), \scratch_reg
276 jz .Ldone_\@
277
278 ADJUST_KERNEL_CR3 \scratch_reg
279 movq \scratch_reg, %cr3
280
281.Ldone_\@:
282.endm
283
284.macro RESTORE_CR3 scratch_reg:req save_reg:req
285 ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
286
287 ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
288
289 /*
290 * KERNEL pages can always resume with NOFLUSH as we do
291 * explicit flushes.
292 */
293 bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
294 jnc .Lnoflush_\@
295
296 /*
297 * Check if there's a pending flush for the user ASID we're
298 * about to set.
299 */
300 movq \save_reg, \scratch_reg
301 andq $(0x7FF), \scratch_reg
302 bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
303 jnc .Lnoflush_\@
304
305 btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
306 jmp .Lwrcr3_\@
307
308.Lnoflush_\@:
309 SET_NOFLUSH_BIT \save_reg
310
311.Lwrcr3_\@:
312 /*
313 * The CR3 write could be avoided when not changing its value,
314 * but would require a CR3 read *and* a scratch register.
315 */
316 movq \save_reg, %cr3
317.Lend_\@:
318.endm
319
320#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
321
322.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
323.endm
324.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
325.endm
326.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
327.endm
328.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
329.endm
330.macro RESTORE_CR3 scratch_reg:req save_reg:req
331.endm
332
333#endif
334
190#endif /* CONFIG_X86_64 */ 335#endif /* CONFIG_X86_64 */
191 336
192/* 337/*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3d19c830e1b1..f048e384ff54 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -23,7 +23,6 @@
23#include <asm/segment.h> 23#include <asm/segment.h>
24#include <asm/cache.h> 24#include <asm/cache.h>
25#include <asm/errno.h> 25#include <asm/errno.h>
26#include "calling.h"
27#include <asm/asm-offsets.h> 26#include <asm/asm-offsets.h>
28#include <asm/msr.h> 27#include <asm/msr.h>
29#include <asm/unistd.h> 28#include <asm/unistd.h>
@@ -40,6 +39,8 @@
40#include <asm/frame.h> 39#include <asm/frame.h>
41#include <linux/err.h> 40#include <linux/err.h>
42 41
42#include "calling.h"
43
43.code64 44.code64
44.section .entry.text, "ax" 45.section .entry.text, "ax"
45 46
@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
168 /* Stash the user RSP. */ 169 /* Stash the user RSP. */
169 movq %rsp, RSP_SCRATCH 170 movq %rsp, RSP_SCRATCH
170 171
172 /* Note: using %rsp as a scratch reg. */
173 SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
174
171 /* Load the top of the task stack into RSP */ 175 /* Load the top of the task stack into RSP */
172 movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp 176 movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
173 177
@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
207 */ 211 */
208 212
209 swapgs 213 swapgs
214 /*
215 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
216 * is not required to switch CR3.
217 */
210 movq %rsp, PER_CPU_VAR(rsp_scratch) 218 movq %rsp, PER_CPU_VAR(rsp_scratch)
211 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 219 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
212 220
@@ -403,6 +411,7 @@ syscall_return_via_sysret:
403 * We are on the trampoline stack. All regs except RDI are live. 411 * We are on the trampoline stack. All regs except RDI are live.
404 * We can do future final exit work right here. 412 * We can do future final exit work right here.
405 */ 413 */
414 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
406 415
407 popq %rdi 416 popq %rdi
408 popq %rsp 417 popq %rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
740 * We can do future final exit work right here. 749 * We can do future final exit work right here.
741 */ 750 */
742 751
752 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
753
743 /* Restore RDI. */ 754 /* Restore RDI. */
744 popq %rdi 755 popq %rdi
745 SWAPGS 756 SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
822 */ 833 */
823 834
824 pushq %rdi /* Stash user RDI */ 835 pushq %rdi /* Stash user RDI */
825 SWAPGS 836 SWAPGS /* to kernel GS */
837 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
838
826 movq PER_CPU_VAR(espfix_waddr), %rdi 839 movq PER_CPU_VAR(espfix_waddr), %rdi
827 movq %rax, (0*8)(%rdi) /* user RAX */ 840 movq %rax, (0*8)(%rdi) /* user RAX */
828 movq (1*8)(%rsp), %rax /* user RIP */ 841 movq (1*8)(%rsp), %rax /* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
838 /* Now RAX == RSP. */ 851 /* Now RAX == RSP. */
839 852
840 andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ 853 andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
841 popq %rdi /* Restore user RDI */
842 854
843 /* 855 /*
844 * espfix_stack[31:16] == 0. The page tables are set up such that 856 * espfix_stack[31:16] == 0. The page tables are set up such that
@@ -849,7 +861,11 @@ native_irq_return_ldt:
849 * still points to an RO alias of the ESPFIX stack. 861 * still points to an RO alias of the ESPFIX stack.
850 */ 862 */
851 orq PER_CPU_VAR(espfix_stack), %rax 863 orq PER_CPU_VAR(espfix_stack), %rax
852 SWAPGS 864
865 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
866 SWAPGS /* to user GS */
867 popq %rdi /* Restore user RDI */
868
853 movq %rax, %rsp 869 movq %rax, %rsp
854 UNWIND_HINT_IRET_REGS offset=8 870 UNWIND_HINT_IRET_REGS offset=8
855 871
@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
949 UNWIND_HINT_FUNC 965 UNWIND_HINT_FUNC
950 966
951 pushq %rdi 967 pushq %rdi
968 /* Need to switch before accessing the thread stack. */
969 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
952 movq %rsp, %rdi 970 movq %rsp, %rdi
953 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 971 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
954 UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI 972 UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
1250 js 1f /* negative -> in kernel */ 1268 js 1f /* negative -> in kernel */
1251 SWAPGS 1269 SWAPGS
1252 xorl %ebx, %ebx 1270 xorl %ebx, %ebx
12531: ret 1271
12721:
1273 SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
1274
1275 ret
1254END(paranoid_entry) 1276END(paranoid_entry)
1255 1277
1256/* 1278/*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
1272 testl %ebx, %ebx /* swapgs needed? */ 1294 testl %ebx, %ebx /* swapgs needed? */
1273 jnz .Lparanoid_exit_no_swapgs 1295 jnz .Lparanoid_exit_no_swapgs
1274 TRACE_IRQS_IRETQ 1296 TRACE_IRQS_IRETQ
1297 RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
1275 SWAPGS_UNSAFE_STACK 1298 SWAPGS_UNSAFE_STACK
1276 jmp .Lparanoid_exit_restore 1299 jmp .Lparanoid_exit_restore
1277.Lparanoid_exit_no_swapgs: 1300.Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
1299 * from user mode due to an IRET fault. 1322 * from user mode due to an IRET fault.
1300 */ 1323 */
1301 SWAPGS 1324 SWAPGS
1325 /* We have user CR3. Change to kernel CR3. */
1326 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1302 1327
1303.Lerror_entry_from_usermode_after_swapgs: 1328.Lerror_entry_from_usermode_after_swapgs:
1304 /* Put us onto the real thread stack. */ 1329 /* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
1345 * .Lgs_change's error handler with kernel gsbase. 1370 * .Lgs_change's error handler with kernel gsbase.
1346 */ 1371 */
1347 SWAPGS 1372 SWAPGS
1373 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1348 jmp .Lerror_entry_done 1374 jmp .Lerror_entry_done
1349 1375
1350.Lbstep_iret: 1376.Lbstep_iret:
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)
1354 1380
1355.Lerror_bad_iret: 1381.Lerror_bad_iret:
1356 /* 1382 /*
1357 * We came from an IRET to user mode, so we have user gsbase. 1383 * We came from an IRET to user mode, so we have user
1358 * Switch to kernel gsbase: 1384 * gsbase and CR3. Switch to kernel gsbase and CR3:
1359 */ 1385 */
1360 SWAPGS 1386 SWAPGS
1387 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1361 1388
1362 /* 1389 /*
1363 * Pretend that the exception came from user mode: set up pt_regs 1390 * Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
1389/* 1416/*
1390 * Runs on exception stack. Xen PV does not go through this path at all, 1417 * Runs on exception stack. Xen PV does not go through this path at all,
1391 * so we can use real assembly here. 1418 * so we can use real assembly here.
1419 *
1420 * Registers:
1421 * %r14: Used to save/restore the CR3 of the interrupted context
1422 * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
1392 */ 1423 */
1393ENTRY(nmi) 1424ENTRY(nmi)
1394 UNWIND_HINT_IRET_REGS 1425 UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)
1452 1483
1453 swapgs 1484 swapgs
1454 cld 1485 cld
1486 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
1455 movq %rsp, %rdx 1487 movq %rsp, %rdx
1456 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 1488 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
1457 UNWIND_HINT_IRET_REGS base=%rdx offset=8 1489 UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
1704 movq $-1, %rsi 1736 movq $-1, %rsi
1705 call do_nmi 1737 call do_nmi
1706 1738
1739 RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
1740
1707 testl %ebx, %ebx /* swapgs needed? */ 1741 testl %ebx, %ebx /* swapgs needed? */
1708 jnz nmi_restore 1742 jnz nmi_restore
1709nmi_swapgs: 1743nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 95ad40eb7eff..40f17009ec20 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -49,6 +49,10 @@
49ENTRY(entry_SYSENTER_compat) 49ENTRY(entry_SYSENTER_compat)
50 /* Interrupts are off on entry. */ 50 /* Interrupts are off on entry. */
51 SWAPGS 51 SWAPGS
52
53 /* We are about to clobber %rsp anyway, clobbering here is OK */
54 SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
55
52 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 56 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
53 57
54 /* 58 /*
@@ -216,6 +220,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
216 pushq $0 /* pt_regs->r15 = 0 */ 220 pushq $0 /* pt_regs->r15 = 0 */
217 221
218 /* 222 /*
223 * We just saved %rdi so it is safe to clobber. It is not
224 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
225 */
226 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
227
228 /*
219 * User mode is traced as though IRQs are on, and SYSENTER 229 * User mode is traced as though IRQs are on, and SYSENTER
220 * turned them off. 230 * turned them off.
221 */ 231 */
@@ -256,10 +266,22 @@ sysret32_from_system_call:
256 * when the system call started, which is already known to user 266 * when the system call started, which is already known to user
257 * code. We zero R8-R10 to avoid info leaks. 267 * code. We zero R8-R10 to avoid info leaks.
258 */ 268 */
269 movq RSP-ORIG_RAX(%rsp), %rsp
270
271 /*
272 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
273 * on the process stack which is not mapped to userspace and
274 * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
275 * switch until after the last reference to the process
276 * stack.
277 *
278 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
279 */
280 SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
281
259 xorq %r8, %r8 282 xorq %r8, %r8
260 xorq %r9, %r9 283 xorq %r9, %r9
261 xorq %r10, %r10 284 xorq %r10, %r10
262 movq RSP-ORIG_RAX(%rsp), %rsp
263 swapgs 285 swapgs
264 sysretl 286 sysretl
265END(entry_SYSCALL_compat) 287END(entry_SYSCALL_compat)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1faf40f2dda9..577fa8adb785 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
344 * vsyscalls but leave the page not present. If so, we skip calling 344 * vsyscalls but leave the page not present. If so, we skip calling
345 * this. 345 * this.
346 */ 346 */
347static void __init set_vsyscall_pgtable_user_bits(void) 347void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
348{ 348{
349 pgd_t *pgd; 349 pgd_t *pgd;
350 p4d_t *p4d; 350 p4d_t *p4d;
351 pud_t *pud; 351 pud_t *pud;
352 pmd_t *pmd; 352 pmd_t *pmd;
353 353
354 pgd = pgd_offset_k(VSYSCALL_ADDR); 354 pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
355 set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); 355 set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
356 p4d = p4d_offset(pgd, VSYSCALL_ADDR); 356 p4d = p4d_offset(pgd, VSYSCALL_ADDR);
357#if CONFIG_PGTABLE_LEVELS >= 5 357#if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
373 vsyscall_mode == NATIVE 373 vsyscall_mode == NATIVE
374 ? PAGE_KERNEL_VSYSCALL 374 ? PAGE_KERNEL_VSYSCALL
375 : PAGE_KERNEL_VVAR); 375 : PAGE_KERNEL_VVAR);
376 set_vsyscall_pgtable_user_bits(); 376 set_vsyscall_pgtable_user_bits(swapper_pg_dir);
377 } 377 }
378 378
379 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != 379 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3674a4b6f8bd..8f0aace08b87 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,16 +3,18 @@
3#include <linux/types.h> 3#include <linux/types.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include <asm/cpu_entry_area.h>
6#include <asm/perf_event.h> 7#include <asm/perf_event.h>
7#include <asm/insn.h> 8#include <asm/insn.h>
8 9
9#include "../perf_event.h" 10#include "../perf_event.h"
10 11
12/* Waste a full page so it can be mapped into the cpu_entry_area */
13DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
14
11/* The size of a BTS record in bytes: */ 15/* The size of a BTS record in bytes: */
12#define BTS_RECORD_SIZE 24 16#define BTS_RECORD_SIZE 24
13 17
14#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
15#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
16#define PEBS_FIXUP_SIZE PAGE_SIZE 18#define PEBS_FIXUP_SIZE PAGE_SIZE
17 19
18/* 20/*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
279 281
280static DEFINE_PER_CPU(void *, insn_buffer); 282static DEFINE_PER_CPU(void *, insn_buffer);
281 283
282static int alloc_pebs_buffer(int cpu) 284static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
283{ 285{
284 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 286 phys_addr_t pa;
287 size_t msz = 0;
288
289 pa = virt_to_phys(addr);
290 for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
291 cea_set_pte(cea, pa, prot);
292}
293
294static void ds_clear_cea(void *cea, size_t size)
295{
296 size_t msz = 0;
297
298 for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
299 cea_set_pte(cea, 0, PAGE_NONE);
300}
301
302static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
303{
304 unsigned int order = get_order(size);
285 int node = cpu_to_node(cpu); 305 int node = cpu_to_node(cpu);
286 int max; 306 struct page *page;
287 void *buffer, *ibuffer; 307
308 page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
309 return page ? page_address(page) : NULL;
310}
311
312static void dsfree_pages(const void *buffer, size_t size)
313{
314 if (buffer)
315 free_pages((unsigned long)buffer, get_order(size));
316}
317
318static int alloc_pebs_buffer(int cpu)
319{
320 struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
321 struct debug_store *ds = hwev->ds;
322 size_t bsiz = x86_pmu.pebs_buffer_size;
323 int max, node = cpu_to_node(cpu);
324 void *buffer, *ibuffer, *cea;
288 325
289 if (!x86_pmu.pebs) 326 if (!x86_pmu.pebs)
290 return 0; 327 return 0;
291 328
292 buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); 329 buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
293 if (unlikely(!buffer)) 330 if (unlikely(!buffer))
294 return -ENOMEM; 331 return -ENOMEM;
295 332
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
300 if (x86_pmu.intel_cap.pebs_format < 2) { 337 if (x86_pmu.intel_cap.pebs_format < 2) {
301 ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); 338 ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
302 if (!ibuffer) { 339 if (!ibuffer) {
303 kfree(buffer); 340 dsfree_pages(buffer, bsiz);
304 return -ENOMEM; 341 return -ENOMEM;
305 } 342 }
306 per_cpu(insn_buffer, cpu) = ibuffer; 343 per_cpu(insn_buffer, cpu) = ibuffer;
307 } 344 }
308 345 hwev->ds_pebs_vaddr = buffer;
309 max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; 346 /* Update the cpu entry area mapping */
310 347 cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
311 ds->pebs_buffer_base = (u64)(unsigned long)buffer; 348 ds->pebs_buffer_base = (unsigned long) cea;
349 ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
312 ds->pebs_index = ds->pebs_buffer_base; 350 ds->pebs_index = ds->pebs_buffer_base;
313 ds->pebs_absolute_maximum = ds->pebs_buffer_base + 351 max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
314 max * x86_pmu.pebs_record_size; 352 ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
315
316 return 0; 353 return 0;
317} 354}
318 355
319static void release_pebs_buffer(int cpu) 356static void release_pebs_buffer(int cpu)
320{ 357{
321 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 358 struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
359 struct debug_store *ds = hwev->ds;
360 void *cea;
322 361
323 if (!ds || !x86_pmu.pebs) 362 if (!ds || !x86_pmu.pebs)
324 return; 363 return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
326 kfree(per_cpu(insn_buffer, cpu)); 365 kfree(per_cpu(insn_buffer, cpu));
327 per_cpu(insn_buffer, cpu) = NULL; 366 per_cpu(insn_buffer, cpu) = NULL;
328 367
329 kfree((void *)(unsigned long)ds->pebs_buffer_base); 368 /* Clear the fixmap */
369 cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
370 ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
330 ds->pebs_buffer_base = 0; 371 ds->pebs_buffer_base = 0;
372 dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
373 hwev->ds_pebs_vaddr = NULL;
331} 374}
332 375
333static int alloc_bts_buffer(int cpu) 376static int alloc_bts_buffer(int cpu)
334{ 377{
335 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 378 struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
336 int node = cpu_to_node(cpu); 379 struct debug_store *ds = hwev->ds;
337 int max, thresh; 380 void *buffer, *cea;
338 void *buffer; 381 int max;
339 382
340 if (!x86_pmu.bts) 383 if (!x86_pmu.bts)
341 return 0; 384 return 0;
342 385
343 buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); 386 buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
344 if (unlikely(!buffer)) { 387 if (unlikely(!buffer)) {
345 WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); 388 WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
346 return -ENOMEM; 389 return -ENOMEM;
347 } 390 }
348 391 hwev->ds_bts_vaddr = buffer;
349 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; 392 /* Update the fixmap */
350 thresh = max / 16; 393 cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
351 394 ds->bts_buffer_base = (unsigned long) cea;
352 ds->bts_buffer_base = (u64)(unsigned long)buffer; 395 ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
353 ds->bts_index = ds->bts_buffer_base; 396 ds->bts_index = ds->bts_buffer_base;
354 ds->bts_absolute_maximum = ds->bts_buffer_base + 397 max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
355 max * BTS_RECORD_SIZE; 398 ds->bts_absolute_maximum = ds->bts_buffer_base + max;
356 ds->bts_interrupt_threshold = ds->bts_absolute_maximum - 399 ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
357 thresh * BTS_RECORD_SIZE;
358
359 return 0; 400 return 0;
360} 401}
361 402
362static void release_bts_buffer(int cpu) 403static void release_bts_buffer(int cpu)
363{ 404{
364 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 405 struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
406 struct debug_store *ds = hwev->ds;
407 void *cea;
365 408
366 if (!ds || !x86_pmu.bts) 409 if (!ds || !x86_pmu.bts)
367 return; 410 return;
368 411
369 kfree((void *)(unsigned long)ds->bts_buffer_base); 412 /* Clear the fixmap */
413 cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
414 ds_clear_cea(cea, BTS_BUFFER_SIZE);
370 ds->bts_buffer_base = 0; 415 ds->bts_buffer_base = 0;
416 dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
417 hwev->ds_bts_vaddr = NULL;
371} 418}
372 419
373static int alloc_ds_buffer(int cpu) 420static int alloc_ds_buffer(int cpu)
374{ 421{
375 int node = cpu_to_node(cpu); 422 struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
376 struct debug_store *ds;
377
378 ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
379 if (unlikely(!ds))
380 return -ENOMEM;
381 423
424 memset(ds, 0, sizeof(*ds));
382 per_cpu(cpu_hw_events, cpu).ds = ds; 425 per_cpu(cpu_hw_events, cpu).ds = ds;
383
384 return 0; 426 return 0;
385} 427}
386 428
387static void release_ds_buffer(int cpu) 429static void release_ds_buffer(int cpu)
388{ 430{
389 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
390
391 if (!ds)
392 return;
393
394 per_cpu(cpu_hw_events, cpu).ds = NULL; 431 per_cpu(cpu_hw_events, cpu).ds = NULL;
395 kfree(ds);
396} 432}
397 433
398void release_ds_buffers(void) 434void release_ds_buffers(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f7aaadf9331f..8e4ea143ed96 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,6 +14,8 @@
14 14
15#include <linux/perf_event.h> 15#include <linux/perf_event.h>
16 16
17#include <asm/intel_ds.h>
18
17/* To enable MSR tracing please use the generic trace points. */ 19/* To enable MSR tracing please use the generic trace points. */
18 20
19/* 21/*
@@ -77,8 +79,6 @@ struct amd_nb {
77 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 79 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
78}; 80};
79 81
80/* The maximal number of PEBS events: */
81#define MAX_PEBS_EVENTS 8
82#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) 82#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
83 83
84/* 84/*
@@ -95,23 +95,6 @@ struct amd_nb {
95 PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ 95 PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
96 PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) 96 PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
97 97
98/*
99 * A debug store configuration.
100 *
101 * We only support architectures that use 64bit fields.
102 */
103struct debug_store {
104 u64 bts_buffer_base;
105 u64 bts_index;
106 u64 bts_absolute_maximum;
107 u64 bts_interrupt_threshold;
108 u64 pebs_buffer_base;
109 u64 pebs_index;
110 u64 pebs_absolute_maximum;
111 u64 pebs_interrupt_threshold;
112 u64 pebs_event_reset[MAX_PEBS_EVENTS];
113};
114
115#define PEBS_REGS \ 98#define PEBS_REGS \
116 (PERF_REG_X86_AX | \ 99 (PERF_REG_X86_AX | \
117 PERF_REG_X86_BX | \ 100 PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
216 * Intel DebugStore bits 199 * Intel DebugStore bits
217 */ 200 */
218 struct debug_store *ds; 201 struct debug_store *ds;
202 void *ds_pebs_vaddr;
203 void *ds_bts_vaddr;
219 u64 pebs_enabled; 204 u64 pebs_enabled;
220 int n_pebs; 205 int n_pebs;
221 int n_large_pebs; 206 int n_large_pebs;
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 2fbc69a0916e..4a7884b8dca5 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -5,6 +5,7 @@
5 5
6#include <linux/percpu-defs.h> 6#include <linux/percpu-defs.h>
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/intel_ds.h>
8 9
9/* 10/*
10 * cpu_entry_area is a percpu region that contains things needed by the CPU 11 * cpu_entry_area is a percpu region that contains things needed by the CPU
@@ -40,6 +41,18 @@ struct cpu_entry_area {
40 */ 41 */
41 char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; 42 char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
42#endif 43#endif
44#ifdef CONFIG_CPU_SUP_INTEL
45 /*
46 * Per CPU debug store for Intel performance monitoring. Wastes a
47 * full page at the moment.
48 */
49 struct debug_store cpu_debug_store;
50 /*
51 * The actual PEBS/BTS buffers must be mapped to user space
52 * Reserve enough fixmap PTEs.
53 */
54 struct debug_store_buffers cpu_debug_buffers;
55#endif
43}; 56};
44 57
45#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) 58#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 800104c8a3ed..07cdd1715705 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -197,11 +197,12 @@
197#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ 197#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
198#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ 198#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
199#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ 199#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
200#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
200 201
201#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ 202#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
202#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ 203#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
203#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ 204#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
204 205#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
205#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 206#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
206#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ 207#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
207#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ 208#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
340#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ 341#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
341#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ 342#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
342#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ 343#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
344#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
343 345
344#endif /* _ASM_X86_CPUFEATURES_H */ 346#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index ec8be07c0cda..13c5ee878a47 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
21 21
22 desc->type = (info->read_exec_only ^ 1) << 1; 22 desc->type = (info->read_exec_only ^ 1) << 1;
23 desc->type |= info->contents << 2; 23 desc->type |= info->contents << 2;
24 /* Set the ACCESS bit so it can be mapped RO */
25 desc->type |= 1;
24 26
25 desc->s = 1; 27 desc->s = 1;
26 desc->dpl = 0x3; 28 desc->dpl = 0x3;
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 14d6d5007314..b027633e7300 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -50,6 +50,12 @@
50# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) 50# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
51#endif 51#endif
52 52
53#ifdef CONFIG_PAGE_TABLE_ISOLATION
54# define DISABLE_PTI 0
55#else
56# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
57#endif
58
53/* 59/*
54 * Make sure to add features to the correct mask 60 * Make sure to add features to the correct mask
55 */ 61 */
@@ -60,7 +66,7 @@
60#define DISABLED_MASK4 (DISABLE_PCID) 66#define DISABLED_MASK4 (DISABLE_PCID)
61#define DISABLED_MASK5 0 67#define DISABLED_MASK5 0
62#define DISABLED_MASK6 0 68#define DISABLED_MASK6 0
63#define DISABLED_MASK7 0 69#define DISABLED_MASK7 (DISABLE_PTI)
64#define DISABLED_MASK8 0 70#define DISABLED_MASK8 0
65#define DISABLED_MASK9 (DISABLE_MPX) 71#define DISABLED_MASK9 (DISABLE_MPX)
66#define DISABLED_MASK10 0 72#define DISABLED_MASK10 0
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000000..62a9f4966b42
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,36 @@
1#ifndef _ASM_INTEL_DS_H
2#define _ASM_INTEL_DS_H
3
4#include <linux/percpu-defs.h>
5
6#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
7#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
8
9/* The maximal number of PEBS events: */
10#define MAX_PEBS_EVENTS 8
11
12/*
13 * A debug store configuration.
14 *
15 * We only support architectures that use 64bit fields.
16 */
17struct debug_store {
18 u64 bts_buffer_base;
19 u64 bts_index;
20 u64 bts_absolute_maximum;
21 u64 bts_interrupt_threshold;
22 u64 pebs_buffer_base;
23 u64 pebs_index;
24 u64 pebs_absolute_maximum;
25 u64 pebs_interrupt_threshold;
26 u64 pebs_event_reset[MAX_PEBS_EVENTS];
27} __aligned(PAGE_SIZE);
28
29DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
30
31struct debug_store_buffers {
32 char bts_buffer[BTS_BUFFER_SIZE];
33 char pebs_buffer[PEBS_BUFFER_SIZE];
34};
35
36#endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5ede7cae1d67..c931b88982a0 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -50,10 +50,33 @@ struct ldt_struct {
50 * call gates. On native, we could merge the ldt_struct and LDT 50 * call gates. On native, we could merge the ldt_struct and LDT
51 * allocations, but it's not worth trying to optimize. 51 * allocations, but it's not worth trying to optimize.
52 */ 52 */
53 struct desc_struct *entries; 53 struct desc_struct *entries;
54 unsigned int nr_entries; 54 unsigned int nr_entries;
55
56 /*
57 * If PTI is in use, then the entries array is not mapped while we're
58 * in user mode. The whole array will be aliased at the address
59 * given by ldt_slot_va(slot). We use two slots so that we can allocate
60 * and map, and enable a new LDT without invalidating the mapping
61 * of an older, still-in-use LDT.
62 *
63 * slot will be -1 if this LDT doesn't have an alias mapping.
64 */
65 int slot;
55}; 66};
56 67
68/* This is a multiple of PAGE_SIZE. */
69#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
70
71static inline void *ldt_slot_va(int slot)
72{
73#ifdef CONFIG_X86_64
74 return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
75#else
76 BUG();
77#endif
78}
79
57/* 80/*
58 * Used for LDT copy/destruction. 81 * Used for LDT copy/destruction.
59 */ 82 */
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
64} 87}
65int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); 88int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
66void destroy_context_ldt(struct mm_struct *mm); 89void destroy_context_ldt(struct mm_struct *mm);
90void ldt_arch_exit_mmap(struct mm_struct *mm);
67#else /* CONFIG_MODIFY_LDT_SYSCALL */ 91#else /* CONFIG_MODIFY_LDT_SYSCALL */
68static inline void init_new_context_ldt(struct mm_struct *mm) { } 92static inline void init_new_context_ldt(struct mm_struct *mm) { }
69static inline int ldt_dup_context(struct mm_struct *oldmm, 93static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
71{ 95{
72 return 0; 96 return 0;
73} 97}
74static inline void destroy_context_ldt(struct mm_struct *mm) {} 98static inline void destroy_context_ldt(struct mm_struct *mm) { }
99static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
75#endif 100#endif
76 101
77static inline void load_mm_ldt(struct mm_struct *mm) 102static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
96 * that we can see. 121 * that we can see.
97 */ 122 */
98 123
99 if (unlikely(ldt)) 124 if (unlikely(ldt)) {
100 set_ldt(ldt->entries, ldt->nr_entries); 125 if (static_cpu_has(X86_FEATURE_PTI)) {
101 else 126 if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
127 /*
128 * Whoops -- either the new LDT isn't mapped
129 * (if slot == -1) or is mapped into a bogus
130 * slot (if slot > 1).
131 */
132 clear_LDT();
133 return;
134 }
135
136 /*
137 * If page table isolation is enabled, ldt->entries
138 * will not be mapped in the userspace pagetables.
139 * Tell the CPU to access the LDT through the alias
140 * at ldt_slot_va(ldt->slot).
141 */
142 set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
143 } else {
144 set_ldt(ldt->entries, ldt->nr_entries);
145 }
146 } else {
102 clear_LDT(); 147 clear_LDT();
148 }
103#else 149#else
104 clear_LDT(); 150 clear_LDT();
105#endif 151#endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
194static inline void arch_exit_mmap(struct mm_struct *mm) 240static inline void arch_exit_mmap(struct mm_struct *mm)
195{ 241{
196 paravirt_arch_exit_mmap(mm); 242 paravirt_arch_exit_mmap(mm);
243 ldt_arch_exit_mmap(mm);
197} 244}
198 245
199#ifdef CONFIG_X86_64 246#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 4b5e1eafada7..aff42e1da6ee 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
30 */ 30 */
31extern gfp_t __userpte_alloc_gfp; 31extern gfp_t __userpte_alloc_gfp;
32 32
33#ifdef CONFIG_PAGE_TABLE_ISOLATION
34/*
35 * Instead of one PGD, we acquire two PGDs. Being order-1, it is
36 * both 8k in size and 8k-aligned. That lets us just flip bit 12
37 * in a pointer to swap between the two 4k halves.
38 */
39#define PGD_ALLOCATION_ORDER 1
40#else
41#define PGD_ALLOCATION_ORDER 0
42#endif
43
33/* 44/*
34 * Allocate and free page tables. 45 * Allocate and free page tables.
35 */ 46 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 95e2dfd75521..e42b8943cb1a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
28int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); 28int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
29 29
30void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); 30void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
31void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
31void ptdump_walk_pgd_level_checkwx(void); 32void ptdump_walk_pgd_level_checkwx(void);
32 33
33#ifdef CONFIG_DEBUG_WX 34#ifdef CONFIG_DEBUG_WX
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
841 842
842static inline int p4d_bad(p4d_t p4d) 843static inline int p4d_bad(p4d_t p4d)
843{ 844{
844 return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; 845 unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
846
847 if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
848 ignore_flags |= _PAGE_NX;
849
850 return (p4d_flags(p4d) & ~ignore_flags) != 0;
845} 851}
846#endif /* CONFIG_PGTABLE_LEVELS > 3 */ 852#endif /* CONFIG_PGTABLE_LEVELS > 3 */
847 853
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
875 881
876static inline int pgd_bad(pgd_t pgd) 882static inline int pgd_bad(pgd_t pgd)
877{ 883{
878 return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; 884 unsigned long ignore_flags = _PAGE_USER;
885
886 if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
887 ignore_flags |= _PAGE_NX;
888
889 return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
879} 890}
880 891
881static inline int pgd_none(pgd_t pgd) 892static inline int pgd_none(pgd_t pgd)
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
904 * pgd_offset() returns a (pgd_t *) 915 * pgd_offset() returns a (pgd_t *)
905 * pgd_index() is used get the offset into the pgd page's array of pgd_t's; 916 * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
906 */ 917 */
907#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) 918#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
919/*
920 * a shortcut to get a pgd_t in a given mm
921 */
922#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
908/* 923/*
909 * a shortcut which implies the use of the kernel's pgd, instead 924 * a shortcut which implies the use of the kernel's pgd, instead
910 * of a process's 925 * of a process's
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
1106 */ 1121 */
1107static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) 1122static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
1108{ 1123{
1109 memcpy(dst, src, count * sizeof(pgd_t)); 1124 memcpy(dst, src, count * sizeof(pgd_t));
1125#ifdef CONFIG_PAGE_TABLE_ISOLATION
1126 if (!static_cpu_has(X86_FEATURE_PTI))
1127 return;
1128 /* Clone the user space pgd as well */
1129 memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
1130 count * sizeof(pgd_t));
1131#endif
1110} 1132}
1111 1133
1112#define PTE_SHIFT ilog2(PTRS_PER_PTE) 1134#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e9f05331e732..81462e9a34f6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
131#endif 131#endif
132} 132}
133 133
134#ifdef CONFIG_PAGE_TABLE_ISOLATION
135/*
136 * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
137 * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
138 * the user one is in the last 4k. To switch between them, you
139 * just need to flip the 12th bit in their addresses.
140 */
141#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
142
143/*
144 * This generates better code than the inline assembly in
145 * __set_bit().
146 */
147static inline void *ptr_set_bit(void *ptr, int bit)
148{
149 unsigned long __ptr = (unsigned long)ptr;
150
151 __ptr |= BIT(bit);
152 return (void *)__ptr;
153}
154static inline void *ptr_clear_bit(void *ptr, int bit)
155{
156 unsigned long __ptr = (unsigned long)ptr;
157
158 __ptr &= ~BIT(bit);
159 return (void *)__ptr;
160}
161
162static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
163{
164 return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
165}
166
167static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
168{
169 return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
170}
171
172static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
173{
174 return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
175}
176
177static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
178{
179 return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
180}
181#endif /* CONFIG_PAGE_TABLE_ISOLATION */
182
183/*
184 * Page table pages are page-aligned. The lower half of the top
185 * level is used for userspace and the top half for the kernel.
186 *
187 * Returns true for parts of the PGD that map userspace and
188 * false for the parts that map the kernel.
189 */
190static inline bool pgdp_maps_userspace(void *__ptr)
191{
192 unsigned long ptr = (unsigned long)__ptr;
193
194 return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
195}
196
197#ifdef CONFIG_PAGE_TABLE_ISOLATION
198pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
199
200/*
201 * Take a PGD location (pgdp) and a pgd value that needs to be set there.
202 * Populates the user and returns the resulting PGD that must be set in
203 * the kernel copy of the page tables.
204 */
205static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
206{
207 if (!static_cpu_has(X86_FEATURE_PTI))
208 return pgd;
209 return __pti_set_user_pgd(pgdp, pgd);
210}
211#else
212static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
213{
214 return pgd;
215}
216#endif
217
134static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) 218static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
135{ 219{
220#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
221 p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
222#else
136 *p4dp = p4d; 223 *p4dp = p4d;
224#endif
137} 225}
138 226
139static inline void native_p4d_clear(p4d_t *p4d) 227static inline void native_p4d_clear(p4d_t *p4d)
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
147 235
148static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) 236static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
149{ 237{
238#ifdef CONFIG_PAGE_TABLE_ISOLATION
239 *pgdp = pti_set_user_pgd(pgdp, pgd);
240#else
150 *pgdp = pgd; 241 *pgdp = pgd;
242#endif
151} 243}
152 244
153static inline void native_pgd_clear(pgd_t *pgd) 245static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3d27831bc58d..b97a539bcdee 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
79#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 79#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
80 80
81#ifdef CONFIG_X86_5LEVEL 81#ifdef CONFIG_X86_5LEVEL
82# define VMALLOC_SIZE_TB _AC(16384, UL) 82# define VMALLOC_SIZE_TB _AC(12800, UL)
83# define __VMALLOC_BASE _AC(0xff92000000000000, UL) 83# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
84# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) 84# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
85# define LDT_PGD_ENTRY _AC(-112, UL)
86# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
85#else 87#else
86# define VMALLOC_SIZE_TB _AC(32, UL) 88# define VMALLOC_SIZE_TB _AC(32, UL)
87# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) 89# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
88# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) 90# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
91# define LDT_PGD_ENTRY _AC(-4, UL)
92# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
89#endif 93#endif
90 94
91#ifdef CONFIG_RANDOMIZE_MEMORY 95#ifdef CONFIG_RANDOMIZE_MEMORY
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 43212a43ee69..6a60fea90b9d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -38,6 +38,11 @@
38#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) 38#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
39#define CR3_PCID_MASK 0xFFFull 39#define CR3_PCID_MASK 0xFFFull
40#define CR3_NOFLUSH BIT_ULL(63) 40#define CR3_NOFLUSH BIT_ULL(63)
41
42#ifdef CONFIG_PAGE_TABLE_ISOLATION
43# define X86_CR3_PTI_SWITCH_BIT 11
44#endif
45
41#else 46#else
42/* 47/*
43 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save 48 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cad8dab266bc..d3a67fba200a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -852,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x)
852 852
853#else 853#else
854/* 854/*
855 * User space process size. 47bits minus one guard page. The guard 855 * User space process size. This is the first address outside the user range.
856 * page is necessary on Intel CPUs: if a SYSCALL instruction is at 856 * There are a few constraints that determine this:
857 * the highest possible canonical userspace address, then that 857 *
858 * syscall will enter the kernel with a non-canonical return 858 * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
859 * address, and SYSRET will explode dangerously. We avoid this 859 * address, then that syscall will enter the kernel with a
860 * particular problem by preventing anything from being mapped 860 * non-canonical return address, and SYSRET will explode dangerously.
861 * at the maximum canonical address. 861 * We avoid this particular problem by preventing anything executable
862 * from being mapped at the maximum canonical address.
863 *
864 * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
865 * CPUs malfunction if they execute code from the highest canonical page.
866 * They'll speculate right off the end of the canonical space, and
867 * bad things happen. This is worked around in the same way as the
868 * Intel problem.
869 *
870 * With page table isolation enabled, we map the LDT in ... [stay tuned]
862 */ 871 */
863#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 872#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
864 873
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
new file mode 100644
index 000000000000..0b5ef05b2d2d
--- /dev/null
+++ b/arch/x86/include/asm/pti.h
@@ -0,0 +1,14 @@
1// SPDX-License-Identifier: GPL-2.0
2#ifndef _ASM_X86_PTI_H
3#define _ASM_X86_PTI_H
4#ifndef __ASSEMBLY__
5
6#ifdef CONFIG_PAGE_TABLE_ISOLATION
7extern void pti_init(void);
8extern void pti_check_boottime_disable(void);
9#else
10static inline void pti_check_boottime_disable(void) { }
11#endif
12
13#endif /* __ASSEMBLY__ */
14#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e1884cf35257..f68f9c836cca 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,38 +10,90 @@
10#include <asm/special_insns.h> 10#include <asm/special_insns.h>
11#include <asm/smp.h> 11#include <asm/smp.h>
12#include <asm/invpcid.h> 12#include <asm/invpcid.h>
13#include <asm/pti.h>
14#include <asm/processor-flags.h>
13 15
14static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) 16/*
15{ 17 * The x86 feature is called PCID (Process Context IDentifier). It is similar
16 /* 18 * to what is traditionally called ASID on the RISC processors.
17 * Bump the generation count. This also serves as a full barrier 19 *
18 * that synchronizes with switch_mm(): callers are required to order 20 * We don't use the traditional ASID implementation, where each process/mm gets
19 * their read of mm_cpumask after their writes to the paging 21 * its own ASID and flush/restart when we run out of ASID space.
20 * structures. 22 *
21 */ 23 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
22 return atomic64_inc_return(&mm->context.tlb_gen); 24 * that came by on this CPU, allowing cheaper switch_mm between processes on
23} 25 * this CPU.
26 *
27 * We end up with different spaces for different things. To avoid confusion we
28 * use different names for each of them:
29 *
30 * ASID - [0, TLB_NR_DYN_ASIDS-1]
31 * the canonical identifier for an mm
32 *
33 * kPCID - [1, TLB_NR_DYN_ASIDS]
34 * the value we write into the PCID part of CR3; corresponds to the
35 * ASID+1, because PCID 0 is special.
36 *
37 * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
38 * for KPTI each mm has two address spaces and thus needs two
39 * PCID values, but we can still do with a single ASID denomination
40 * for each mm. Corresponds to kPCID + 2048.
41 *
42 */
24 43
25/* There are 12 bits of space for ASIDS in CR3 */ 44/* There are 12 bits of space for ASIDS in CR3 */
26#define CR3_HW_ASID_BITS 12 45#define CR3_HW_ASID_BITS 12
46
27/* 47/*
28 * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for 48 * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
29 * user/kernel switches 49 * user/kernel switches
30 */ 50 */
31#define PTI_CONSUMED_ASID_BITS 0 51#ifdef CONFIG_PAGE_TABLE_ISOLATION
52# define PTI_CONSUMED_PCID_BITS 1
53#else
54# define PTI_CONSUMED_PCID_BITS 0
55#endif
56
57#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
32 58
33#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
34/* 59/*
35 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account 60 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
36 * for them being zero-based. Another -1 is because ASID 0 is reserved for 61 * for them being zero-based. Another -1 is because PCID 0 is reserved for
37 * use by non-PCID-aware users. 62 * use by non-PCID-aware users.
38 */ 63 */
39#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) 64#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
40 65
66/*
67 * 6 because 6 should be plenty and struct tlb_state will fit in two cache
68 * lines.
69 */
70#define TLB_NR_DYN_ASIDS 6
71
72/*
73 * Given @asid, compute kPCID
74 */
41static inline u16 kern_pcid(u16 asid) 75static inline u16 kern_pcid(u16 asid)
42{ 76{
43 VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); 77 VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
78
79#ifdef CONFIG_PAGE_TABLE_ISOLATION
80 /*
 81	 * Make sure that the dynamic ASID space does not conflict with the
82 * bit we are using to switch between user and kernel ASIDs.
83 */
84 BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
85
86 /*
87 * The ASID being passed in here should have respected the
88 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
89 */
90 VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
91#endif
44 /* 92 /*
93 * The dynamically-assigned ASIDs that get passed in are small
94 * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
95 * so do not bother to clear it.
96 *
45 * If PCID is on, ASID-aware code paths put the ASID+1 into the 97 * If PCID is on, ASID-aware code paths put the ASID+1 into the
46 * PCID bits. This serves two purposes. It prevents a nasty 98 * PCID bits. This serves two purposes. It prevents a nasty
47 * situation in which PCID-unaware code saves CR3, loads some other 99 * situation in which PCID-unaware code saves CR3, loads some other
@@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
53 return asid + 1; 105 return asid + 1;
54} 106}
55 107
108/*
109 * Given @asid, compute uPCID
110 */
111static inline u16 user_pcid(u16 asid)
112{
113 u16 ret = kern_pcid(asid);
114#ifdef CONFIG_PAGE_TABLE_ISOLATION
115 ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
116#endif
117 return ret;
118}
119
56struct pgd_t; 120struct pgd_t;
57static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) 121static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
58{ 122{
@@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
95 return !static_cpu_has(X86_FEATURE_PCID); 159 return !static_cpu_has(X86_FEATURE_PCID);
96} 160}
97 161
98/*
99 * 6 because 6 should be plenty and struct tlb_state will fit in
100 * two cache lines.
101 */
102#define TLB_NR_DYN_ASIDS 6
103
104struct tlb_context { 162struct tlb_context {
105 u64 ctx_id; 163 u64 ctx_id;
106 u64 tlb_gen; 164 u64 tlb_gen;
@@ -135,6 +193,24 @@ struct tlb_state {
135 bool is_lazy; 193 bool is_lazy;
136 194
137 /* 195 /*
196 * If set we changed the page tables in such a way that we
197 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
198 * This tells us to go invalidate all the non-loaded ctxs[]
199 * on the next context switch.
200 *
201 * The current ctx was kept up-to-date as it ran and does not
202 * need to be invalidated.
203 */
204 bool invalidate_other;
205
206 /*
207 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
208 * the corresponding user PCID needs a flush next time we
209 * switch to it; see SWITCH_TO_USER_CR3.
210 */
211 unsigned short user_pcid_flush_mask;
212
213 /*
138 * Access to this CR4 shadow and to H/W CR4 is protected by 214 * Access to this CR4 shadow and to H/W CR4 is protected by
139 * disabling interrupts when modifying either one. 215 * disabling interrupts when modifying either one.
140 */ 216 */
@@ -215,6 +291,14 @@ static inline unsigned long cr4_read_shadow(void)
215} 291}
216 292
217/* 293/*
 294 * Mark all other ASIDs as invalid, preserving the current one.
295 */
296static inline void invalidate_other_asid(void)
297{
298 this_cpu_write(cpu_tlbstate.invalidate_other, true);
299}
300
301/*
218 * Save some of cr4 feature set we're using (e.g. Pentium 4MB 302 * Save some of cr4 feature set we're using (e.g. Pentium 4MB
219 * enable and PPro Global page enable), so that any CPU's that boot 303 * enable and PPro Global page enable), so that any CPU's that boot
220 * up after us can get the correct flags. This should only be used 304 * up after us can get the correct flags. This should only be used
@@ -234,14 +318,41 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
234extern void initialize_tlbstate_and_flush(void); 318extern void initialize_tlbstate_and_flush(void);
235 319
236/* 320/*
321 * Given an ASID, flush the corresponding user ASID. We can delay this
322 * until the next time we switch to it.
323 *
324 * See SWITCH_TO_USER_CR3.
325 */
326static inline void invalidate_user_asid(u16 asid)
327{
328 /* There is no user ASID if address space separation is off */
329 if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
330 return;
331
332 /*
333 * We only have a single ASID if PCID is off and the CR3
334 * write will have flushed it.
335 */
336 if (!cpu_feature_enabled(X86_FEATURE_PCID))
337 return;
338
339 if (!static_cpu_has(X86_FEATURE_PTI))
340 return;
341
342 __set_bit(kern_pcid(asid),
343 (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
344}
345
346/*
237 * flush the entire current user mapping 347 * flush the entire current user mapping
238 */ 348 */
239static inline void __native_flush_tlb(void) 349static inline void __native_flush_tlb(void)
240{ 350{
351 invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
241 /* 352 /*
242 * If current->mm == NULL then we borrow a mm which may change during a 353 * If current->mm == NULL then we borrow a mm which may change
243 * task switch and therefore we must not be preempted while we write CR3 354 * during a task switch and therefore we must not be preempted
244 * back: 355 * while we write CR3 back:
245 */ 356 */
246 preempt_disable(); 357 preempt_disable();
247 native_write_cr3(__native_read_cr3()); 358 native_write_cr3(__native_read_cr3());
@@ -259,6 +370,8 @@ static inline void __native_flush_tlb_global(void)
259 /* 370 /*
260 * Using INVPCID is considerably faster than a pair of writes 371 * Using INVPCID is considerably faster than a pair of writes
261 * to CR4 sandwiched inside an IRQ flag save/restore. 372 * to CR4 sandwiched inside an IRQ flag save/restore.
373 *
374 * Note, this works with CR4.PCIDE=0 or 1.
262 */ 375 */
263 invpcid_flush_all(); 376 invpcid_flush_all();
264 return; 377 return;
@@ -285,7 +398,21 @@ static inline void __native_flush_tlb_global(void)
285 */ 398 */
286static inline void __native_flush_tlb_single(unsigned long addr) 399static inline void __native_flush_tlb_single(unsigned long addr)
287{ 400{
401 u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
402
288 asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); 403 asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
404
405 if (!static_cpu_has(X86_FEATURE_PTI))
406 return;
407
408 /*
409 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
410 * Just use invalidate_user_asid() in case we are called early.
411 */
412 if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
413 invalidate_user_asid(loaded_mm_asid);
414 else
415 invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
289} 416}
290 417
291/* 418/*
@@ -301,14 +428,6 @@ static inline void __flush_tlb_all(void)
301 */ 428 */
302 __flush_tlb(); 429 __flush_tlb();
303 } 430 }
304
305 /*
306 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
307 * we'd end up flushing kernel translations for the current ASID but
308 * we might fail to flush kernel translations for other cached ASIDs.
309 *
310 * To avoid this issue, we force PCID off if PGE is off.
311 */
312} 431}
313 432
314/* 433/*
@@ -318,6 +437,16 @@ static inline void __flush_tlb_one(unsigned long addr)
318{ 437{
319 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); 438 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
320 __flush_tlb_single(addr); 439 __flush_tlb_single(addr);
440
441 if (!static_cpu_has(X86_FEATURE_PTI))
442 return;
443
444 /*
445 * __flush_tlb_single() will have cleared the TLB entry for this ASID,
446 * but since kernel space is replicated across all, we must also
447 * invalidate all others.
448 */
449 invalidate_other_asid();
321} 450}
322 451
323#define TLB_FLUSH_ALL -1UL 452#define TLB_FLUSH_ALL -1UL
@@ -378,6 +507,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
378void native_flush_tlb_others(const struct cpumask *cpumask, 507void native_flush_tlb_others(const struct cpumask *cpumask,
379 const struct flush_tlb_info *info); 508 const struct flush_tlb_info *info);
380 509
510static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
511{
512 /*
513 * Bump the generation count. This also serves as a full barrier
514 * that synchronizes with switch_mm(): callers are required to order
515 * their read of mm_cpumask after their writes to the paging
516 * structures.
517 */
518 return atomic64_inc_return(&mm->context.tlb_gen);
519}
520
381static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, 521static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
382 struct mm_struct *mm) 522 struct mm_struct *mm)
383{ 523{
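
To make the ASID/kPCID/uPCID naming above concrete, here is a small stand-alone sketch of the arithmetic; the constants mirror the ones added in this series, but the program itself is only an illustration:

#include <stdio.h>

#define X86_CR3_PTI_SWITCH_BIT	11		/* as added to processor-flags.h */
#define CR3_NOFLUSH		(1ULL << 63)	/* X86_CR3_PCID_NOFLUSH */

/* kPCID: ASID + 1, because PCID 0 is reserved for non-PCID-aware code. */
static unsigned short kern_pcid(unsigned short asid)
{
	return asid + 1;
}

/* uPCID: same as kPCID but with bit 11 set, i.e. kPCID + 2048. */
static unsigned short user_pcid(unsigned short asid)
{
	return kern_pcid(asid) | (1 << X86_CR3_PTI_SWITCH_BIT);
}

int main(void)
{
	unsigned short asid;

	for (asid = 0; asid < 6; asid++)	/* TLB_NR_DYN_ASIDS == 6 */
		printf("ASID %u -> kPCID %u, uPCID %u\n",
		       asid, kern_pcid(asid), user_pcid(asid));

	/* A "no flush" CR3 value additionally sets bit 63: */
	printf("noflush bit = %#llx\n", CR3_NOFLUSH);
	return 0;
}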
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d9a7c659009c..b986b2ca688a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -7,6 +7,7 @@
7 7
8#ifdef CONFIG_X86_VSYSCALL_EMULATION 8#ifdef CONFIG_X86_VSYSCALL_EMULATION
9extern void map_vsyscall(void); 9extern void map_vsyscall(void);
10extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
10 11
11/* 12/*
12 * Called on instruction fetch fault in vsyscall page. 13 * Called on instruction fetch fault in vsyscall page.
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 7e1e730396ae..bcba3c643e63 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -78,7 +78,12 @@
78#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) 78#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
79#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ 79#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
80#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) 80#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
81#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ 81
82#define X86_CR3_PCID_BITS 12
83#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
84
85#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
86#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
82 87
83/* 88/*
84 * Intel CPU features in CR4 89 * Intel CPU features in CR4
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 676b7cf4b62b..76417a9aab73 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,6 +17,7 @@
17#include <asm/sigframe.h> 17#include <asm/sigframe.h>
18#include <asm/bootparam.h> 18#include <asm/bootparam.h>
19#include <asm/suspend.h> 19#include <asm/suspend.h>
20#include <asm/tlbflush.h>
20 21
21#ifdef CONFIG_XEN 22#ifdef CONFIG_XEN
22#include <xen/interface/xen.h> 23#include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
94 BLANK(); 95 BLANK();
95 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); 96 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
96 97
98 /* TLB state for the entry code */
99 OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
100
97 /* Layout info for cpu_entry_area */ 101 /* Layout info for cpu_entry_area */
98 OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); 102 OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
99 OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); 103 OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c9757f07d738..c47de4ebf63a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -922,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
922 } 922 }
923 923
924 setup_force_cpu_cap(X86_FEATURE_ALWAYS); 924 setup_force_cpu_cap(X86_FEATURE_ALWAYS);
925
926 /* Assume for now that ALL x86 CPUs are insecure */
927 setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
928
925 fpu__init_system(c); 929 fpu__init_system(c);
926 930
927#ifdef CONFIG_X86_32 931#ifdef CONFIG_X86_32
@@ -1360,7 +1364,10 @@ void syscall_init(void)
1360 (entry_SYSCALL_64_trampoline - _entry_trampoline); 1364 (entry_SYSCALL_64_trampoline - _entry_trampoline);
1361 1365
1362 wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); 1366 wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
1363 wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); 1367 if (static_cpu_has(X86_FEATURE_PTI))
1368 wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
1369 else
1370 wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
1364 1371
1365#ifdef CONFIG_IA32_EMULATION 1372#ifdef CONFIG_IA32_EMULATION
1366 wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); 1373 wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 36b17e0febe8..5fa110699ed2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
297 unsigned long sp; 297 unsigned long sp;
298#endif 298#endif
299 printk(KERN_DEFAULT 299 printk(KERN_DEFAULT
300 "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, 300 "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
301 IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", 301 IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
302 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", 302 IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
303 debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", 303 debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
304 IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); 304 IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
305 IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
306 (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
305 307
306 if (notify_die(DIE_OOPS, str, regs, err, 308 if (notify_die(DIE_OOPS, str, regs, err,
307 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) 309 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7dca675fe78d..04a625f0fcda 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
341 .balign PAGE_SIZE; \ 341 .balign PAGE_SIZE; \
342GLOBAL(name) 342GLOBAL(name)
343 343
344#ifdef CONFIG_PAGE_TABLE_ISOLATION
345/*
346 * Each PGD needs to be 8k long and 8k aligned. We do not
347 * ever go out to userspace with these, so we do not
348 * strictly *need* the second page, but this allows us to
349 * have a single set_pgd() implementation that does not
350 * need to worry about whether it has 4k or 8k to work
351 * with.
352 *
353 * This ensures PGDs are 8k long:
354 */
355#define PTI_USER_PGD_FILL 512
356/* This ensures they are 8k-aligned: */
357#define NEXT_PGD_PAGE(name) \
358 .balign 2 * PAGE_SIZE; \
359GLOBAL(name)
360#else
361#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
362#define PTI_USER_PGD_FILL 0
363#endif
364
344/* Automate the creation of 1 to 1 mapping pmd entries */ 365/* Automate the creation of 1 to 1 mapping pmd entries */
345#define PMDS(START, PERM, COUNT) \ 366#define PMDS(START, PERM, COUNT) \
346 i = 0 ; \ 367 i = 0 ; \
@@ -350,13 +371,14 @@ GLOBAL(name)
350 .endr 371 .endr
351 372
352 __INITDATA 373 __INITDATA
353NEXT_PAGE(early_top_pgt) 374NEXT_PGD_PAGE(early_top_pgt)
354 .fill 511,8,0 375 .fill 511,8,0
355#ifdef CONFIG_X86_5LEVEL 376#ifdef CONFIG_X86_5LEVEL
356 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 377 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
357#else 378#else
358 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 379 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
359#endif 380#endif
381 .fill PTI_USER_PGD_FILL,8,0
360 382
361NEXT_PAGE(early_dynamic_pgts) 383NEXT_PAGE(early_dynamic_pgts)
362 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 384 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
364 .data 386 .data
365 387
366#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) 388#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
367NEXT_PAGE(init_top_pgt) 389NEXT_PGD_PAGE(init_top_pgt)
368 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 390 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
369 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 391 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
370 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 392 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
371 .org init_top_pgt + PGD_START_KERNEL*8, 0 393 .org init_top_pgt + PGD_START_KERNEL*8, 0
372 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 394 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
373 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 395 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
396 .fill PTI_USER_PGD_FILL,8,0
374 397
375NEXT_PAGE(level3_ident_pgt) 398NEXT_PAGE(level3_ident_pgt)
376 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 399 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
381 */ 404 */
382 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) 405 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
383#else 406#else
384NEXT_PAGE(init_top_pgt) 407NEXT_PGD_PAGE(init_top_pgt)
385 .fill 512,8,0 408 .fill 512,8,0
409 .fill PTI_USER_PGD_FILL,8,0
386#endif 410#endif
387 411
388#ifdef CONFIG_X86_5LEVEL 412#ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a6b5d62f45a7..579cc4a66fdf 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -24,6 +24,7 @@
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25 25
26#include <asm/ldt.h> 26#include <asm/ldt.h>
27#include <asm/tlb.h>
27#include <asm/desc.h> 28#include <asm/desc.h>
28#include <asm/mmu_context.h> 29#include <asm/mmu_context.h>
29#include <asm/syscalls.h> 30#include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
51static void flush_ldt(void *__mm) 52static void flush_ldt(void *__mm)
52{ 53{
53 struct mm_struct *mm = __mm; 54 struct mm_struct *mm = __mm;
54 mm_context_t *pc;
55 55
56 if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) 56 if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
57 return; 57 return;
58 58
59 pc = &mm->context; 59 load_mm_ldt(mm);
60 set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
61 60
62 refresh_ldt_segments(); 61 refresh_ldt_segments();
63} 62}
@@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
94 return NULL; 93 return NULL;
95 } 94 }
96 95
96 /* The new LDT isn't aliased for PTI yet. */
97 new_ldt->slot = -1;
98
97 new_ldt->nr_entries = num_entries; 99 new_ldt->nr_entries = num_entries;
98 return new_ldt; 100 return new_ldt;
99} 101}
100 102
103/*
104 * If PTI is enabled, this maps the LDT into the kernelmode and
105 * usermode tables for the given mm.
106 *
107 * There is no corresponding unmap function. Even if the LDT is freed, we
108 * leave the PTEs around until the slot is reused or the mm is destroyed.
109 * This is harmless: the LDT is always in ordinary memory, and no one will
110 * access the freed slot.
111 *
112 * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
113 * it useful, and the flush would slow down modify_ldt().
114 */
115static int
116map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
117{
118#ifdef CONFIG_PAGE_TABLE_ISOLATION
119 bool is_vmalloc, had_top_level_entry;
120 unsigned long va;
121 spinlock_t *ptl;
122 pgd_t *pgd;
123 int i;
124
125 if (!static_cpu_has(X86_FEATURE_PTI))
126 return 0;
127
128 /*
129 * Any given ldt_struct should have map_ldt_struct() called at most
130 * once.
131 */
132 WARN_ON(ldt->slot != -1);
133
134 /*
135 * Did we already have the top level entry allocated? We can't
 136	 * use pgd_none() for this because it doesn't do anything on
137 * 4-level page table kernels.
138 */
139 pgd = pgd_offset(mm, LDT_BASE_ADDR);
140 had_top_level_entry = (pgd->pgd != 0);
141
142 is_vmalloc = is_vmalloc_addr(ldt->entries);
143
144 for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
145 unsigned long offset = i << PAGE_SHIFT;
146 const void *src = (char *)ldt->entries + offset;
147 unsigned long pfn;
148 pte_t pte, *ptep;
149
150 va = (unsigned long)ldt_slot_va(slot) + offset;
151 pfn = is_vmalloc ? vmalloc_to_pfn(src) :
152 page_to_pfn(virt_to_page(src));
153 /*
154 * Treat the PTI LDT range as a *userspace* range.
155 * get_locked_pte() will allocate all needed pagetables
156 * and account for them in this mm.
157 */
158 ptep = get_locked_pte(mm, va, &ptl);
159 if (!ptep)
160 return -ENOMEM;
161 /*
 162	 * Map it RO so the easy-to-find address is not a primary
163 * target via some kernel interface which misses a
164 * permission check.
165 */
166 pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
167 set_pte_at(mm, va, ptep, pte);
168 pte_unmap_unlock(ptep, ptl);
169 }
170
171 if (mm->context.ldt) {
172 /*
173 * We already had an LDT. The top-level entry should already
174 * have been allocated and synchronized with the usermode
175 * tables.
176 */
177 WARN_ON(!had_top_level_entry);
178 if (static_cpu_has(X86_FEATURE_PTI))
179 WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
180 } else {
181 /*
182 * This is the first time we're mapping an LDT for this process.
183 * Sync the pgd to the usermode tables.
184 */
185 WARN_ON(had_top_level_entry);
186 if (static_cpu_has(X86_FEATURE_PTI)) {
187 WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
188 set_pgd(kernel_to_user_pgdp(pgd), *pgd);
189 }
190 }
191
192 va = (unsigned long)ldt_slot_va(slot);
193 flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
194
195 ldt->slot = slot;
196#endif
197 return 0;
198}
199
200static void free_ldt_pgtables(struct mm_struct *mm)
201{
202#ifdef CONFIG_PAGE_TABLE_ISOLATION
203 struct mmu_gather tlb;
204 unsigned long start = LDT_BASE_ADDR;
205 unsigned long end = start + (1UL << PGDIR_SHIFT);
206
207 if (!static_cpu_has(X86_FEATURE_PTI))
208 return;
209
210 tlb_gather_mmu(&tlb, mm, start, end);
211 free_pgd_range(&tlb, start, end, start, end);
212 tlb_finish_mmu(&tlb, start, end);
213#endif
214}
215
101/* After calling this, the LDT is immutable. */ 216/* After calling this, the LDT is immutable. */
102static void finalize_ldt_struct(struct ldt_struct *ldt) 217static void finalize_ldt_struct(struct ldt_struct *ldt)
103{ 218{
@@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
156 new_ldt->nr_entries * LDT_ENTRY_SIZE); 271 new_ldt->nr_entries * LDT_ENTRY_SIZE);
157 finalize_ldt_struct(new_ldt); 272 finalize_ldt_struct(new_ldt);
158 273
274 retval = map_ldt_struct(mm, new_ldt, 0);
275 if (retval) {
276 free_ldt_pgtables(mm);
277 free_ldt_struct(new_ldt);
278 goto out_unlock;
279 }
159 mm->context.ldt = new_ldt; 280 mm->context.ldt = new_ldt;
160 281
161out_unlock: 282out_unlock:
@@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
174 mm->context.ldt = NULL; 295 mm->context.ldt = NULL;
175} 296}
176 297
298void ldt_arch_exit_mmap(struct mm_struct *mm)
299{
300 free_ldt_pgtables(mm);
301}
302
177static int read_ldt(void __user *ptr, unsigned long bytecount) 303static int read_ldt(void __user *ptr, unsigned long bytecount)
178{ 304{
179 struct mm_struct *mm = current->mm; 305 struct mm_struct *mm = current->mm;
@@ -287,6 +413,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
287 new_ldt->entries[ldt_info.entry_number] = ldt; 413 new_ldt->entries[ldt_info.entry_number] = ldt;
288 finalize_ldt_struct(new_ldt); 414 finalize_ldt_struct(new_ldt);
289 415
416 /*
417 * If we are using PTI, map the new LDT into the userspace pagetables.
418 * If there is already an LDT, use the other slot so that other CPUs
419 * will continue to use the old LDT until install_ldt() switches
420 * them over to the new LDT.
421 */
422 error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
423 if (error) {
424 free_ldt_struct(old_ldt);
425 goto out_unlock;
426 }
427
290 install_ldt(mm, new_ldt); 428 install_ldt(mm, new_ldt);
291 free_ldt_struct(old_ldt); 429 free_ldt_struct(old_ldt);
292 error = 0; 430 error = 0;
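
A rough model of the two-slot scheme used by write_ldt() above: the first LDT for an mm goes into slot 0, and every later LDT goes into the slot the previous one is not using, so other CPUs keep running on the old mapping until install_ldt() switches them over. The struct and helper below are illustrative stand-ins, not kernel code:

#include <stdio.h>

struct ldt_model {
	int slot;	/* -1 means "not mapped for PTI yet" */
};

/* Mirrors the expression: old_ldt ? !old_ldt->slot : 0 */
static int pick_slot(const struct ldt_model *old_ldt)
{
	return old_ldt ? !old_ldt->slot : 0;
}

int main(void)
{
	struct ldt_model first  = { .slot = -1 };
	struct ldt_model second = { .slot = -1 };
	struct ldt_model third  = { .slot = -1 };

	first.slot  = pick_slot(NULL);		/* no old LDT -> slot 0 */
	second.slot = pick_slot(&first);	/* old in 0   -> slot 1 */
	third.slot  = pick_slot(&second);	/* old in 1   -> slot 0 */

	printf("%d %d %d\n", first.slot, second.slot, third.slot);	/* 0 1 0 */
	return 0;
}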
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9a9c9b076955..a5b802a12212 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
93 cpu = get_cpu(); 93 cpu = get_cpu();
94 94
95 while (n-- > 0) { 95 while (n-- > 0) {
96 if (LDT_empty(info) || LDT_zero(info)) { 96 if (LDT_empty(info) || LDT_zero(info))
97 memset(desc, 0, sizeof(*desc)); 97 memset(desc, 0, sizeof(*desc));
98 } else { 98 else
99 fill_ldt(desc, info); 99 fill_ldt(desc, info);
100
101 /*
102 * Always set the accessed bit so that the CPU
103 * doesn't try to write to the (read-only) GDT.
104 */
105 desc->type |= 1;
106 }
107 ++info; 100 ++info;
108 ++desc; 101 ++desc;
109 } 102 }
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d2a8b5a24a44..1e413a9326aa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
61 . = ALIGN(HPAGE_SIZE); \ 61 . = ALIGN(HPAGE_SIZE); \
62 __end_rodata_hpage_align = .; 62 __end_rodata_hpage_align = .;
63 63
64#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
65#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
66
64#else 67#else
65 68
66#define X64_ALIGN_RODATA_BEGIN 69#define X64_ALIGN_RODATA_BEGIN
67#define X64_ALIGN_RODATA_END 70#define X64_ALIGN_RODATA_END
68 71
72#define ALIGN_ENTRY_TEXT_BEGIN
73#define ALIGN_ENTRY_TEXT_END
74
69#endif 75#endif
70 76
71PHDRS { 77PHDRS {
@@ -102,8 +108,10 @@ SECTIONS
102 CPUIDLE_TEXT 108 CPUIDLE_TEXT
103 LOCK_TEXT 109 LOCK_TEXT
104 KPROBES_TEXT 110 KPROBES_TEXT
111 ALIGN_ENTRY_TEXT_BEGIN
105 ENTRY_TEXT 112 ENTRY_TEXT
106 IRQENTRY_TEXT 113 IRQENTRY_TEXT
114 ALIGN_ENTRY_TEXT_END
107 SOFTIRQENTRY_TEXT 115 SOFTIRQENTRY_TEXT
108 *(.fixup) 116 *(.fixup)
109 *(.gnu.warning) 117 *(.gnu.warning)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 52195ee3f6d5..27e9e90a8d35 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
41obj-$(CONFIG_ACPI_NUMA) += srat.o 41obj-$(CONFIG_ACPI_NUMA) += srat.o
42obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 42obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
43 43
44obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 44obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
45obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 45obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
46obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o 46obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
47obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
47 48
48obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o 49obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
49obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o 50obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index fe814fd5e014..b9283cc27622 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
38 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); 38 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
39} 39}
40 40
41static void percpu_setup_debug_store(int cpu)
42{
43#ifdef CONFIG_CPU_SUP_INTEL
44 int npages;
45 void *cea;
46
47 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
48 return;
49
50 cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
51 npages = sizeof(struct debug_store) / PAGE_SIZE;
52 BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
53 cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
54 PAGE_KERNEL);
55
56 cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
57 /*
58 * Force the population of PMDs for not yet allocated per cpu
59 * memory like debug store buffers.
60 */
61 npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
62 for (; npages; npages--, cea += PAGE_SIZE)
63 cea_set_pte(cea, 0, PAGE_NONE);
64#endif
65}
66
41/* Setup the fixmap mappings only once per-processor */ 67/* Setup the fixmap mappings only once per-processor */
42static void __init setup_cpu_entry_area(int cpu) 68static void __init setup_cpu_entry_area(int cpu)
43{ 69{
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
109 cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, 135 cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
110 __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); 136 __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
111#endif 137#endif
138 percpu_setup_debug_store(cpu);
112} 139}
113 140
114static __init void setup_cpu_entry_area_ptes(void) 141static __init void setup_cpu_entry_area_ptes(void)
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index bfcffdf6c577..421f2664ffa0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
5 5
6static int ptdump_show(struct seq_file *m, void *v) 6static int ptdump_show(struct seq_file *m, void *v)
7{ 7{
8 ptdump_walk_pgd_level(m, NULL); 8 ptdump_walk_pgd_level_debugfs(m, NULL, false);
9 return 0; 9 return 0;
10} 10}
11 11
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
22 .release = single_release, 22 .release = single_release,
23}; 23};
24 24
25static struct dentry *pe; 25static int ptdump_show_curknl(struct seq_file *m, void *v)
26{
27 if (current->mm->pgd) {
28 down_read(&current->mm->mmap_sem);
29 ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
30 up_read(&current->mm->mmap_sem);
31 }
32 return 0;
33}
34
35static int ptdump_open_curknl(struct inode *inode, struct file *filp)
36{
37 return single_open(filp, ptdump_show_curknl, NULL);
38}
39
40static const struct file_operations ptdump_curknl_fops = {
41 .owner = THIS_MODULE,
42 .open = ptdump_open_curknl,
43 .read = seq_read,
44 .llseek = seq_lseek,
45 .release = single_release,
46};
47
48#ifdef CONFIG_PAGE_TABLE_ISOLATION
49static struct dentry *pe_curusr;
50
51static int ptdump_show_curusr(struct seq_file *m, void *v)
52{
53 if (current->mm->pgd) {
54 down_read(&current->mm->mmap_sem);
55 ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
56 up_read(&current->mm->mmap_sem);
57 }
58 return 0;
59}
60
61static int ptdump_open_curusr(struct inode *inode, struct file *filp)
62{
63 return single_open(filp, ptdump_show_curusr, NULL);
64}
65
66static const struct file_operations ptdump_curusr_fops = {
67 .owner = THIS_MODULE,
68 .open = ptdump_open_curusr,
69 .read = seq_read,
70 .llseek = seq_lseek,
71 .release = single_release,
72};
73#endif
74
75static struct dentry *dir, *pe_knl, *pe_curknl;
26 76
27static int __init pt_dump_debug_init(void) 77static int __init pt_dump_debug_init(void)
28{ 78{
29 pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, 79 dir = debugfs_create_dir("page_tables", NULL);
30 &ptdump_fops); 80 if (!dir)
31 if (!pe)
32 return -ENOMEM; 81 return -ENOMEM;
33 82
83 pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
84 &ptdump_fops);
85 if (!pe_knl)
86 goto err;
87
88 pe_curknl = debugfs_create_file("current_kernel", 0400,
89 dir, NULL, &ptdump_curknl_fops);
90 if (!pe_curknl)
91 goto err;
92
93#ifdef CONFIG_PAGE_TABLE_ISOLATION
94 pe_curusr = debugfs_create_file("current_user", 0400,
95 dir, NULL, &ptdump_curusr_fops);
96 if (!pe_curusr)
97 goto err;
98#endif
34 return 0; 99 return 0;
100err:
101 debugfs_remove_recursive(dir);
102 return -ENOMEM;
35} 103}
36 104
37static void __exit pt_dump_debug_exit(void) 105static void __exit pt_dump_debug_exit(void)
38{ 106{
39 debugfs_remove_recursive(pe); 107 debugfs_remove_recursive(dir);
40} 108}
41 109
42module_init(pt_dump_debug_init); 110module_init(pt_dump_debug_init);
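
The files created above appear under the new page_tables/ directory of debugfs. A minimal reader, assuming debugfs is mounted at the conventional /sys/kernel/debug and the process has the privileges implied by the 0400 mode:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* "kernel", "current_kernel", or (with PTI) "current_user" */
	const char *path = "/sys/kernel/debug/page_tables/current_kernel";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return EXIT_SUCCESS;
}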
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 43dedbfb7257..f56902c1f04b 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -52,12 +52,18 @@ enum address_markers_idx {
52 USER_SPACE_NR = 0, 52 USER_SPACE_NR = 0,
53 KERNEL_SPACE_NR, 53 KERNEL_SPACE_NR,
54 LOW_KERNEL_NR, 54 LOW_KERNEL_NR,
55#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
56 LDT_NR,
57#endif
55 VMALLOC_START_NR, 58 VMALLOC_START_NR,
56 VMEMMAP_START_NR, 59 VMEMMAP_START_NR,
57#ifdef CONFIG_KASAN 60#ifdef CONFIG_KASAN
58 KASAN_SHADOW_START_NR, 61 KASAN_SHADOW_START_NR,
59 KASAN_SHADOW_END_NR, 62 KASAN_SHADOW_END_NR,
60#endif 63#endif
64#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
65 LDT_NR,
66#endif
61 CPU_ENTRY_AREA_NR, 67 CPU_ENTRY_AREA_NR,
62#ifdef CONFIG_X86_ESPFIX64 68#ifdef CONFIG_X86_ESPFIX64
63 ESPFIX_START_NR, 69 ESPFIX_START_NR,
@@ -82,6 +88,9 @@ static struct addr_marker address_markers[] = {
82 [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, 88 [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
83 [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, 89 [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
84#endif 90#endif
91#ifdef CONFIG_MODIFY_LDT_SYSCALL
92 [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
93#endif
85 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, 94 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
86#ifdef CONFIG_X86_ESPFIX64 95#ifdef CONFIG_X86_ESPFIX64
87 [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 96 [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
467} 476}
468 477
469static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 478static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
470 bool checkwx) 479 bool checkwx, bool dmesg)
471{ 480{
472#ifdef CONFIG_X86_64 481#ifdef CONFIG_X86_64
473 pgd_t *start = (pgd_t *) &init_top_pgt; 482 pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
480 489
481 if (pgd) { 490 if (pgd) {
482 start = pgd; 491 start = pgd;
483 st.to_dmesg = true; 492 st.to_dmesg = dmesg;
484 } 493 }
485 494
486 st.check_wx = checkwx; 495 st.check_wx = checkwx;
@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
518 527
519void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) 528void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
520{ 529{
521 ptdump_walk_pgd_level_core(m, pgd, false); 530 ptdump_walk_pgd_level_core(m, pgd, false, true);
531}
532
533void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
534{
535#ifdef CONFIG_PAGE_TABLE_ISOLATION
536 if (user && static_cpu_has(X86_FEATURE_PTI))
537 pgd = kernel_to_user_pgdp(pgd);
538#endif
539 ptdump_walk_pgd_level_core(m, pgd, false, false);
540}
541EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
542
543static void ptdump_walk_user_pgd_level_checkwx(void)
544{
545#ifdef CONFIG_PAGE_TABLE_ISOLATION
546 pgd_t *pgd = (pgd_t *) &init_top_pgt;
547
548 if (!static_cpu_has(X86_FEATURE_PTI))
549 return;
550
551 pr_info("x86/mm: Checking user space page tables\n");
552 pgd = kernel_to_user_pgdp(pgd);
553 ptdump_walk_pgd_level_core(NULL, pgd, true, false);
554#endif
522} 555}
523EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
524 556
525void ptdump_walk_pgd_level_checkwx(void) 557void ptdump_walk_pgd_level_checkwx(void)
526{ 558{
527 ptdump_walk_pgd_level_core(NULL, NULL, true); 559 ptdump_walk_pgd_level_core(NULL, NULL, true, false);
560 ptdump_walk_user_pgd_level_checkwx();
528} 561}
529 562
530static int __init pt_dump_init(void) 563static int __init pt_dump_init(void)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6fdf91ef130a..8ca324d07282 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,6 +20,7 @@
20#include <asm/kaslr.h> 20#include <asm/kaslr.h>
21#include <asm/hypervisor.h> 21#include <asm/hypervisor.h>
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/pti.h>
23 24
24/* 25/*
25 * We need to define the tracepoints somewhere, and tlb.c 26 * We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {
160 161
161static int page_size_mask; 162static int page_size_mask;
162 163
164static void enable_global_pages(void)
165{
166 if (!static_cpu_has(X86_FEATURE_PTI))
167 __supported_pte_mask |= _PAGE_GLOBAL;
168}
169
163static void __init probe_page_size_mask(void) 170static void __init probe_page_size_mask(void)
164{ 171{
165 /* 172 /*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
177 cr4_set_bits_and_update_boot(X86_CR4_PSE); 184 cr4_set_bits_and_update_boot(X86_CR4_PSE);
178 185
179 /* Enable PGE if available */ 186 /* Enable PGE if available */
187 __supported_pte_mask &= ~_PAGE_GLOBAL;
180 if (boot_cpu_has(X86_FEATURE_PGE)) { 188 if (boot_cpu_has(X86_FEATURE_PGE)) {
181 cr4_set_bits_and_update_boot(X86_CR4_PGE); 189 cr4_set_bits_and_update_boot(X86_CR4_PGE);
182 __supported_pte_mask |= _PAGE_GLOBAL; 190 enable_global_pages();
183 } else 191 }
184 __supported_pte_mask &= ~_PAGE_GLOBAL;
185 192
186 /* Enable 1 GB linear kernel mappings if available: */ 193 /* Enable 1 GB linear kernel mappings if available: */
187 if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { 194 if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)
194 201
195static void setup_pcid(void) 202static void setup_pcid(void)
196{ 203{
197#ifdef CONFIG_X86_64 204 if (!IS_ENABLED(CONFIG_X86_64))
198 if (boot_cpu_has(X86_FEATURE_PCID)) { 205 return;
199 if (boot_cpu_has(X86_FEATURE_PGE)) { 206
200 /* 207 if (!boot_cpu_has(X86_FEATURE_PCID))
201 * This can't be cr4_set_bits_and_update_boot() -- 208 return;
202 * the trampoline code can't handle CR4.PCIDE and 209
203 * it wouldn't do any good anyway. Despite the name, 210 if (boot_cpu_has(X86_FEATURE_PGE)) {
204 * cr4_set_bits_and_update_boot() doesn't actually 211 /*
205 * cause the bits in question to remain set all the 212 * This can't be cr4_set_bits_and_update_boot() -- the
206 * way through the secondary boot asm. 213 * trampoline code can't handle CR4.PCIDE and it wouldn't
207 * 214 * do any good anyway. Despite the name,
208 * Instead, we brute-force it and set CR4.PCIDE 215 * cr4_set_bits_and_update_boot() doesn't actually cause
209 * manually in start_secondary(). 216 * the bits in question to remain set all the way through
210 */ 217 * the secondary boot asm.
211 cr4_set_bits(X86_CR4_PCIDE); 218 *
212 } else { 219 * Instead, we brute-force it and set CR4.PCIDE manually in
213 /* 220 * start_secondary().
214 * flush_tlb_all(), as currently implemented, won't 221 */
215 * work if PCID is on but PGE is not. Since that 222 cr4_set_bits(X86_CR4_PCIDE);
216 * combination doesn't exist on real hardware, there's 223
217 * no reason to try to fully support it, but it's 224 /*
218 * polite to avoid corrupting data if we're on 225 * INVPCID's single-context modes (2/3) only work if we set
219 * an improperly configured VM. 226 * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
220 */ 227 * on systems that have X86_CR4_PCIDE clear, or that have
221 setup_clear_cpu_cap(X86_FEATURE_PCID); 228 * no INVPCID support at all.
222 } 229 */
230 if (boot_cpu_has(X86_FEATURE_INVPCID))
231 setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
232 } else {
233 /*
234 * flush_tlb_all(), as currently implemented, won't work if
235 * PCID is on but PGE is not. Since that combination
236 * doesn't exist on real hardware, there's no reason to try
237 * to fully support it, but it's polite to avoid corrupting
238 * data if we're on an improperly configured VM.
239 */
240 setup_clear_cpu_cap(X86_FEATURE_PCID);
223 } 241 }
224#endif
225} 242}
226 243
227#ifdef CONFIG_X86_32 244#ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
622{ 639{
623 unsigned long end; 640 unsigned long end;
624 641
642 pti_check_boottime_disable();
625 probe_page_size_mask(); 643 probe_page_size_mask();
626 setup_pcid(); 644 setup_pcid();
627 645
@@ -845,7 +863,7 @@ void __init zone_sizes_init(void)
845 free_area_init_nodes(max_zone_pfns); 863 free_area_init_nodes(max_zone_pfns);
846} 864}
847 865
848DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 866__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
849 .loaded_mm = &init_mm, 867 .loaded_mm = &init_mm,
850 .next_asid = 1, 868 .next_asid = 1,
851 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 869 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 96d456a94b03..004abf9ebf12 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
355 kmem_cache_free(pgd_cache, pgd); 355 kmem_cache_free(pgd_cache, pgd);
356} 356}
357#else 357#else
358
358static inline pgd_t *_pgd_alloc(void) 359static inline pgd_t *_pgd_alloc(void)
359{ 360{
360 return (pgd_t *)__get_free_page(PGALLOC_GFP); 361 return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
361} 362}
362 363
363static inline void _pgd_free(pgd_t *pgd) 364static inline void _pgd_free(pgd_t *pgd)
364{ 365{
365 free_page((unsigned long)pgd); 366 free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
366} 367}
367#endif /* CONFIG_X86_PAE */ 368#endif /* CONFIG_X86_PAE */
368 369
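
With PTI the order-1 allocation above yields an 8k PGD per mm: the kernel half in the first 4k page and the user-visible half in the second. The real kernel_to_user_pgdp() helper (added elsewhere in this series) flips a bit in the pointer; the plain pointer arithmetic below is only a simplified model of the same layout:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE_	4096UL
#define PTRS_PER_PGD	512

typedef struct { unsigned long pgd; } pgd_t;

/* Kernel half: first 4k page of the order-1 allocation. */
static pgd_t *kernel_pgd(void *pgd_pages)
{
	return pgd_pages;
}

/* User half: the page immediately after it (simplified model). */
static pgd_t *user_pgd(void *pgd_pages)
{
	return (pgd_t *)((char *)pgd_pages + PAGE_SIZE_);
}

int main(void)
{
	/* Stand-in for __get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER). */
	void *pgd_pages = aligned_alloc(2 * PAGE_SIZE_, 2 * PAGE_SIZE_);

	if (!pgd_pages)
		return 1;

	printf("kernel PGD at %p, user PGD at %p (%d entries each)\n",
	       (void *)kernel_pgd(pgd_pages), (void *)user_pgd(pgd_pages),
	       PTRS_PER_PGD);
	free(pgd_pages);
	return 0;
}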
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000000..bce8aea65606
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,387 @@
1/*
2 * Copyright(c) 2017 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * This code is based in part on work published here:
14 *
15 * https://github.com/IAIK/KAISER
16 *
 17 * The original work was written and signed off for the Linux
18 * kernel by:
19 *
20 * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
21 * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
22 * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
23 * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
24 *
25 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
26 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
27 * Andy Lutomirsky <luto@amacapital.net>
28 */
29#include <linux/kernel.h>
30#include <linux/errno.h>
31#include <linux/string.h>
32#include <linux/types.h>
33#include <linux/bug.h>
34#include <linux/init.h>
35#include <linux/spinlock.h>
36#include <linux/mm.h>
37#include <linux/uaccess.h>
38
39#include <asm/cpufeature.h>
40#include <asm/hypervisor.h>
41#include <asm/vsyscall.h>
42#include <asm/cmdline.h>
43#include <asm/pti.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/tlbflush.h>
47#include <asm/desc.h>
48
49#undef pr_fmt
50#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
51
52/* Backporting helper */
53#ifndef __GFP_NOTRACK
54#define __GFP_NOTRACK 0
55#endif
56
57static void __init pti_print_if_insecure(const char *reason)
58{
59 if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
60 pr_info("%s\n", reason);
61}
62
63static void __init pti_print_if_secure(const char *reason)
64{
65 if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
66 pr_info("%s\n", reason);
67}
68
69void __init pti_check_boottime_disable(void)
70{
71 char arg[5];
72 int ret;
73
74 if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
75 pti_print_if_insecure("disabled on XEN PV.");
76 return;
77 }
78
79 ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
80 if (ret > 0) {
81 if (ret == 3 && !strncmp(arg, "off", 3)) {
82 pti_print_if_insecure("disabled on command line.");
83 return;
84 }
85 if (ret == 2 && !strncmp(arg, "on", 2)) {
86 pti_print_if_secure("force enabled on command line.");
87 goto enable;
88 }
89 if (ret == 4 && !strncmp(arg, "auto", 4))
90 goto autosel;
91 }
92
93 if (cmdline_find_option_bool(boot_command_line, "nopti")) {
94 pti_print_if_insecure("disabled on command line.");
95 return;
96 }
97
98autosel:
99 if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
100 return;
101enable:
102 setup_force_cpu_cap(X86_FEATURE_PTI);
103}
104
105pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
106{
107 /*
108 * Changes to the high (kernel) portion of the kernelmode page
109 * tables are not automatically propagated to the usermode tables.
110 *
111 * Users should keep in mind that, unlike the kernelmode tables,
112 * there is no vmalloc_fault equivalent for the usermode tables.
113 * Top-level entries added to init_mm's usermode pgd after boot
114 * will not be automatically propagated to other mms.
115 */
116 if (!pgdp_maps_userspace(pgdp))
117 return pgd;
118
119 /*
120 * The user page tables get the full PGD, accessible from
121 * userspace:
122 */
123 kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
124
125 /*
126 * If this is normal user memory, make it NX in the kernel
127 * pagetables so that, if we somehow screw up and return to
128 * usermode with the kernel CR3 loaded, we'll get a page fault
129 * instead of allowing user code to execute with the wrong CR3.
130 *
131 * As exceptions, we don't set NX if:
132 * - _PAGE_USER is not set. This could be an executable
133 * EFI runtime mapping or something similar, and the kernel
134 * may execute from it
135 * - we don't have NX support
136 * - we're clearing the PGD (i.e. the new pgd is not present).
137 */
138 if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
139 (__supported_pte_mask & _PAGE_NX))
140 pgd.pgd |= _PAGE_NX;
141
142 /* return the copy of the PGD we want the kernel to use: */
143 return pgd;
144}
145
146/*
147 * Walk the user copy of the page tables (optionally) trying to allocate
148 * page table pages on the way down.
149 *
150 * Returns a pointer to a P4D on success, or NULL on failure.
151 */
152static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
153{
154 pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
155 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
156
157 if (address < PAGE_OFFSET) {
158 WARN_ONCE(1, "attempt to walk user address\n");
159 return NULL;
160 }
161
162 if (pgd_none(*pgd)) {
163 unsigned long new_p4d_page = __get_free_page(gfp);
164 if (!new_p4d_page)
165 return NULL;
166
167 if (pgd_none(*pgd)) {
168 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
169 new_p4d_page = 0;
170 }
171 if (new_p4d_page)
172 free_page(new_p4d_page);
173 }
174 BUILD_BUG_ON(pgd_large(*pgd) != 0);
175
176 return p4d_offset(pgd, address);
177}
178
179/*
180 * Walk the user copy of the page tables (optionally) trying to allocate
181 * page table pages on the way down.
182 *
183 * Returns a pointer to a PMD on success, or NULL on failure.
184 */
185static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
186{
187 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
188 p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
189 pud_t *pud;
190
191 BUILD_BUG_ON(p4d_large(*p4d) != 0);
192 if (p4d_none(*p4d)) {
193 unsigned long new_pud_page = __get_free_page(gfp);
194 if (!new_pud_page)
195 return NULL;
196
197 if (p4d_none(*p4d)) {
198 set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
199 new_pud_page = 0;
200 }
201 if (new_pud_page)
202 free_page(new_pud_page);
203 }
204
205 pud = pud_offset(p4d, address);
206 /* The user page tables do not use large mappings: */
207 if (pud_large(*pud)) {
208 WARN_ON(1);
209 return NULL;
210 }
211 if (pud_none(*pud)) {
212 unsigned long new_pmd_page = __get_free_page(gfp);
213 if (!new_pmd_page)
214 return NULL;
215
216 if (pud_none(*pud)) {
217 set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
218 new_pmd_page = 0;
219 }
220 if (new_pmd_page)
221 free_page(new_pmd_page);
222 }
223
224 return pmd_offset(pud, address);
225}
226
227#ifdef CONFIG_X86_VSYSCALL_EMULATION
228/*
229 * Walk the shadow copy of the page tables (optionally) trying to allocate
230 * page table pages on the way down. Does not support large pages.
231 *
232 * Note: this is only used when mapping *new* kernel data into the
233 * user/shadow page tables. It is never used for userspace data.
234 *
235 * Returns a pointer to a PTE on success, or NULL on failure.
236 */
237static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
238{
239 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
240 pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
241 pte_t *pte;
242
243 /* We can't do anything sensible if we hit a large mapping. */
244 if (pmd_large(*pmd)) {
245 WARN_ON(1);
246 return NULL;
247 }
248
249 if (pmd_none(*pmd)) {
250 unsigned long new_pte_page = __get_free_page(gfp);
251 if (!new_pte_page)
252 return NULL;
253
254 if (pmd_none(*pmd)) {
255 set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
256 new_pte_page = 0;
257 }
258 if (new_pte_page)
259 free_page(new_pte_page);
260 }
261
262 pte = pte_offset_kernel(pmd, address);
263 if (pte_flags(*pte) & _PAGE_USER) {
264 WARN_ONCE(1, "attempt to walk to user pte\n");
265 return NULL;
266 }
267 return pte;
268}
269
270static void __init pti_setup_vsyscall(void)
271{
272 pte_t *pte, *target_pte;
273 unsigned int level;
274
275 pte = lookup_address(VSYSCALL_ADDR, &level);
276 if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
277 return;
278
279 target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
280 if (WARN_ON(!target_pte))
281 return;
282
283 *target_pte = *pte;
284 set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
285}
286#else
287static void __init pti_setup_vsyscall(void) { }
288#endif
289
290static void __init
291pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
292{
293 unsigned long addr;
294
295 /*
296 * Clone the populated PMDs which cover start to end. These PMD areas
297 * can have holes.
298 */
299 for (addr = start; addr < end; addr += PMD_SIZE) {
300 pmd_t *pmd, *target_pmd;
301 pgd_t *pgd;
302 p4d_t *p4d;
303 pud_t *pud;
304
305 pgd = pgd_offset_k(addr);
306 if (WARN_ON(pgd_none(*pgd)))
307 return;
308 p4d = p4d_offset(pgd, addr);
309 if (WARN_ON(p4d_none(*p4d)))
310 return;
311 pud = pud_offset(p4d, addr);
312 if (pud_none(*pud))
313 continue;
314 pmd = pmd_offset(pud, addr);
315 if (pmd_none(*pmd))
316 continue;
317
318 target_pmd = pti_user_pagetable_walk_pmd(addr);
319 if (WARN_ON(!target_pmd))
320 return;
321
322 /*
323 * Copy the PMD. That is, the kernelmode and usermode
324 * tables will share the last-level page tables of this
325 * address range
326 */
327 *target_pmd = pmd_clear_flags(*pmd, clear);
328 }
329}
330
331/*
332 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
333 * next-level entry on 5-level systems.
334 */
335static void __init pti_clone_p4d(unsigned long addr)
336{
337 p4d_t *kernel_p4d, *user_p4d;
338 pgd_t *kernel_pgd;
339
340 user_p4d = pti_user_pagetable_walk_p4d(addr);
341 kernel_pgd = pgd_offset_k(addr);
342 kernel_p4d = p4d_offset(kernel_pgd, addr);
343 *user_p4d = *kernel_p4d;
344}
345
346/*
347 * Clone the CPU_ENTRY_AREA into the user space visible page table.
348 */
349static void __init pti_clone_user_shared(void)
350{
351 pti_clone_p4d(CPU_ENTRY_AREA_BASE);
352}
353
354/*
 355 * Clone the ESPFIX P4D into the user space visible page table
356 */
357static void __init pti_setup_espfix64(void)
358{
359#ifdef CONFIG_X86_ESPFIX64
360 pti_clone_p4d(ESPFIX_BASE_ADDR);
361#endif
362}
363
364/*
365 * Clone the populated PMDs of the entry and irqentry text and force it RO.
366 */
367static void __init pti_clone_entry_text(void)
368{
369 pti_clone_pmds((unsigned long) __entry_text_start,
370 (unsigned long) __irqentry_text_end, _PAGE_RW);
371}
372
373/*
374 * Initialize kernel page table isolation
375 */
376void __init pti_init(void)
377{
378 if (!static_cpu_has(X86_FEATURE_PTI))
379 return;
380
381 pr_info("enabled\n");
382
383 pti_clone_user_shared();
384 pti_clone_entry_text();
385 pti_setup_espfix64();
386 pti_setup_vsyscall();
387}
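
The NX rule in __pti_set_user_pgd() above can be read as a small predicate; here is a stand-alone sketch of just that check, using placeholder bit values rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder bit positions for illustration only. */
#define PAGE_PRESENT	(1ULL << 0)
#define PAGE_USER	(1ULL << 2)
#define PAGE_NX		(1ULL << 63)

/*
 * Poison the kernel-side copy of a userspace PGD entry with NX so that
 * returning to user mode on the kernel CR3 faults instead of running
 * user code.  Skip entries that are not present, not _PAGE_USER (e.g.
 * an EFI runtime mapping the kernel may execute from), or when NX is
 * not supported.
 */
static unsigned long long poison_with_nx(unsigned long long pgd, bool have_nx)
{
	if (have_nx &&
	    (pgd & (PAGE_USER | PAGE_PRESENT)) == (PAGE_USER | PAGE_PRESENT))
		pgd |= PAGE_NX;
	return pgd;
}

int main(void)
{
	printf("%#llx\n", poison_with_nx(PAGE_USER | PAGE_PRESENT, true));
	printf("%#llx\n", poison_with_nx(PAGE_PRESENT, true));	/* left alone */
	return 0;
}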
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0a1be3adc97e..a1561957dccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
29 */ 29 */
30 30
31/*
32 * We get here when we do something requiring a TLB invalidation
33 * but could not go invalidate all of the contexts. We do the
34 * necessary invalidation by clearing out the 'ctx_id' which
35 * forces a TLB flush when the context is loaded.
36 */
37void clear_asid_other(void)
38{
39 u16 asid;
40
41 /*
42 * This is only expected to be set if we have disabled
43 * kernel _PAGE_GLOBAL pages.
44 */
45 if (!static_cpu_has(X86_FEATURE_PTI)) {
46 WARN_ON_ONCE(1);
47 return;
48 }
49
50 for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
51 /* Do not need to flush the current asid */
52 if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
53 continue;
54 /*
55 * Make sure the next time we go to switch to
56 * this asid, we do a flush:
57 */
58 this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
59 }
60 this_cpu_write(cpu_tlbstate.invalidate_other, false);
61}
62
31atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); 63atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
32 64
33 65
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
42 return; 74 return;
43 } 75 }
44 76
77 if (this_cpu_read(cpu_tlbstate.invalidate_other))
78 clear_asid_other();
79
45 for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { 80 for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
46 if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != 81 if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
47 next->context.ctx_id) 82 next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
65 *need_flush = true; 100 *need_flush = true;
66} 101}
67 102
103static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
104{
105 unsigned long new_mm_cr3;
106
107 if (need_flush) {
108 invalidate_user_asid(new_asid);
109 new_mm_cr3 = build_cr3(pgdir, new_asid);
110 } else {
111 new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
112 }
113
114 /*
115 * Caution: many callers of this function expect
116 * that load_cr3() is serializing and orders TLB
117 * fills with respect to the mm_cpumask writes.
118 */
119 write_cr3(new_mm_cr3);
120}
121
68void leave_mm(int cpu) 122void leave_mm(int cpu)
69{ 123{
70 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); 124 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
195 if (need_flush) { 249 if (need_flush) {
196 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 250 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
197 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 251 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
198 write_cr3(build_cr3(next->pgd, new_asid)); 252 load_new_mm_cr3(next->pgd, new_asid, true);
199 253
200 /* 254 /*
201 * NB: This gets called via leave_mm() in the idle path 255 * NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
208 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 262 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
209 } else { 263 } else {
210 /* The new ASID is already up to date. */ 264 /* The new ASID is already up to date. */
211 write_cr3(build_cr3_noflush(next->pgd, new_asid)); 265 load_new_mm_cr3(next->pgd, new_asid, false);
212 266
213 /* See above wrt _rcuidle. */ 267 /* See above wrt _rcuidle. */
214 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); 268 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
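
load_new_mm_cr3() above chooses between a flushing and a non-flushing CR3 load. A simplified sketch of what that difference means at the register level, assuming PCIDs are enabled; the *_sketch helpers stand in for the real build_cr3()/build_cr3_noflush() and ignore the no-PCID fallback:

	#include <stdint.h>

	/*
	 * Sketch: a CR3 value combines the page-table root with a PCID in
	 * the low 12 bits.  Setting bit 63, the architectural "no flush"
	 * bit, asks the CPU to keep TLB entries tagged with that PCID
	 * instead of flushing them on the CR3 write.
	 */
	#define CR3_NOFLUSH_SKETCH	(1ULL << 63)

	static uint64_t build_cr3_sketch(uint64_t pgd_pa, uint16_t pcid)
	{
		return pgd_pa | pcid;		/* flushing load */
	}

	static uint64_t build_cr3_noflush_sketch(uint64_t pgd_pa, uint16_t pcid)
	{
		return build_cr3_sketch(pgd_pa, pcid) | CR3_NOFLUSH_SKETCH;
	}
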
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6a151ce70e86..d87ac96e37ed 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -196,6 +196,9 @@ static pgd_t *efi_pgd;
196 * because we want to avoid inserting EFI region mappings (EFI_VA_END 196 * because we want to avoid inserting EFI region mappings (EFI_VA_END
197 * to EFI_VA_START) into the standard kernel page tables. Everything 197 * to EFI_VA_START) into the standard kernel page tables. Everything
198 * else can be shared, see efi_sync_low_kernel_mappings(). 198 * else can be shared, see efi_sync_low_kernel_mappings().
199 *
200 * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
201 * allocation.
199 */ 202 */
200int __init efi_alloc_page_tables(void) 203int __init efi_alloc_page_tables(void)
201{ 204{
@@ -208,7 +211,7 @@ int __init efi_alloc_page_tables(void)
208 return 0; 211 return 0;
209 212
210 gfp_mask = GFP_KERNEL | __GFP_ZERO; 213 gfp_mask = GFP_KERNEL | __GFP_ZERO;
211 efi_pgd = (pgd_t *)__get_free_page(gfp_mask); 214 efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
212 if (!efi_pgd) 215 if (!efi_pgd)
213 return -ENOMEM; 216 return -ENOMEM;
214 217
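
The switch from __get_free_page() to __get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER) is needed because a PTI PGD spans two pages, the kernel half and the user-visible half. The actual PGD_ALLOCATION_ORDER definition is not part of this excerpt; the sketch below shows the shape it is assumed to take:

	/*
	 * Assumed definition (sketch): with page table isolation a PGD
	 * needs an order-1 allocation so the kernel and user copies can
	 * occupy consecutive pages; without it a single page suffices.
	 */
	#ifdef CONFIG_PAGE_TABLE_ISOLATION
	#define PGD_ALLOCATION_ORDER	1
	#else
	#define PGD_ALLOCATION_ORDER	0
	#endif
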
diff --git a/include/linux/pti.h b/include/linux/pti.h
new file mode 100644
index 000000000000..0174883a935a
--- /dev/null
+++ b/include/linux/pti.h
@@ -0,0 +1,11 @@
1// SPDX-License-Identifier: GPL-2.0
2#ifndef _INCLUDE_PTI_H
3#define _INCLUDE_PTI_H
4
5#ifdef CONFIG_PAGE_TABLE_ISOLATION
6#include <asm/pti.h>
7#else
8static inline void pti_init(void) { }
9#endif
10
11#endif
diff --git a/init/main.c b/init/main.c
index 7b606fc48482..a8100b954839 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,6 +75,7 @@
75#include <linux/slab.h> 75#include <linux/slab.h>
76#include <linux/perf_event.h> 76#include <linux/perf_event.h>
77#include <linux/ptrace.h> 77#include <linux/ptrace.h>
78#include <linux/pti.h>
78#include <linux/blkdev.h> 79#include <linux/blkdev.h>
79#include <linux/elevator.h> 80#include <linux/elevator.h>
80#include <linux/sched_clock.h> 81#include <linux/sched_clock.h>
@@ -506,6 +507,8 @@ static void __init mm_init(void)
506 ioremap_huge_init(); 507 ioremap_huge_init();
507 /* Should be run before the first non-init thread is created */ 508 /* Should be run before the first non-init thread is created */
508 init_espfix_bsp(); 509 init_espfix_bsp();
510 /* Should be run after espfix64 is set up. */
511 pti_init();
509} 512}
510 513
511asmlinkage __visible void __init start_kernel(void) 514asmlinkage __visible void __init start_kernel(void)
diff --git a/security/Kconfig b/security/Kconfig
index e8e449444e65..a623d13bf288 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,16 @@ config SECURITY_NETWORK
54 implement socket and networking access controls. 54 implement socket and networking access controls.
55 If you are unsure how to answer this question, answer N. 55 If you are unsure how to answer this question, answer N.
56 56
57config PAGE_TABLE_ISOLATION
58 bool "Remove the kernel mapping in user mode"
59 depends on X86_64 && !UML
60 help
61 This feature reduces the number of hardware side channels by
62 ensuring that the majority of kernel addresses are not mapped
63 into userspace.
64
65 See Documentation/x86/pagetable-isolation.txt for more details.
66
57config SECURITY_INFINIBAND 67config SECURITY_INFINIBAND
58 bool "Infiniband Security Hooks" 68 bool "Infiniband Security Hooks"
59 depends on SECURITY && INFINIBAND 69 depends on SECURITY && INFINIBAND
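
The config symbol only compiles the isolation code in; whether it is active on a given boot is reflected in the synthetic X86_FEATURE_PTI CPU feature bit that the runtime checks earlier in this diff test. A minimal sketch of that gating pattern, assuming kernel context; report_pti_sketch() is an illustrative helper, not part of the patch:

	#include <asm/cpufeature.h>
	#include <linux/printk.h>

	/*
	 * Sketch: runtime code gates on the feature bit rather than on
	 * CONFIG_PAGE_TABLE_ISOLATION, so a boot-time "off" decision
	 * (which clears the bit) is honoured without a rebuild.
	 */
	static void report_pti_sketch(void)
	{
		if (static_cpu_has(X86_FEATURE_PTI))
			pr_info("page table isolation active\n");
		else
			pr_info("page table isolation disabled\n");
	}
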
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 0304ffb714f2..1aef72df20a1 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
122 * NB: Different Linux versions do different things with the 122 * NB: Different Linux versions do different things with the
123 * accessed bit in set_thread_area(). 123 * accessed bit in set_thread_area().
124 */ 124 */
125 if (ar != expected_ar && 125 if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
126 (ldt || ar != (expected_ar | AR_ACCESSED))) {
127 printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", 126 printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
128 (ldt ? "LDT" : "GDT"), index, ar, expected_ar); 127 (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
129 nerrs++; 128 nerrs++;