author		Linus Torvalds <torvalds@linux-foundation.org>	2017-12-29 20:02:49 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-12-29 20:02:49 -0500
commit		5aa90a84589282b87666f92b6c3c917c8080a9bf
tree		b03c3c5879240496fda0c43e070a89b327a894de
parent		61233580f1f33c50e159c50e24d80ffd2ba2e06b
parent		9f5cb6b32d9e0a3a7453222baaf15664d92adbf2
Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 page table isolation updates from Thomas Gleixner:
"This is the final set of enabling page table isolation on x86:
- Infrastructure patches for handling the extra page tables.
- Patches which map the various bits and pieces which are required to
get in and out of user space into the user space visible page
tables.
- The required changes to have CR3 switching in the entry/exit code.
- Optimizations for the CR3 switching along with documentation how
the ASID/PCID mechanism works.
- Updates to dump pagetables to cover the user space page tables for
W+X scans and extra debugfs files to analyze both the kernel and
the user space visible page tables
The whole functionality is compile time controlled via a config switch
and can be turned on/off on the command line as well"
* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
x86/ldt: Make the LDT mapping RO
x86/mm/dump_pagetables: Allow dumping current pagetables
x86/mm/dump_pagetables: Check user space page table for WX pages
x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
x86/mm/pti: Add Kconfig
x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
x86/mm: Use INVPCID for __native_flush_tlb_single()
x86/mm: Optimize RESTORE_CR3
x86/mm: Use/Fix PCID to optimize user/kernel switches
x86/mm: Abstract switching CR3
x86/mm: Allow flushing for future ASID switches
x86/pti: Map the vsyscall page if needed
x86/pti: Put the LDT in its own PGD if PTI is on
x86/mm/64: Make a full PGD-entry size hole in the memory map
x86/events/intel/ds: Map debug buffers in cpu_entry_area
x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
x86/mm/pti: Map ESPFIX into user space
x86/mm/pti: Share entry text PMD
x86/entry: Align entry text section to PMD boundary
...
45 files changed, 1636 insertions, 202 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6571fbfdb2a1..e49311d53504 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2708,6 +2708,8 @@
 			steal time is computed, but won't influence scheduler
 			behaviour
 
+	nopti		[X86-64] Disable kernel page table isolation
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 51101708a03a..ad41b3813f0a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index d5364ca2e3f9..b5e5e02f8cde 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -23,6 +23,9 @@
  */
 #undef CONFIG_AMD_MEM_ENCRYPT
 
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3fd8bc560fae..45a63e00a6af 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,6 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
 /*
 
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq	$(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+	ADJUST_KERNEL_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask	\
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq	$(PTI_SWITCH_MASK), \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+	movq	%cr3, \scratch_reg
+	movq	\scratch_reg, \save_reg
+	/*
+	 * Is the "switch mask" all zero?  That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
+	 */
+	testq	$(PTI_SWITCH_MASK), \scratch_reg
+	jz	.Ldone_\@
+
+	ADJUST_KERNEL_CR3 \scratch_reg
+	movq	\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * KERNEL pages can always resume with NOFLUSH as we do
+	 * explicit flushes.
+	 */
+	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	jnc	.Lnoflush_\@
+
+	/*
+	 * Check if there's a pending flush for the user ASID we're
+	 * about to set.
+	 */
+	movq	\save_reg, \scratch_reg
+	andq	$(0x7FF), \scratch_reg
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+	/*
+	 * The CR3 write could be avoided when not changing its value,
+	 * but would require a CR3 read *and* a scratch register.
+	 */
+	movq	\save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
 #endif /* CONFIG_X86_64 */
 
 /*
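The 8k-PGD switch these macros encode is plain bit arithmetic on addresses: an order-1 allocation is 8k-aligned, so bit 12 selects which 4k half CR3 points at. A minimal user-space C sketch of just that flip (the sample address is hypothetical and nothing here touches a real CR3):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT			12	/* x86 uses 4k pages */
#define PTI_SWITCH_PGTABLES_MASK	(1UL << PAGE_SHIFT)

int main(void)
{
	/* An order-1 (8k-aligned) PGD allocation: bit 12 of its address is 0. */
	uint64_t kernel_pgd = 0x1a2b4000;	/* hypothetical 8k-aligned address */

	/* Setting bit 12 selects the user half in the second 4k page... */
	uint64_t user_pgd = kernel_pgd | PTI_SWITCH_PGTABLES_MASK;

	/* ...and clearing it (part of what ADJUST_KERNEL_CR3 does, along
	 * with the PCID bits) returns to the kernel half. */
	uint64_t kernel_again = user_pgd & ~PTI_SWITCH_PGTABLES_MASK;

	printf("%#llx -> %#llx -> %#llx\n",
	       (unsigned long long)kernel_pgd,
	       (unsigned long long)user_pgd,
	       (unsigned long long)kernel_again);
	return 0;
}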
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3d19c830e1b1..f048e384ff54 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <linux/err.h>
 
+#include "calling.h"
+
 .code64
 .section .entry.text, "ax"
 
@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	/* Stash the user RSP. */
 	movq	%rsp, RSP_SCRATCH
 
+	/* Note: using %rsp as a scratch reg. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	/* Load the top of the task stack into RSP */
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 
@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
 	 */
 
 	swapgs
+	/*
+	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+	 * is not required to switch CR3.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
@@ -403,6 +411,7 @@ syscall_return_via_sysret:
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We can do future final exit work right here.
 	 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
 	popq	%rdi
 	popq	%rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * We can do future final exit work right here.
 	 */
 
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
 	/* Restore RDI. */
 	popq	%rdi
 	SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
 	 */
 
 	pushq	%rdi				/* Stash user RDI */
-	SWAPGS
+	SWAPGS					/* to kernel GS */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
+
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
 	/* Now RAX == RSP. */
 
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
-	popq	%rdi				/* Restore user RDI */
 
 	/*
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
@@ -849,7 +861,11 @@ native_irq_return_ldt:
 	 * still points to an RO alias of the ESPFIX stack.
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
-	SWAPGS
+
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+	SWAPGS					/* to user GS */
+	popq	%rdi				/* Restore user RDI */
+
 	movq	%rax, %rsp
 	UNWIND_HINT_IRET_REGS offset=8
 
@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
 	UNWIND_HINT_FUNC
 
 	pushq	%rdi
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
 	movq	%rsp, %rdi
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
 	js	1f				/* negative -> in kernel */
 	SWAPGS
 	xorl	%ebx, %ebx
-1:	ret
+
+1:
+	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+	ret
 END(paranoid_entry)
 
 /*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
+	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 */
 	SWAPGS
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 .Lerror_entry_from_usermode_after_swapgs:
 	/* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
 	 * .Lgs_change's error handler with kernel gsbase.
 	 */
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	jmp .Lerror_entry_done
 
 .Lbstep_iret:
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)
 
 .Lerror_bad_iret:
 	/*
-	 * We came from an IRET to user mode, so we have user gsbase.
-	 * Switch to kernel gsbase:
+	 * We came from an IRET to user mode, so we have user
+	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
 	 */
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
 /*
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * so we can use real assembly here.
+ *
+ * Registers:
+ *	%r14: Used to save/restore the CR3 of the interrupted context
+ *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)
 
 	swapgs
 	cld
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	call	do_nmi
 
+	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 95ad40eb7eff..40f17009ec20 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -49,6 +49,10 @@
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS
+
+	/* We are about to clobber %rsp anyway, clobbering here is OK */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
@@ -216,6 +220,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq	$0			/* pt_regs->r15 = 0 */
 
 	/*
+	 * We just saved %rdi so it is safe to clobber.  It is not
+	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
+	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
 	 */
@@ -256,10 +266,22 @@ sysret32_from_system_call:
 	 * when the system call started, which is already known to user
 	 * code.  We zero R8-R10 to avoid info leaks.
 	 */
+	movq	RSP-ORIG_RAX(%rsp), %rsp
+
+	/*
+	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+	 * on the process stack which is not mapped to userspace and
+	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
+	 * switch until after the last reference to the process
+	 * stack.
+	 *
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+	 */
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
 	xorq	%r8, %r8
 	xorq	%r9, %r9
 	xorq	%r10, %r10
-	movq	RSP-ORIG_RAX(%rsp), %rsp
 	swapgs
 	sysretl
 END(entry_SYSCALL_compat)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1faf40f2dda9..577fa8adb785 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
  * vsyscalls but leave the page not present.  If so, we skip calling
  * this.
  */
-static void __init set_vsyscall_pgtable_user_bits(void)
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
-	pgd = pgd_offset_k(VSYSCALL_ADDR);
+	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
 	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
 	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
 #if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
 		     vsyscall_mode == NATIVE
 		     ? PAGE_KERNEL_VSYSCALL
 		     : PAGE_KERNEL_VVAR);
-	set_vsyscall_pgtable_user_bits();
+	set_vsyscall_pgtable_user_bits(swapper_pg_dir);
 }
 
 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3674a4b6f8bd..8f0aace08b87 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,16 +3,18 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
 #include "../perf_event.h"
 
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 
-#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
-static int alloc_pebs_buffer(int cpu)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	phys_addr_t pa;
+	size_t msz = 0;
+
+	pa = virt_to_phys(addr);
+	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+	size_t msz = 0;
+
+	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+	unsigned int order = get_order(size);
 	int node = cpu_to_node(cpu);
-	int max;
-	void *buffer, *ibuffer;
+	struct page *page;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+	if (buffer)
+		free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	size_t bsiz = x86_pmu.pebs_buffer_size;
+	int max, node = cpu_to_node(cpu);
+	void *buffer, *ibuffer, *cea;
 
 	if (!x86_pmu.pebs)
 		return 0;
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree_pages(buffer, bsiz);
 			return -ENOMEM;
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
-
-	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
-	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_pebs_vaddr = buffer;
+	/* Update the cpu entry area mapping */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds->pebs_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 	ds->pebs_index = ds->pebs_buffer_base;
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-		max * x86_pmu.pebs_record_size;
-
+	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
 	return 0;
 }
 
 static void release_pebs_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 	if (!ds || !x86_pmu.pebs)
 		return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
+	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+	hwev->ds_pebs_vaddr = NULL;
 }
 
 static int alloc_bts_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max, thresh;
-	void *buffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *buffer, *cea;
+	int max;
 
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 	}
-
-	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-	thresh = max / 16;
-
-	ds->bts_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_bts_vaddr = buffer;
+	/* Update the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds->bts_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
-	ds->bts_absolute_maximum = ds->bts_buffer_base +
-		max * BTS_RECORD_SIZE;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-		thresh * BTS_RECORD_SIZE;
-
+	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
 	return 0;
 }
 
 static void release_bts_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 	if (!ds || !x86_pmu.bts)
 		return;
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds_clear_cea(cea, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
+	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+	hwev->ds_bts_vaddr = NULL;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
 
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
-
 	return 0;
 }
 
 static void release_ds_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 
 void release_ds_buffers(void)
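A sizing note on the remapping above: BTS_BUFFER_SIZE and PEBS_BUFFER_SIZE are both PAGE_SIZE << 4 = 64 KB, so ds_update_cea() installs 16 PTEs per buffer; together with the page-aligned struct debug_store itself (see the new intel_ds.h below), each CPU dedicates 33 pages of its cpu_entry_area to the debug store.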
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f7aaadf9331f..8e4ea143ed96 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,6 +14,8 @@
 
 #include <linux/perf_event.h>
 
+#include <asm/intel_ds.h>
+
 /* To enable MSR tracing please use the generic trace points. */
 
 /*
@@ -77,8 +79,6 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		8
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 
 /*
@@ -95,23 +95,6 @@ struct amd_nb {
 	 PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	 PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 #define PEBS_REGS \
 	(PERF_REG_X86_AX | \
 	 PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 */
 	struct debug_store	*ds;
+	void			*ds_pebs_vaddr;
+	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_large_pebs;
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 2fbc69a0916e..4a7884b8dca5 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -5,6 +5,7 @@
 
 #include <linux/percpu-defs.h>
 #include <asm/processor.h>
+#include <asm/intel_ds.h>
 
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
@@ -40,6 +41,18 @@ struct cpu_entry_area {
 	 */
 	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
 #endif
+#ifdef CONFIG_CPU_SUP_INTEL
+	/*
+	 * Per CPU debug store for Intel performance monitoring.  Wastes a
+	 * full page at the moment.
+	 */
+	struct debug_store cpu_debug_store;
+	/*
+	 * The actual PEBS/BTS buffers must be mapped to user space.
+	 * Reserve enough fixmap PTEs.
+	 */
+	struct debug_store_buffers cpu_debug_buffers;
+#endif
 };
 
 #define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 800104c8a3ed..07cdd1715705 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -197,11 +197,12 @@
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index ec8be07c0cda..13c5ee878a47 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
 
 	desc->type = (info->read_exec_only ^ 1) << 1;
 	desc->type |= info->contents << 2;
+	/* Set the ACCESS bit so it can be mapped RO */
+	desc->type |= 1;
 
 	desc->s = 1;
 	desc->dpl = 0x3;
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 14d6d5007314..b027633e7300 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -50,6 +50,12 @@
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 #endif
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI		0
+#else
+# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -60,7 +66,7 @@
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
-#define DISABLED_MASK7	0
+#define DISABLED_MASK7	(DISABLE_PTI)
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK10	0
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000000..62a9f4966b42
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+	char	bts_buffer[BTS_BUFFER_SIZE];
+	char	pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5ede7cae1d67..c931b88982a0 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -50,10 +50,33 @@ struct ldt_struct {
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * allocations, but it's not worth trying to optimize.
 	 */
 	struct desc_struct	*entries;
 	unsigned int		nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the address
+	 * given by ldt_slot_va(slot).  We use two slots so that we can
+	 * allocate and map, and enable a new LDT without invalidating the
+	 * mapping of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int			slot;
 };
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
 }
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
 {
 	return 0;
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 */
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
+			 */
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
+	}
 #else
 	clear_LDT();
 #endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 
 #ifdef CONFIG_X86_64
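The two-slot scheme in the ldt_struct comment above is a double-buffer flip: a new LDT goes into whichever alias slot the previous LDT is not occupying, so the old mapping stays live until the CPU is pointed at the new one. A minimal user-space sketch of just the slot choice (the struct and helper are illustrative stand-ins, not the kernel's actual ldt.c code):

#include <stdio.h>

/* Stripped-down stand-in for struct ldt_struct: only the slot field
 * matters here; -1 means "no alias mapped yet". */
struct ldt { int slot; };

/* Pick the alias slot for a new LDT: the one the previous LDT is not
 * occupying, so its mapping stays valid until the switch-over. */
static int choose_slot(const struct ldt *old)
{
	return old ? !old->slot : 0;
}

int main(void)
{
	struct ldt first = { .slot = -1 }, second = { .slot = -1 };

	first.slot = choose_slot(NULL);		/* first LDT -> slot 0 */
	second.slot = choose_slot(&first);	/* replacement -> slot 1 */
	first.slot = choose_slot(&second);	/* next flip -> slot 0 again */
	printf("%d %d\n", second.slot, first.slot);	/* prints: 1 0 */
	return 0;
}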
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 4b5e1eafada7..aff42e1da6ee 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
 */
 extern gfp_t __userpte_alloc_gfp;
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
 /*
  * Allocate and free page tables.
 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 95e2dfd75521..e42b8943cb1a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
 void ptdump_walk_pgd_level_checkwx(void);
 
 #ifdef CONFIG_DEBUG_WX
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 
 static inline int p4d_bad(p4d_t p4d)
 {
-	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (p4d_flags(p4d) & ~ignore_flags) != 0;
 }
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	unsigned long ignore_flags = _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
 * pgd_offset() returns a (pgd_t *)
 * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
 */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
 /*
 * a shortcut which implies the use of the kernel's pgd, instead
 * of a process's
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
 */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
	memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/* Clone the user space pgd as well */
+	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+	       count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e9f05331e732..81462e9a34f6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) | |||
131 | #endif | 131 | #endif |
132 | } | 132 | } |
133 | 133 | ||
134 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
135 | /* | ||
136 | * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages | ||
137 | * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and | ||
138 | * the user one is in the last 4k. To switch between them, you | ||
139 | * just need to flip the 12th bit in their addresses. | ||
140 | */ | ||
141 | #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT | ||
142 | |||
143 | /* | ||
144 | * This generates better code than the inline assembly in | ||
145 | * __set_bit(). | ||
146 | */ | ||
147 | static inline void *ptr_set_bit(void *ptr, int bit) | ||
148 | { | ||
149 | unsigned long __ptr = (unsigned long)ptr; | ||
150 | |||
151 | __ptr |= BIT(bit); | ||
152 | return (void *)__ptr; | ||
153 | } | ||
154 | static inline void *ptr_clear_bit(void *ptr, int bit) | ||
155 | { | ||
156 | unsigned long __ptr = (unsigned long)ptr; | ||
157 | |||
158 | __ptr &= ~BIT(bit); | ||
159 | return (void *)__ptr; | ||
160 | } | ||
161 | |||
162 | static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) | ||
163 | { | ||
164 | return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); | ||
165 | } | ||
166 | |||
167 | static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) | ||
168 | { | ||
169 | return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); | ||
170 | } | ||
171 | |||
172 | static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) | ||
173 | { | ||
174 | return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); | ||
175 | } | ||
176 | |||
177 | static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) | ||
178 | { | ||
179 | return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); | ||
180 | } | ||
181 | #endif /* CONFIG_PAGE_TABLE_ISOLATION */ | ||
182 | |||
183 | /* | ||
184 | * Page table pages are page-aligned. The lower half of the top | ||
185 | * level is used for userspace and the top half for the kernel. | ||
186 | * | ||
187 | * Returns true for parts of the PGD that map userspace and | ||
188 | * false for the parts that map the kernel. | ||
189 | */ | ||
190 | static inline bool pgdp_maps_userspace(void *__ptr) | ||
191 | { | ||
192 | unsigned long ptr = (unsigned long)__ptr; | ||
193 | |||
194 | return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); | ||
195 | } | ||
196 | |||
197 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
198 | pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); | ||
199 | |||
200 | /* | ||
201 | * Take a PGD location (pgdp) and a pgd value that needs to be set there. | ||
202 | * Populates the user PGD entry and returns the resulting PGD that must |||
203 | * be set in the kernel copy of the page tables. |||
204 | */ | ||
205 | static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) | ||
206 | { | ||
207 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
208 | return pgd; | ||
209 | return __pti_set_user_pgd(pgdp, pgd); | ||
210 | } | ||
211 | #else | ||
212 | static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) | ||
213 | { | ||
214 | return pgd; | ||
215 | } | ||
216 | #endif | ||
217 | |||
134 | static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) | 218 | static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) |
135 | { | 219 | { |
220 | #if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) | ||
221 | p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); | ||
222 | #else | ||
136 | *p4dp = p4d; | 223 | *p4dp = p4d; |
224 | #endif | ||
137 | } | 225 | } |
138 | 226 | ||
139 | static inline void native_p4d_clear(p4d_t *p4d) | 227 | static inline void native_p4d_clear(p4d_t *p4d) |
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d) | |||
147 | 235 | ||
148 | static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) | 236 | static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
149 | { | 237 | { |
238 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
239 | *pgdp = pti_set_user_pgd(pgdp, pgd); | ||
240 | #else | ||
150 | *pgdp = pgd; | 241 | *pgdp = pgd; |
242 | #endif | ||
151 | } | 243 | } |
152 | 244 | ||
153 | static inline void native_pgd_clear(pgd_t *pgd) | 245 | static inline void native_pgd_clear(pgd_t *pgd) |
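The kernel_to_user_pgdp()/user_to_kernel_pgdp() helpers above are pure address arithmetic on an 8k-aligned, 8k-sized table. A hedged, standalone check of that bit flip; the _Alignas buffer below merely stands in for a real order-1 PGD allocation, and PAGE_SHIFT = 12 as on x86:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define BIT(n) (1UL << (n))

    static inline void *ptr_set_bit(void *ptr, int bit)
    {
            return (void *)((uintptr_t)ptr | BIT(bit));
    }

    static inline void *ptr_clear_bit(void *ptr, int bit)
    {
            return (void *)((uintptr_t)ptr & ~BIT(bit));
    }

    int main(void)
    {
            /* 8k-aligned, 8k-sized stand-in for a PTI top-level table */
            static _Alignas(8192) char pgd[8192];

            char *user = ptr_set_bit(pgd, PAGE_SHIFT);
            assert(user == pgd + 4096);                 /* user half = second 4k */
            assert(ptr_clear_bit(user, PAGE_SHIFT) == (void *)pgd);
            return 0;
    }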
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3d27831bc58d..b97a539bcdee 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h | |||
@@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t; | |||
79 | #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) | 79 | #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) |
80 | 80 | ||
81 | #ifdef CONFIG_X86_5LEVEL | 81 | #ifdef CONFIG_X86_5LEVEL |
82 | # define VMALLOC_SIZE_TB _AC(16384, UL) | 82 | # define VMALLOC_SIZE_TB _AC(12800, UL) |
83 | # define __VMALLOC_BASE _AC(0xff92000000000000, UL) | 83 | # define __VMALLOC_BASE _AC(0xffa0000000000000, UL) |
84 | # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) | 84 | # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) |
85 | # define LDT_PGD_ENTRY _AC(-112, UL) | ||
86 | # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) | ||
85 | #else | 87 | #else |
86 | # define VMALLOC_SIZE_TB _AC(32, UL) | 88 | # define VMALLOC_SIZE_TB _AC(32, UL) |
87 | # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) | 89 | # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) |
88 | # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) | 90 | # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) |
91 | # define LDT_PGD_ENTRY _AC(-4, UL) | ||
92 | # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) | ||
89 | #endif | 93 | #endif |
90 | 94 | ||
91 | #ifdef CONFIG_RANDOMIZE_MEMORY | 95 | #ifdef CONFIG_RANDOMIZE_MEMORY |
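LDT_BASE_ADDR is just the signed PGD slot number scaled by the size one PGD entry maps. A quick sanity check for the 4-level case, assuming a 64-bit unsigned long and PGDIR_SHIFT = 39 (the shift value itself is not shown in this hunk):

    #include <stdio.h>

    int main(void)
    {
            long ldt_pgd_entry = -4;    /* 4-level value from the hunk above */
            int pgdir_shift = 39;       /* each 4-level PGD entry maps 512 GiB */
            unsigned long base = (unsigned long)ldt_pgd_entry << pgdir_shift;

            /* 0xfffffe0000000000; with -112 and shift 48 (5-level):
             * 0xff90000000000000 */
            printf("LDT_BASE_ADDR = 0x%lx\n", base);
            return 0;
    }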
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 43212a43ee69..6a60fea90b9d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h | |||
@@ -38,6 +38,11 @@ | |||
38 | #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) | 38 | #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) |
39 | #define CR3_PCID_MASK 0xFFFull | 39 | #define CR3_PCID_MASK 0xFFFull |
40 | #define CR3_NOFLUSH BIT_ULL(63) | 40 | #define CR3_NOFLUSH BIT_ULL(63) |
41 | |||
42 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
43 | # define X86_CR3_PTI_SWITCH_BIT 11 | ||
44 | #endif | ||
45 | |||
41 | #else | 46 | #else |
42 | /* | 47 | /* |
43 | * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save | 48 | * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cad8dab266bc..d3a67fba200a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -852,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x) | |||
852 | 852 | ||
853 | #else | 853 | #else |
854 | /* | 854 | /* |
855 | * User space process size. 47bits minus one guard page. The guard | 855 | * User space process size. This is the first address outside the user range. |
856 | * page is necessary on Intel CPUs: if a SYSCALL instruction is at | 856 | * There are a few constraints that determine this: |
857 | * the highest possible canonical userspace address, then that | 857 | * |
858 | * syscall will enter the kernel with a non-canonical return | 858 | * On Intel CPUs, if a SYSCALL instruction is at the highest canonical |
859 | * address, and SYSRET will explode dangerously. We avoid this | 859 | * address, then that syscall will enter the kernel with a |
860 | * particular problem by preventing anything from being mapped | 860 | * non-canonical return address, and SYSRET will explode dangerously. |
861 | * at the maximum canonical address. | 861 | * We avoid this particular problem by preventing anything executable |
862 | * from being mapped at the maximum canonical address. | ||
863 | * | ||
864 | * On AMD CPUs in the Ryzen family, there's a nasty bug in which the | ||
865 | * CPUs malfunction if they execute code from the highest canonical page. | ||
866 | * They'll speculate right off the end of the canonical space, and | ||
867 | * bad things happen. This is worked around in the same way as the | ||
868 | * Intel problem. | ||
869 | * | ||
870 | * With page table isolation enabled, we map the LDT in ... [stay tuned] | ||
862 | */ | 871 | */ |
863 | #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) | 872 | #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) |
864 | 873 | ||
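For the common 4-level case __VIRTUAL_MASK_SHIFT is 47, so the guard page costs exactly one page at the top of the canonical user range. A worked computation; the shift value is an assumption not visible in this hunk:

    #include <stdio.h>

    int main(void)
    {
            int virtual_mask_shift = 47;      /* 4-level paging */
            unsigned long page_size = 4096;
            unsigned long task_size_max =
                    (1UL << virtual_mask_shift) - page_size;

            printf("TASK_SIZE_MAX = 0x%lx\n", task_size_max); /* 0x7ffffffff000 */
            return 0;
    }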
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h new file mode 100644 index 000000000000..0b5ef05b2d2d --- /dev/null +++ b/arch/x86/include/asm/pti.h | |||
@@ -0,0 +1,14 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #ifndef _ASM_X86_PTI_H | ||
3 | #define _ASM_X86_PTI_H | ||
4 | #ifndef __ASSEMBLY__ | ||
5 | |||
6 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
7 | extern void pti_init(void); | ||
8 | extern void pti_check_boottime_disable(void); | ||
9 | #else | ||
10 | static inline void pti_check_boottime_disable(void) { } | ||
11 | #endif | ||
12 | |||
13 | #endif /* __ASSEMBLY__ */ | ||
14 | #endif /* _ASM_X86_PTI_H */ | ||
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e1884cf35257..f68f9c836cca 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h | |||
@@ -10,38 +10,90 @@ | |||
10 | #include <asm/special_insns.h> | 10 | #include <asm/special_insns.h> |
11 | #include <asm/smp.h> | 11 | #include <asm/smp.h> |
12 | #include <asm/invpcid.h> | 12 | #include <asm/invpcid.h> |
13 | #include <asm/pti.h> | ||
14 | #include <asm/processor-flags.h> | ||
13 | 15 | ||
14 | static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) | 16 | /* |
15 | { | 17 | * The x86 feature is called PCID (Process Context IDentifier). It is similar |
16 | /* | 18 | * to what is traditionally called ASID on the RISC processors. |
17 | * Bump the generation count. This also serves as a full barrier | 19 | * |
18 | * that synchronizes with switch_mm(): callers are required to order | 20 | * We don't use the traditional ASID implementation, where each process/mm gets |
19 | * their read of mm_cpumask after their writes to the paging | 21 | * its own ASID and flush/restart when we run out of ASID space. |
20 | * structures. | 22 | * |
21 | */ | 23 | * Instead we have a small per-cpu array of ASIDs and cache the last few mm's |
22 | return atomic64_inc_return(&mm->context.tlb_gen); | 24 | * that came by on this CPU, allowing cheaper switch_mm between processes on |
23 | } | 25 | * this CPU. |
26 | * | ||
27 | * We end up with different spaces for different things. To avoid confusion we | ||
28 | * use different names for each of them: | ||
29 | * | ||
30 | * ASID - [0, TLB_NR_DYN_ASIDS-1] | ||
31 | * the canonical identifier for an mm | ||
32 | * | ||
33 | * kPCID - [1, TLB_NR_DYN_ASIDS] | ||
34 | * the value we write into the PCID part of CR3; corresponds to the | ||
35 | * ASID+1, because PCID 0 is special. | ||
36 | * | ||
37 | * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] | ||
38 | * for KPTI each mm has two address spaces and thus needs two | ||
39 | * PCID values, but we can still make do with a single ASID denomination |||
40 | * for each mm. Corresponds to kPCID + 2048. | ||
41 | * | ||
42 | */ | ||
24 | 43 | ||
25 | /* There are 12 bits of space for ASIDS in CR3 */ | 44 | /* There are 12 bits of space for ASIDS in CR3 */ |
26 | #define CR3_HW_ASID_BITS 12 | 45 | #define CR3_HW_ASID_BITS 12 |
46 | |||
27 | /* | 47 | /* |
28 | * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for | 48 | * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for |
29 | * user/kernel switches | 49 | * user/kernel switches |
30 | */ | 50 | */ |
31 | #define PTI_CONSUMED_ASID_BITS 0 | 51 | #ifdef CONFIG_PAGE_TABLE_ISOLATION |
52 | # define PTI_CONSUMED_PCID_BITS 1 | ||
53 | #else | ||
54 | # define PTI_CONSUMED_PCID_BITS 0 | ||
55 | #endif | ||
56 | |||
57 | #define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) | ||
32 | 58 | ||
33 | #define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) | ||
34 | /* | 59 | /* |
35 | * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account | 60 | * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account |
36 | * for them being zero-based. Another -1 is because ASID 0 is reserved for | 61 | * for them being zero-based. Another -1 is because PCID 0 is reserved for |
37 | * use by non-PCID-aware users. | 62 | * use by non-PCID-aware users. |
38 | */ | 63 | */ |
39 | #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) | 64 | #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) |
40 | 65 | ||
66 | /* | ||
67 | * 6 because 6 should be plenty and struct tlb_state will fit in two cache | ||
68 | * lines. | ||
69 | */ | ||
70 | #define TLB_NR_DYN_ASIDS 6 | ||
71 | |||
72 | /* | ||
73 | * Given @asid, compute kPCID | ||
74 | */ | ||
41 | static inline u16 kern_pcid(u16 asid) | 75 | static inline u16 kern_pcid(u16 asid) |
42 | { | 76 | { |
43 | VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); | 77 | VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); |
78 | |||
79 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
80 | /* | ||
81 | * Make sure that the dynamic ASID space does not conflict with the |||
82 | * bit we are using to switch between user and kernel ASIDs. | ||
83 | */ | ||
84 | BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); | ||
85 | |||
86 | /* | ||
87 | * The ASID being passed in here should have respected the | ||
88 | * MAX_ASID_AVAILABLE and thus never have the switch bit set. | ||
89 | */ | ||
90 | VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); | ||
91 | #endif | ||
44 | /* | 92 | /* |
93 | * The dynamically-assigned ASIDs that get passed in are small | ||
94 | * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, | ||
95 | * so do not bother to clear it. | ||
96 | * | ||
45 | * If PCID is on, ASID-aware code paths put the ASID+1 into the | 97 | * If PCID is on, ASID-aware code paths put the ASID+1 into the |
46 | * PCID bits. This serves two purposes. It prevents a nasty | 98 | * PCID bits. This serves two purposes. It prevents a nasty |
47 | * situation in which PCID-unaware code saves CR3, loads some other | 99 | * situation in which PCID-unaware code saves CR3, loads some other |
@@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid) | |||
53 | return asid + 1; | 105 | return asid + 1; |
54 | } | 106 | } |
55 | 107 | ||
108 | /* | ||
109 | * Given @asid, compute uPCID | ||
110 | */ | ||
111 | static inline u16 user_pcid(u16 asid) | ||
112 | { | ||
113 | u16 ret = kern_pcid(asid); | ||
114 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
115 | ret |= 1 << X86_CR3_PTI_SWITCH_BIT; | ||
116 | #endif | ||
117 | return ret; | ||
118 | } | ||
119 | |||
56 | struct pgd_t; | 120 | struct pgd_t; |
57 | static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) | 121 | static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) |
58 | { | 122 | { |
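The kPCID/uPCID arithmetic above is easy to check in isolation. A hedged sketch of the two helpers with the kernel's asserts stripped out; X86_CR3_PTI_SWITCH_BIT = 11 as defined earlier in this series:

    #include <stdint.h>
    #include <stdio.h>

    #define X86_CR3_PTI_SWITCH_BIT 11
    #define TLB_NR_DYN_ASIDS 6

    static uint16_t kern_pcid(uint16_t asid) { return asid + 1; }

    static uint16_t user_pcid(uint16_t asid)
    {
            return kern_pcid(asid) | (1 << X86_CR3_PTI_SWITCH_BIT);
    }

    int main(void)
    {
            for (uint16_t asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
                    printf("ASID %u -> kPCID %u, uPCID %u\n",
                           asid, kern_pcid(asid), user_pcid(asid));
            /* ASID 0 -> kPCID 1, uPCID 2049 ... ASID 5 -> kPCID 6, uPCID 2054 */
            return 0;
    }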
@@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void) | |||
95 | return !static_cpu_has(X86_FEATURE_PCID); | 159 | return !static_cpu_has(X86_FEATURE_PCID); |
96 | } | 160 | } |
97 | 161 | ||
98 | /* | ||
99 | * 6 because 6 should be plenty and struct tlb_state will fit in | ||
100 | * two cache lines. | ||
101 | */ | ||
102 | #define TLB_NR_DYN_ASIDS 6 | ||
103 | |||
104 | struct tlb_context { | 162 | struct tlb_context { |
105 | u64 ctx_id; | 163 | u64 ctx_id; |
106 | u64 tlb_gen; | 164 | u64 tlb_gen; |
@@ -135,6 +193,24 @@ struct tlb_state { | |||
135 | bool is_lazy; | 193 | bool is_lazy; |
136 | 194 | ||
137 | /* | 195 | /* |
196 | * If set we changed the page tables in such a way that we | ||
197 | * needed an invalidation of all contexts (aka. PCIDs / ASIDs). | ||
198 | * This tells us to go invalidate all the non-loaded ctxs[] | ||
199 | * on the next context switch. | ||
200 | * | ||
201 | * The current ctx was kept up-to-date as it ran and does not | ||
202 | * need to be invalidated. | ||
203 | */ | ||
204 | bool invalidate_other; | ||
205 | |||
206 | /* | ||
207 | * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate | ||
208 | * the corresponding user PCID needs a flush next time we | ||
209 | * switch to it; see SWITCH_TO_USER_CR3. | ||
210 | */ | ||
211 | unsigned short user_pcid_flush_mask; | ||
212 | |||
213 | /* | ||
138 | * Access to this CR4 shadow and to H/W CR4 is protected by | 214 | * Access to this CR4 shadow and to H/W CR4 is protected by |
139 | * disabling interrupts when modifying either one. | 215 | * disabling interrupts when modifying either one. |
140 | */ | 216 | */ |
@@ -215,6 +291,14 @@ static inline unsigned long cr4_read_shadow(void) | |||
215 | } | 291 | } |
216 | 292 | ||
217 | /* | 293 | /* |
294 | * Mark all other ASIDs as invalid, preserves the current. | ||
295 | */ | ||
296 | static inline void invalidate_other_asid(void) | ||
297 | { | ||
298 | this_cpu_write(cpu_tlbstate.invalidate_other, true); | ||
299 | } | ||
300 | |||
301 | /* | ||
218 | * Save some of cr4 feature set we're using (e.g. Pentium 4MB | 302 | * Save some of cr4 feature set we're using (e.g. Pentium 4MB |
219 | * enable and PPro Global page enable), so that any CPU's that boot | 303 | * enable and PPro Global page enable), so that any CPU's that boot |
220 | * up after us can get the correct flags. This should only be used | 304 | * up after us can get the correct flags. This should only be used |
@@ -234,14 +318,41 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) | |||
234 | extern void initialize_tlbstate_and_flush(void); | 318 | extern void initialize_tlbstate_and_flush(void); |
235 | 319 | ||
236 | /* | 320 | /* |
321 | * Given an ASID, flush the corresponding user ASID. We can delay this | ||
322 | * until the next time we switch to it. | ||
323 | * | ||
324 | * See SWITCH_TO_USER_CR3. | ||
325 | */ | ||
326 | static inline void invalidate_user_asid(u16 asid) | ||
327 | { | ||
328 | /* There is no user ASID if address space separation is off */ | ||
329 | if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) | ||
330 | return; | ||
331 | |||
332 | /* | ||
333 | * We only have a single ASID if PCID is off and the CR3 | ||
334 | * write will have flushed it. | ||
335 | */ | ||
336 | if (!cpu_feature_enabled(X86_FEATURE_PCID)) | ||
337 | return; | ||
338 | |||
339 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
340 | return; | ||
341 | |||
342 | __set_bit(kern_pcid(asid), | ||
343 | (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); | ||
344 | } | ||
345 | |||
346 | /* | ||
237 | * flush the entire current user mapping | 347 | * flush the entire current user mapping |
238 | */ | 348 | */ |
239 | static inline void __native_flush_tlb(void) | 349 | static inline void __native_flush_tlb(void) |
240 | { | 350 | { |
351 | invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); | ||
241 | /* | 352 | /* |
242 | * If current->mm == NULL then we borrow a mm which may change during a | 353 | * If current->mm == NULL then we borrow a mm which may change |
243 | * task switch and therefore we must not be preempted while we write CR3 | 354 | * during a task switch and therefore we must not be preempted |
244 | * back: | 355 | * while we write CR3 back: |
245 | */ | 356 | */ |
246 | preempt_disable(); | 357 | preempt_disable(); |
247 | native_write_cr3(__native_read_cr3()); | 358 | native_write_cr3(__native_read_cr3()); |
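user_pcid_flush_mask implements a deferred flush: rather than flushing a user PCID immediately, invalidate_user_asid() records it and SWITCH_TO_USER_CR3 pays the cost on the next switch to that context. A simplified single-CPU model of that bookkeeping; the kernel keeps the mask per-cpu and sets bits via __set_bit:

    #include <stdbool.h>
    #include <stdint.h>

    static uint16_t user_pcid_flush_mask;   /* kernel: per-cpu, one bit per kPCID */

    static void invalidate_user_asid(uint16_t asid)
    {
            /* kern_pcid(asid) == asid + 1, so bit 0 stays unused */
            user_pcid_flush_mask |= 1u << (asid + 1);
    }

    /* checked (and cleared) on the next switch to user mode for @asid */
    static bool need_user_flush(uint16_t asid)
    {
            uint16_t bit = 1u << (asid + 1);
            bool need = user_pcid_flush_mask & bit;

            user_pcid_flush_mask &= ~bit;   /* flush at most once */
            return need;
    }

    int main(void)
    {
            invalidate_user_asid(3);
            return (need_user_flush(3) && !need_user_flush(3)) ? 0 : 1;
    }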
@@ -259,6 +370,8 @@ static inline void __native_flush_tlb_global(void) | |||
259 | /* | 370 | /* |
260 | * Using INVPCID is considerably faster than a pair of writes | 371 | * Using INVPCID is considerably faster than a pair of writes |
261 | * to CR4 sandwiched inside an IRQ flag save/restore. | 372 | * to CR4 sandwiched inside an IRQ flag save/restore. |
373 | * | ||
374 | * Note, this works with CR4.PCIDE=0 or 1. | ||
262 | */ | 375 | */ |
263 | invpcid_flush_all(); | 376 | invpcid_flush_all(); |
264 | return; | 377 | return; |
@@ -285,7 +398,21 @@ static inline void __native_flush_tlb_global(void) | |||
285 | */ | 398 | */ |
286 | static inline void __native_flush_tlb_single(unsigned long addr) | 399 | static inline void __native_flush_tlb_single(unsigned long addr) |
287 | { | 400 | { |
401 | u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); | ||
402 | |||
288 | asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); | 403 | asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
404 | |||
405 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
406 | return; | ||
407 | |||
408 | /* | ||
409 | * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. | ||
410 | * Just use invalidate_user_asid() in case we are called early. | ||
411 | */ | ||
412 | if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) | ||
413 | invalidate_user_asid(loaded_mm_asid); | ||
414 | else | ||
415 | invpcid_flush_one(user_pcid(loaded_mm_asid), addr); | ||
289 | } | 416 | } |
290 | 417 | ||
291 | /* | 418 | /* |
@@ -301,14 +428,6 @@ static inline void __flush_tlb_all(void) | |||
301 | */ | 428 | */ |
302 | __flush_tlb(); | 429 | __flush_tlb(); |
303 | } | 430 | } |
304 | |||
305 | /* | ||
306 | * Note: if we somehow had PCID but not PGE, then this wouldn't work -- | ||
307 | * we'd end up flushing kernel translations for the current ASID but | ||
308 | * we might fail to flush kernel translations for other cached ASIDs. | ||
309 | * | ||
310 | * To avoid this issue, we force PCID off if PGE is off. | ||
311 | */ | ||
312 | } | 431 | } |
313 | 432 | ||
314 | /* | 433 | /* |
@@ -318,6 +437,16 @@ static inline void __flush_tlb_one(unsigned long addr) | |||
318 | { | 437 | { |
319 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); | 438 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); |
320 | __flush_tlb_single(addr); | 439 | __flush_tlb_single(addr); |
440 | |||
441 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
442 | return; | ||
443 | |||
444 | /* | ||
445 | * __flush_tlb_single() will have cleared the TLB entry for this ASID, | ||
446 | * but since kernel space is replicated across all ASIDs, we must |||
447 | * also invalidate all of the others. |||
448 | */ | ||
449 | invalidate_other_asid(); | ||
321 | } | 450 | } |
322 | 451 | ||
323 | #define TLB_FLUSH_ALL -1UL | 452 | #define TLB_FLUSH_ALL -1UL |
@@ -378,6 +507,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) | |||
378 | void native_flush_tlb_others(const struct cpumask *cpumask, | 507 | void native_flush_tlb_others(const struct cpumask *cpumask, |
379 | const struct flush_tlb_info *info); | 508 | const struct flush_tlb_info *info); |
380 | 509 | ||
510 | static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) | ||
511 | { | ||
512 | /* | ||
513 | * Bump the generation count. This also serves as a full barrier | ||
514 | * that synchronizes with switch_mm(): callers are required to order | ||
515 | * their read of mm_cpumask after their writes to the paging | ||
516 | * structures. | ||
517 | */ | ||
518 | return atomic64_inc_return(&mm->context.tlb_gen); | ||
519 | } | ||
520 | |||
381 | static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, | 521 | static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, |
382 | struct mm_struct *mm) | 522 | struct mm_struct *mm) |
383 | { | 523 | { |
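inc_mm_tlb_gen() moved here unchanged; it is a per-mm generation counter whose increment doubles as a full memory barrier. A hedged model using C11 atomics in place of the kernel's atomic64_t:

    #include <stdatomic.h>
    #include <stdint.h>

    struct mm_context { _Atomic uint64_t tlb_gen; };

    /*
     * Bump the generation; the seq_cst RMW stands in for the full-barrier
     * semantics the kernel comment relies on for ordering against
     * mm_cpumask reads.
     */
    static uint64_t inc_mm_tlb_gen(struct mm_context *ctx)
    {
            return atomic_fetch_add(&ctx->tlb_gen, 1) + 1;
    }

    int main(void)
    {
            struct mm_context ctx = { 0 };
            return inc_mm_tlb_gen(&ctx) == 1 ? 0 : 1;
    }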
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d9a7c659009c..b986b2ca688a 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h | |||
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | #ifdef CONFIG_X86_VSYSCALL_EMULATION | 8 | #ifdef CONFIG_X86_VSYSCALL_EMULATION |
9 | extern void map_vsyscall(void); | 9 | extern void map_vsyscall(void); |
10 | extern void set_vsyscall_pgtable_user_bits(pgd_t *root); | ||
10 | 11 | ||
11 | /* | 12 | /* |
12 | * Called on instruction fetch fault in vsyscall page. | 13 | * Called on instruction fetch fault in vsyscall page. |
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 7e1e730396ae..bcba3c643e63 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h | |||
@@ -78,7 +78,12 @@ | |||
78 | #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) | 78 | #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) |
79 | #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ | 79 | #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ |
80 | #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) | 80 | #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) |
81 | #define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ | 81 | |
82 | #define X86_CR3_PCID_BITS 12 | ||
83 | #define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) | ||
84 | |||
85 | #define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ | ||
86 | #define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) | ||
82 | 87 | ||
83 | /* | 88 | /* |
84 | * Intel CPU features in CR4 | 89 | * Intel CPU features in CR4 |
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 676b7cf4b62b..76417a9aab73 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <asm/sigframe.h> | 17 | #include <asm/sigframe.h> |
18 | #include <asm/bootparam.h> | 18 | #include <asm/bootparam.h> |
19 | #include <asm/suspend.h> | 19 | #include <asm/suspend.h> |
20 | #include <asm/tlbflush.h> | ||
20 | 21 | ||
21 | #ifdef CONFIG_XEN | 22 | #ifdef CONFIG_XEN |
22 | #include <xen/interface/xen.h> | 23 | #include <xen/interface/xen.h> |
@@ -94,6 +95,9 @@ void common(void) { | |||
94 | BLANK(); | 95 | BLANK(); |
95 | DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); | 96 | DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); |
96 | 97 | ||
98 | /* TLB state for the entry code */ | ||
99 | OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); | ||
100 | |||
97 | /* Layout info for cpu_entry_area */ | 101 | /* Layout info for cpu_entry_area */ |
98 | OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); | 102 | OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); |
99 | OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); | 103 | OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9757f07d738..c47de4ebf63a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -922,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) | |||
922 | } | 922 | } |
923 | 923 | ||
924 | setup_force_cpu_cap(X86_FEATURE_ALWAYS); | 924 | setup_force_cpu_cap(X86_FEATURE_ALWAYS); |
925 | |||
926 | /* Assume for now that ALL x86 CPUs are insecure */ | ||
927 | setup_force_cpu_bug(X86_BUG_CPU_INSECURE); | ||
928 | |||
925 | fpu__init_system(c); | 929 | fpu__init_system(c); |
926 | 930 | ||
927 | #ifdef CONFIG_X86_32 | 931 | #ifdef CONFIG_X86_32 |
@@ -1360,7 +1364,10 @@ void syscall_init(void) | |||
1360 | (entry_SYSCALL_64_trampoline - _entry_trampoline); | 1364 | (entry_SYSCALL_64_trampoline - _entry_trampoline); |
1361 | 1365 | ||
1362 | wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); | 1366 | wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); |
1363 | wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); | 1367 | if (static_cpu_has(X86_FEATURE_PTI)) |
1368 | wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); | ||
1369 | else | ||
1370 | wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); | ||
1364 | 1371 | ||
1365 | #ifdef CONFIG_IA32_EMULATION | 1372 | #ifdef CONFIG_IA32_EMULATION |
1366 | wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); | 1373 | wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 36b17e0febe8..5fa110699ed2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err) | |||
297 | unsigned long sp; | 297 | unsigned long sp; |
298 | #endif | 298 | #endif |
299 | printk(KERN_DEFAULT | 299 | printk(KERN_DEFAULT |
300 | "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, | 300 | "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, |
301 | IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", | 301 | IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", |
302 | IS_ENABLED(CONFIG_SMP) ? " SMP" : "", | 302 | IS_ENABLED(CONFIG_SMP) ? " SMP" : "", |
303 | debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", | 303 | debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", |
304 | IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); | 304 | IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", |
305 | IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? | ||
306 | (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); | ||
305 | 307 | ||
306 | if (notify_die(DIE_OOPS, str, regs, err, | 308 | if (notify_die(DIE_OOPS, str, regs, err, |
307 | current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) | 309 | current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7dca675fe78d..04a625f0fcda 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag) | |||
341 | .balign PAGE_SIZE; \ | 341 | .balign PAGE_SIZE; \ |
342 | GLOBAL(name) | 342 | GLOBAL(name) |
343 | 343 | ||
344 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
345 | /* | ||
346 | * Each PGD needs to be 8k long and 8k aligned. We do not | ||
347 | * ever go out to userspace with these, so we do not | ||
348 | * strictly *need* the second page, but this allows us to | ||
349 | * have a single set_pgd() implementation that does not | ||
350 | * need to worry about whether it has 4k or 8k to work | ||
351 | * with. | ||
352 | * | ||
353 | * This ensures PGDs are 8k long: | ||
354 | */ | ||
355 | #define PTI_USER_PGD_FILL 512 | ||
356 | /* This ensures they are 8k-aligned: */ | ||
357 | #define NEXT_PGD_PAGE(name) \ | ||
358 | .balign 2 * PAGE_SIZE; \ | ||
359 | GLOBAL(name) | ||
360 | #else | ||
361 | #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) | ||
362 | #define PTI_USER_PGD_FILL 0 | ||
363 | #endif | ||
364 | |||
344 | /* Automate the creation of 1 to 1 mapping pmd entries */ | 365 | /* Automate the creation of 1 to 1 mapping pmd entries */ |
345 | #define PMDS(START, PERM, COUNT) \ | 366 | #define PMDS(START, PERM, COUNT) \ |
346 | i = 0 ; \ | 367 | i = 0 ; \ |
@@ -350,13 +371,14 @@ GLOBAL(name) | |||
350 | .endr | 371 | .endr |
351 | 372 | ||
352 | __INITDATA | 373 | __INITDATA |
353 | NEXT_PAGE(early_top_pgt) | 374 | NEXT_PGD_PAGE(early_top_pgt) |
354 | .fill 511,8,0 | 375 | .fill 511,8,0 |
355 | #ifdef CONFIG_X86_5LEVEL | 376 | #ifdef CONFIG_X86_5LEVEL |
356 | .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC | 377 | .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC |
357 | #else | 378 | #else |
358 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC | 379 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC |
359 | #endif | 380 | #endif |
381 | .fill PTI_USER_PGD_FILL,8,0 | ||
360 | 382 | ||
361 | NEXT_PAGE(early_dynamic_pgts) | 383 | NEXT_PAGE(early_dynamic_pgts) |
362 | .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 | 384 | .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 |
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts) | |||
364 | .data | 386 | .data |
365 | 387 | ||
366 | #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) | 388 | #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) |
367 | NEXT_PAGE(init_top_pgt) | 389 | NEXT_PGD_PAGE(init_top_pgt) |
368 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC | 390 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC |
369 | .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 | 391 | .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 |
370 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC | 392 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC |
371 | .org init_top_pgt + PGD_START_KERNEL*8, 0 | 393 | .org init_top_pgt + PGD_START_KERNEL*8, 0 |
372 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | 394 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
373 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC | 395 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC |
396 | .fill PTI_USER_PGD_FILL,8,0 | ||
374 | 397 | ||
375 | NEXT_PAGE(level3_ident_pgt) | 398 | NEXT_PAGE(level3_ident_pgt) |
376 | .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC | 399 | .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC |
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt) | |||
381 | */ | 404 | */ |
382 | PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) | 405 | PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
383 | #else | 406 | #else |
384 | NEXT_PAGE(init_top_pgt) | 407 | NEXT_PGD_PAGE(init_top_pgt) |
385 | .fill 512,8,0 | 408 | .fill 512,8,0 |
409 | .fill PTI_USER_PGD_FILL,8,0 | ||
386 | #endif | 410 | #endif |
387 | 411 | ||
388 | #ifdef CONFIG_X86_5LEVEL | 412 | #ifdef CONFIG_X86_5LEVEL |
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a6b5d62f45a7..579cc4a66fdf 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
25 | 25 | ||
26 | #include <asm/ldt.h> | 26 | #include <asm/ldt.h> |
27 | #include <asm/tlb.h> | ||
27 | #include <asm/desc.h> | 28 | #include <asm/desc.h> |
28 | #include <asm/mmu_context.h> | 29 | #include <asm/mmu_context.h> |
29 | #include <asm/syscalls.h> | 30 | #include <asm/syscalls.h> |
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void) | |||
51 | static void flush_ldt(void *__mm) | 52 | static void flush_ldt(void *__mm) |
52 | { | 53 | { |
53 | struct mm_struct *mm = __mm; | 54 | struct mm_struct *mm = __mm; |
54 | mm_context_t *pc; | ||
55 | 55 | ||
56 | if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) | 56 | if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) |
57 | return; | 57 | return; |
58 | 58 | ||
59 | pc = &mm->context; | 59 | load_mm_ldt(mm); |
60 | set_ldt(pc->ldt->entries, pc->ldt->nr_entries); | ||
61 | 60 | ||
62 | refresh_ldt_segments(); | 61 | refresh_ldt_segments(); |
63 | } | 62 | } |
@@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) | |||
94 | return NULL; | 93 | return NULL; |
95 | } | 94 | } |
96 | 95 | ||
96 | /* The new LDT isn't aliased for PTI yet. */ | ||
97 | new_ldt->slot = -1; | ||
98 | |||
97 | new_ldt->nr_entries = num_entries; | 99 | new_ldt->nr_entries = num_entries; |
98 | return new_ldt; | 100 | return new_ldt; |
99 | } | 101 | } |
100 | 102 | ||
103 | /* | ||
104 | * If PTI is enabled, this maps the LDT into the kernelmode and | ||
105 | * usermode tables for the given mm. | ||
106 | * | ||
107 | * There is no corresponding unmap function. Even if the LDT is freed, we | ||
108 | * leave the PTEs around until the slot is reused or the mm is destroyed. | ||
109 | * This is harmless: the LDT is always in ordinary memory, and no one will | ||
110 | * access the freed slot. | ||
111 | * | ||
112 | * If we wanted to unmap freed LDTs, we'd also need to do a flush to make | ||
113 | * it useful, and the flush would slow down modify_ldt(). | ||
114 | */ | ||
115 | static int | ||
116 | map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) | ||
117 | { | ||
118 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
119 | bool is_vmalloc, had_top_level_entry; | ||
120 | unsigned long va; | ||
121 | spinlock_t *ptl; | ||
122 | pgd_t *pgd; | ||
123 | int i; | ||
124 | |||
125 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
126 | return 0; | ||
127 | |||
128 | /* | ||
129 | * Any given ldt_struct should have map_ldt_struct() called at most | ||
130 | * once. | ||
131 | */ | ||
132 | WARN_ON(ldt->slot != -1); | ||
133 | |||
134 | /* | ||
135 | * Did we already have the top level entry allocated? We can't | ||
136 | * use pgd_none() for this because it doesn't do anything on |||
137 | * 4-level page table kernels. | ||
138 | */ | ||
139 | pgd = pgd_offset(mm, LDT_BASE_ADDR); | ||
140 | had_top_level_entry = (pgd->pgd != 0); | ||
141 | |||
142 | is_vmalloc = is_vmalloc_addr(ldt->entries); | ||
143 | |||
144 | for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { | ||
145 | unsigned long offset = i << PAGE_SHIFT; | ||
146 | const void *src = (char *)ldt->entries + offset; | ||
147 | unsigned long pfn; | ||
148 | pte_t pte, *ptep; | ||
149 | |||
150 | va = (unsigned long)ldt_slot_va(slot) + offset; | ||
151 | pfn = is_vmalloc ? vmalloc_to_pfn(src) : | ||
152 | page_to_pfn(virt_to_page(src)); | ||
153 | /* | ||
154 | * Treat the PTI LDT range as a *userspace* range. | ||
155 | * get_locked_pte() will allocate all needed pagetables | ||
156 | * and account for them in this mm. | ||
157 | */ | ||
158 | ptep = get_locked_pte(mm, va, &ptl); | ||
159 | if (!ptep) | ||
160 | return -ENOMEM; | ||
161 | /* | ||
162 | * Map it RO so this easy-to-find address is not a primary |||
163 | * target for an attack via some kernel interface which |||
164 | * misses a permission check. |||
165 | */ | ||
166 | pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); | ||
167 | set_pte_at(mm, va, ptep, pte); | ||
168 | pte_unmap_unlock(ptep, ptl); | ||
169 | } | ||
170 | |||
171 | if (mm->context.ldt) { | ||
172 | /* | ||
173 | * We already had an LDT. The top-level entry should already | ||
174 | * have been allocated and synchronized with the usermode | ||
175 | * tables. | ||
176 | */ | ||
177 | WARN_ON(!had_top_level_entry); | ||
178 | if (static_cpu_has(X86_FEATURE_PTI)) | ||
179 | WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); | ||
180 | } else { | ||
181 | /* | ||
182 | * This is the first time we're mapping an LDT for this process. | ||
183 | * Sync the pgd to the usermode tables. | ||
184 | */ | ||
185 | WARN_ON(had_top_level_entry); | ||
186 | if (static_cpu_has(X86_FEATURE_PTI)) { | ||
187 | WARN_ON(kernel_to_user_pgdp(pgd)->pgd); | ||
188 | set_pgd(kernel_to_user_pgdp(pgd), *pgd); | ||
189 | } | ||
190 | } | ||
191 | |||
192 | va = (unsigned long)ldt_slot_va(slot); | ||
193 | flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); | ||
194 | |||
195 | ldt->slot = slot; | ||
196 | #endif | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | static void free_ldt_pgtables(struct mm_struct *mm) | ||
201 | { | ||
202 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
203 | struct mmu_gather tlb; | ||
204 | unsigned long start = LDT_BASE_ADDR; | ||
205 | unsigned long end = start + (1UL << PGDIR_SHIFT); | ||
206 | |||
207 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
208 | return; | ||
209 | |||
210 | tlb_gather_mmu(&tlb, mm, start, end); | ||
211 | free_pgd_range(&tlb, start, end, start, end); | ||
212 | tlb_finish_mmu(&tlb, start, end); | ||
213 | #endif | ||
214 | } | ||
215 | |||
101 | /* After calling this, the LDT is immutable. */ | 216 | /* After calling this, the LDT is immutable. */ |
102 | static void finalize_ldt_struct(struct ldt_struct *ldt) | 217 | static void finalize_ldt_struct(struct ldt_struct *ldt) |
103 | { | 218 | { |
@@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) | |||
156 | new_ldt->nr_entries * LDT_ENTRY_SIZE); | 271 | new_ldt->nr_entries * LDT_ENTRY_SIZE); |
157 | finalize_ldt_struct(new_ldt); | 272 | finalize_ldt_struct(new_ldt); |
158 | 273 | ||
274 | retval = map_ldt_struct(mm, new_ldt, 0); | ||
275 | if (retval) { | ||
276 | free_ldt_pgtables(mm); | ||
277 | free_ldt_struct(new_ldt); | ||
278 | goto out_unlock; | ||
279 | } | ||
159 | mm->context.ldt = new_ldt; | 280 | mm->context.ldt = new_ldt; |
160 | 281 | ||
161 | out_unlock: | 282 | out_unlock: |
@@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm) | |||
174 | mm->context.ldt = NULL; | 295 | mm->context.ldt = NULL; |
175 | } | 296 | } |
176 | 297 | ||
298 | void ldt_arch_exit_mmap(struct mm_struct *mm) | ||
299 | { | ||
300 | free_ldt_pgtables(mm); | ||
301 | } | ||
302 | |||
177 | static int read_ldt(void __user *ptr, unsigned long bytecount) | 303 | static int read_ldt(void __user *ptr, unsigned long bytecount) |
178 | { | 304 | { |
179 | struct mm_struct *mm = current->mm; | 305 | struct mm_struct *mm = current->mm; |
@@ -287,6 +413,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) | |||
287 | new_ldt->entries[ldt_info.entry_number] = ldt; | 413 | new_ldt->entries[ldt_info.entry_number] = ldt; |
288 | finalize_ldt_struct(new_ldt); | 414 | finalize_ldt_struct(new_ldt); |
289 | 415 | ||
416 | /* | ||
417 | * If we are using PTI, map the new LDT into the userspace pagetables. | ||
418 | * If there is already an LDT, use the other slot so that other CPUs | ||
419 | * will continue to use the old LDT until install_ldt() switches | ||
420 | * them over to the new LDT. | ||
421 | */ | ||
422 | error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); | ||
423 | if (error) { | ||
424 | free_ldt_struct(old_ldt); | ||
425 | goto out_unlock; | ||
426 | } | ||
427 | |||
290 | install_ldt(mm, new_ldt); | 428 | install_ldt(mm, new_ldt); |
291 | free_ldt_struct(old_ldt); | 429 | free_ldt_struct(old_ldt); |
292 | error = 0; | 430 | error = 0; |
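The slot selection in write_ldt() deserves a second look: the first LDT for an mm lands in slot 0, and each later modify_ldt() maps the replacement into the other slot, so CPUs still running on the old LDT keep a valid mapping until install_ldt() switches them over. A tiny model of just that expression:

    #include <assert.h>
    #include <stddef.h>

    struct ldt_struct { int slot; };

    static int pick_slot(const struct ldt_struct *old_ldt)
    {
            return old_ldt ? !old_ldt->slot : 0;
    }

    int main(void)
    {
            struct ldt_struct a = { .slot = 0 }, b = { .slot = 1 };

            assert(pick_slot(NULL) == 0);   /* first LDT: slot 0 */
            assert(pick_slot(&a) == 1);     /* replacing slot 0 -> slot 1 */
            assert(pick_slot(&b) == 0);     /* replacing slot 1 -> slot 0 */
            return 0;
    }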
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9a9c9b076955..a5b802a12212 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c | |||
@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx, | |||
93 | cpu = get_cpu(); | 93 | cpu = get_cpu(); |
94 | 94 | ||
95 | while (n-- > 0) { | 95 | while (n-- > 0) { |
96 | if (LDT_empty(info) || LDT_zero(info)) { | 96 | if (LDT_empty(info) || LDT_zero(info)) |
97 | memset(desc, 0, sizeof(*desc)); | 97 | memset(desc, 0, sizeof(*desc)); |
98 | } else { | 98 | else |
99 | fill_ldt(desc, info); | 99 | fill_ldt(desc, info); |
100 | |||
101 | /* | ||
102 | * Always set the accessed bit so that the CPU | ||
103 | * doesn't try to write to the (read-only) GDT. | ||
104 | */ | ||
105 | desc->type |= 1; | ||
106 | } | ||
107 | ++info; | 100 | ++info; |
108 | ++desc; | 101 | ++desc; |
109 | } | 102 | } |
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d2a8b5a24a44..1e413a9326aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -61,11 +61,17 @@ jiffies_64 = jiffies; | |||
61 | . = ALIGN(HPAGE_SIZE); \ | 61 | . = ALIGN(HPAGE_SIZE); \ |
62 | __end_rodata_hpage_align = .; | 62 | __end_rodata_hpage_align = .; |
63 | 63 | ||
64 | #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); | ||
65 | #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); | ||
66 | |||
64 | #else | 67 | #else |
65 | 68 | ||
66 | #define X64_ALIGN_RODATA_BEGIN | 69 | #define X64_ALIGN_RODATA_BEGIN |
67 | #define X64_ALIGN_RODATA_END | 70 | #define X64_ALIGN_RODATA_END |
68 | 71 | ||
72 | #define ALIGN_ENTRY_TEXT_BEGIN | ||
73 | #define ALIGN_ENTRY_TEXT_END | ||
74 | |||
69 | #endif | 75 | #endif |
70 | 76 | ||
71 | PHDRS { | 77 | PHDRS { |
@@ -102,8 +108,10 @@ SECTIONS | |||
102 | CPUIDLE_TEXT | 108 | CPUIDLE_TEXT |
103 | LOCK_TEXT | 109 | LOCK_TEXT |
104 | KPROBES_TEXT | 110 | KPROBES_TEXT |
111 | ALIGN_ENTRY_TEXT_BEGIN | ||
105 | ENTRY_TEXT | 112 | ENTRY_TEXT |
106 | IRQENTRY_TEXT | 113 | IRQENTRY_TEXT |
114 | ALIGN_ENTRY_TEXT_END | ||
107 | SOFTIRQENTRY_TEXT | 115 | SOFTIRQENTRY_TEXT |
108 | *(.fixup) | 116 | *(.fixup) |
109 | *(.gnu.warning) | 117 | *(.gnu.warning) |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 52195ee3f6d5..27e9e90a8d35 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o | |||
41 | obj-$(CONFIG_ACPI_NUMA) += srat.o | 41 | obj-$(CONFIG_ACPI_NUMA) += srat.o |
42 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | 42 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
43 | 43 | ||
44 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o | 44 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o |
45 | obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o | 45 | obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o |
46 | obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o | 46 | obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o |
47 | obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o | ||
47 | 48 | ||
48 | obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o | 49 | obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o |
49 | obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o | 50 | obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o |
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index fe814fd5e014..b9283cc27622 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c | |||
@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) | |||
38 | cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); | 38 | cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); |
39 | } | 39 | } |
40 | 40 | ||
41 | static void percpu_setup_debug_store(int cpu) | ||
42 | { | ||
43 | #ifdef CONFIG_CPU_SUP_INTEL | ||
44 | int npages; | ||
45 | void *cea; | ||
46 | |||
47 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) | ||
48 | return; | ||
49 | |||
50 | cea = &get_cpu_entry_area(cpu)->cpu_debug_store; | ||
51 | npages = sizeof(struct debug_store) / PAGE_SIZE; | ||
52 | BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); | ||
53 | cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, | ||
54 | PAGE_KERNEL); | ||
55 | |||
56 | cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; | ||
57 | /* | ||
58 | * Force the population of PMDs for per-cpu memory that is not |||
59 | * yet allocated, like the debug store buffers. |||
60 | */ | ||
61 | npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; | ||
62 | for (; npages; npages--, cea += PAGE_SIZE) | ||
63 | cea_set_pte(cea, 0, PAGE_NONE); | ||
64 | #endif | ||
65 | } | ||
66 | |||
41 | /* Setup the fixmap mappings only once per-processor */ | 67 | /* Setup the fixmap mappings only once per-processor */ |
42 | static void __init setup_cpu_entry_area(int cpu) | 68 | static void __init setup_cpu_entry_area(int cpu) |
43 | { | 69 | { |
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu) | |||
109 | cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, | 135 | cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, |
110 | __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); | 136 | __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); |
111 | #endif | 137 | #endif |
138 | percpu_setup_debug_store(cpu); | ||
112 | } | 139 | } |
113 | 140 | ||
114 | static __init void setup_cpu_entry_area_ptes(void) | 141 | static __init void setup_cpu_entry_area_ptes(void) |
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index bfcffdf6c577..421f2664ffa0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c | |||
@@ -5,7 +5,7 @@ | |||
5 | 5 | ||
6 | static int ptdump_show(struct seq_file *m, void *v) | 6 | static int ptdump_show(struct seq_file *m, void *v) |
7 | { | 7 | { |
8 | ptdump_walk_pgd_level(m, NULL); | 8 | ptdump_walk_pgd_level_debugfs(m, NULL, false); |
9 | return 0; | 9 | return 0; |
10 | } | 10 | } |
11 | 11 | ||
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = { | |||
22 | .release = single_release, | 22 | .release = single_release, |
23 | }; | 23 | }; |
24 | 24 | ||
25 | static struct dentry *pe; | 25 | static int ptdump_show_curknl(struct seq_file *m, void *v) |
26 | { | ||
27 | if (current->mm->pgd) { | ||
28 | down_read(¤t->mm->mmap_sem); | ||
29 | ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); | ||
30 | up_read(¤t->mm->mmap_sem); | ||
31 | } | ||
32 | return 0; | ||
33 | } | ||
34 | |||
35 | static int ptdump_open_curknl(struct inode *inode, struct file *filp) | ||
36 | { | ||
37 | return single_open(filp, ptdump_show_curknl, NULL); | ||
38 | } | ||
39 | |||
40 | static const struct file_operations ptdump_curknl_fops = { | ||
41 | .owner = THIS_MODULE, | ||
42 | .open = ptdump_open_curknl, | ||
43 | .read = seq_read, | ||
44 | .llseek = seq_lseek, | ||
45 | .release = single_release, | ||
46 | }; | ||
47 | |||
48 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
49 | static struct dentry *pe_curusr; | ||
50 | |||
51 | static int ptdump_show_curusr(struct seq_file *m, void *v) | ||
52 | { | ||
53 | if (current->mm->pgd) { | ||
54 | down_read(¤t->mm->mmap_sem); | ||
55 | ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); | ||
56 | up_read(¤t->mm->mmap_sem); | ||
57 | } | ||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | static int ptdump_open_curusr(struct inode *inode, struct file *filp) | ||
62 | { | ||
63 | return single_open(filp, ptdump_show_curusr, NULL); | ||
64 | } | ||
65 | |||
66 | static const struct file_operations ptdump_curusr_fops = { | ||
67 | .owner = THIS_MODULE, | ||
68 | .open = ptdump_open_curusr, | ||
69 | .read = seq_read, | ||
70 | .llseek = seq_lseek, | ||
71 | .release = single_release, | ||
72 | }; | ||
73 | #endif | ||
74 | |||
75 | static struct dentry *dir, *pe_knl, *pe_curknl; | ||
26 | 76 | ||
27 | static int __init pt_dump_debug_init(void) | 77 | static int __init pt_dump_debug_init(void) |
28 | { | 78 | { |
29 | pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, | 79 | dir = debugfs_create_dir("page_tables", NULL); |
30 | &ptdump_fops); | 80 | if (!dir) |
31 | if (!pe) | ||
32 | return -ENOMEM; | 81 | return -ENOMEM; |
33 | 82 | ||
83 | pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, | ||
84 | &ptdump_fops); | ||
85 | if (!pe_knl) | ||
86 | goto err; | ||
87 | |||
88 | pe_curknl = debugfs_create_file("current_kernel", 0400, | ||
89 | dir, NULL, &ptdump_curknl_fops); | ||
90 | if (!pe_curknl) | ||
91 | goto err; | ||
92 | |||
93 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
94 | pe_curusr = debugfs_create_file("current_user", 0400, | ||
95 | dir, NULL, &ptdump_curusr_fops); | ||
96 | if (!pe_curusr) | ||
97 | goto err; | ||
98 | #endif | ||
34 | return 0; | 99 | return 0; |
100 | err: | ||
101 | debugfs_remove_recursive(dir); | ||
102 | return -ENOMEM; | ||
35 | } | 103 | } |
36 | 104 | ||
37 | static void __exit pt_dump_debug_exit(void) | 105 | static void __exit pt_dump_debug_exit(void) |
38 | { | 106 | { |
39 | debugfs_remove_recursive(pe); | 107 | debugfs_remove_recursive(dir); |
40 | } | 108 | } |
41 | 109 | ||
42 | module_init(pt_dump_debug_init); | 110 | module_init(pt_dump_debug_init); |
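With these hooks in place the page tables become inspectable from userspace. A minimal reader, assuming debugfs is mounted at /sys/kernel/debug and the kernel was built with CONFIG_PAGE_TABLE_ISOLATION (the current_user file only exists in that case); needs root:

    #include <stdio.h>

    int main(void)
    {
            /* path created by pt_dump_debug_init() above */
            FILE *f = fopen("/sys/kernel/debug/page_tables/current_user", "r");
            char line[256];

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }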
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 43dedbfb7257..f56902c1f04b 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c | |||
@@ -52,12 +52,18 @@ enum address_markers_idx { | |||
52 | USER_SPACE_NR = 0, | 52 | USER_SPACE_NR = 0, |
53 | KERNEL_SPACE_NR, | 53 | KERNEL_SPACE_NR, |
54 | LOW_KERNEL_NR, | 54 | LOW_KERNEL_NR, |
55 | #if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) | ||
56 | LDT_NR, | ||
57 | #endif | ||
55 | VMALLOC_START_NR, | 58 | VMALLOC_START_NR, |
56 | VMEMMAP_START_NR, | 59 | VMEMMAP_START_NR, |
57 | #ifdef CONFIG_KASAN | 60 | #ifdef CONFIG_KASAN |
58 | KASAN_SHADOW_START_NR, | 61 | KASAN_SHADOW_START_NR, |
59 | KASAN_SHADOW_END_NR, | 62 | KASAN_SHADOW_END_NR, |
60 | #endif | 63 | #endif |
64 | #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) | ||
65 | LDT_NR, | ||
66 | #endif | ||
61 | CPU_ENTRY_AREA_NR, | 67 | CPU_ENTRY_AREA_NR, |
62 | #ifdef CONFIG_X86_ESPFIX64 | 68 | #ifdef CONFIG_X86_ESPFIX64 |
63 | ESPFIX_START_NR, | 69 | ESPFIX_START_NR, |
@@ -82,6 +88,9 @@ static struct addr_marker address_markers[] = { | |||
82 | [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, | 88 | [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, |
83 | [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, | 89 | [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, |
84 | #endif | 90 | #endif |
91 | #ifdef CONFIG_MODIFY_LDT_SYSCALL | ||
92 | [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, | ||
93 | #endif | ||
85 | [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, | 94 | [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, |
86 | #ifdef CONFIG_X86_ESPFIX64 | 95 | #ifdef CONFIG_X86_ESPFIX64 |
87 | [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, | 96 | [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, |
@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx) | |||
467 | } | 476 | } |
468 | 477 | ||
469 | static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, | 478 | static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, |
470 | bool checkwx) | 479 | bool checkwx, bool dmesg) |
471 | { | 480 | { |
472 | #ifdef CONFIG_X86_64 | 481 | #ifdef CONFIG_X86_64 |
473 | pgd_t *start = (pgd_t *) &init_top_pgt; | 482 | pgd_t *start = (pgd_t *) &init_top_pgt; |
@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, | |||
480 | 489 | ||
481 | if (pgd) { | 490 | if (pgd) { |
482 | start = pgd; | 491 | start = pgd; |
483 | st.to_dmesg = true; | 492 | st.to_dmesg = dmesg; |
484 | } | 493 | } |
485 | 494 | ||
486 | st.check_wx = checkwx; | 495 | st.check_wx = checkwx; |
@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, | |||
518 | 527 | ||
519 | void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) | 528 | void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) |
520 | { | 529 | { |
521 | ptdump_walk_pgd_level_core(m, pgd, false); | 530 | ptdump_walk_pgd_level_core(m, pgd, false, true); |
531 | } | ||
532 | |||
533 | void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) | ||
534 | { | ||
535 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
536 | if (user && static_cpu_has(X86_FEATURE_PTI)) | ||
537 | pgd = kernel_to_user_pgdp(pgd); | ||
538 | #endif | ||
539 | ptdump_walk_pgd_level_core(m, pgd, false, false); | ||
540 | } | ||
541 | EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); | ||
542 | |||
543 | static void ptdump_walk_user_pgd_level_checkwx(void) | ||
544 | { | ||
545 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
546 | pgd_t *pgd = (pgd_t *) &init_top_pgt; | ||
547 | |||
548 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
549 | return; | ||
550 | |||
551 | pr_info("x86/mm: Checking user space page tables\n"); | ||
552 | pgd = kernel_to_user_pgdp(pgd); | ||
553 | ptdump_walk_pgd_level_core(NULL, pgd, true, false); | ||
554 | #endif | ||
522 | } | 555 | } |
523 | EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); | ||
524 | 556 | ||
525 | void ptdump_walk_pgd_level_checkwx(void) | 557 | void ptdump_walk_pgd_level_checkwx(void) |
526 | { | 558 | { |
527 | ptdump_walk_pgd_level_core(NULL, NULL, true); | 559 | ptdump_walk_pgd_level_core(NULL, NULL, true, false); |
560 | ptdump_walk_user_pgd_level_checkwx(); | ||
528 | } | 561 | } |
529 | 562 | ||
530 | static int __init pt_dump_init(void) | 563 | static int __init pt_dump_init(void) |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6fdf91ef130a..8ca324d07282 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <asm/kaslr.h> | 20 | #include <asm/kaslr.h> |
21 | #include <asm/hypervisor.h> | 21 | #include <asm/hypervisor.h> |
22 | #include <asm/cpufeature.h> | 22 | #include <asm/cpufeature.h> |
23 | #include <asm/pti.h> | ||
23 | 24 | ||
24 | /* | 25 | /* |
25 | * We need to define the tracepoints somewhere, and tlb.c | 26 | * We need to define the tracepoints somewhere, and tlb.c |
@@ -160,6 +161,12 @@ struct map_range { | |||
160 | 161 | ||
161 | static int page_size_mask; | 162 | static int page_size_mask; |
162 | 163 | ||
164 | static void enable_global_pages(void) | ||
165 | { | ||
166 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
167 | __supported_pte_mask |= _PAGE_GLOBAL; | ||
168 | } | ||
169 | |||
163 | static void __init probe_page_size_mask(void) | 170 | static void __init probe_page_size_mask(void) |
164 | { | 171 | { |
165 | /* | 172 | /* |
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void) | |||
177 | cr4_set_bits_and_update_boot(X86_CR4_PSE); | 184 | cr4_set_bits_and_update_boot(X86_CR4_PSE); |
178 | 185 | ||
179 | /* Enable PGE if available */ | 186 | /* Enable PGE if available */ |
187 | __supported_pte_mask &= ~_PAGE_GLOBAL; | ||
180 | if (boot_cpu_has(X86_FEATURE_PGE)) { | 188 | if (boot_cpu_has(X86_FEATURE_PGE)) { |
181 | cr4_set_bits_and_update_boot(X86_CR4_PGE); | 189 | cr4_set_bits_and_update_boot(X86_CR4_PGE); |
182 | __supported_pte_mask |= _PAGE_GLOBAL; | 190 | enable_global_pages(); |
183 | } else | 191 | } |
184 | __supported_pte_mask &= ~_PAGE_GLOBAL; | ||
185 | 192 | ||
186 | /* Enable 1 GB linear kernel mappings if available: */ | 193 | /* Enable 1 GB linear kernel mappings if available: */ |
187 | if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { | 194 | if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { |
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void) | |||
194 | 201 | ||
195 | static void setup_pcid(void) | 202 | static void setup_pcid(void) |
196 | { | 203 | { |
197 | #ifdef CONFIG_X86_64 | 204 | if (!IS_ENABLED(CONFIG_X86_64)) |
198 | if (boot_cpu_has(X86_FEATURE_PCID)) { | 205 | return; |
199 | if (boot_cpu_has(X86_FEATURE_PGE)) { | 206 | |
200 | /* | 207 | if (!boot_cpu_has(X86_FEATURE_PCID)) |
201 | * This can't be cr4_set_bits_and_update_boot() -- | 208 | return; |
202 | * the trampoline code can't handle CR4.PCIDE and | 209 | |
203 | * it wouldn't do any good anyway. Despite the name, | 210 | if (boot_cpu_has(X86_FEATURE_PGE)) { |
204 | * cr4_set_bits_and_update_boot() doesn't actually | 211 | /* |
205 | * cause the bits in question to remain set all the | 212 | * This can't be cr4_set_bits_and_update_boot() -- the |
206 | * way through the secondary boot asm. | 213 | * trampoline code can't handle CR4.PCIDE and it wouldn't |
207 | * | 214 | * do any good anyway. Despite the name, |
208 | * Instead, we brute-force it and set CR4.PCIDE | 215 | * cr4_set_bits_and_update_boot() doesn't actually cause |
209 | * manually in start_secondary(). | 216 | * the bits in question to remain set all the way through |
210 | */ | 217 | * the secondary boot asm. |
211 | cr4_set_bits(X86_CR4_PCIDE); | 218 | * |
212 | } else { | 219 | * Instead, we brute-force it and set CR4.PCIDE manually in |
213 | /* | 220 | * start_secondary(). |
214 | * flush_tlb_all(), as currently implemented, won't | 221 | */ |
215 | * work if PCID is on but PGE is not. Since that | 222 | cr4_set_bits(X86_CR4_PCIDE); |
216 | * combination doesn't exist on real hardware, there's | 223 | |
217 | * no reason to try to fully support it, but it's | 224 | /* |
218 | * polite to avoid corrupting data if we're on | 225 | * INVPCID's single-context modes (2/3) only work if we set |
219 | * an improperly configured VM. | 226 | * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable |
220 | */ | 227 | * on systems that have X86_CR4_PCIDE clear, or that have |
221 | setup_clear_cpu_cap(X86_FEATURE_PCID); | 228 | * no INVPCID support at all. |
222 | } | 229 | */ |
230 | if (boot_cpu_has(X86_FEATURE_INVPCID)) | ||
231 | setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); | ||
232 | } else { | ||
233 | /* | ||
234 | * flush_tlb_all(), as currently implemented, won't work if | ||
235 | * PCID is on but PGE is not. Since that combination | ||
236 | * doesn't exist on real hardware, there's no reason to try | ||
237 | * to fully support it, but it's polite to avoid corrupting | ||
238 | * data if we're on an improperly configured VM. | ||
239 | */ | ||
240 | setup_clear_cpu_cap(X86_FEATURE_PCID); | ||
223 | } | 241 | } |
224 | #endif | ||
225 | } | 242 | } |
226 | 243 | ||
227 | #ifdef CONFIG_X86_32 | 244 | #ifdef CONFIG_X86_32 |
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void) | |||
622 | { | 639 | { |
623 | unsigned long end; | 640 | unsigned long end; |
624 | 641 | ||
642 | pti_check_boottime_disable(); | ||
625 | probe_page_size_mask(); | 643 | probe_page_size_mask(); |
626 | setup_pcid(); | 644 | setup_pcid(); |
627 | 645 | ||
@@ -845,7 +863,7 @@ void __init zone_sizes_init(void) | |||
845 | free_area_init_nodes(max_zone_pfns); | 863 | free_area_init_nodes(max_zone_pfns); |
846 | } | 864 | } |
847 | 865 | ||
848 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { | 866 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { |
849 | .loaded_mm = &init_mm, | 867 | .loaded_mm = &init_mm, |
850 | .next_asid = 1, | 868 | .next_asid = 1, |
851 | .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ | 869 | .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ |
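To make the interplay in the init.c hunk above concrete: with PTI active, kernel mappings must not be _PAGE_GLOBAL (they would otherwise survive in the TLB across the user/kernel CR3 switch), and INVPCID's single-context modes only become usable once CR4.PCIDE is set. Below is a minimal userspace sketch of that decision logic, with stubbed feature flags standing in for boot_cpu_has(); the struct and helper names are illustrative, not kernel API:

#include <stdbool.h>
#include <stdio.h>

struct cpu { bool pge, pcid, invpcid, pti; };

static void decide(struct cpu c)
{
	bool global_pages   = c.pge && !c.pti;        /* enable_global_pages() */
	bool cr4_pcide      = c.pge && c.pcid;        /* setup_pcid() */
	bool invpcid_single = cr4_pcide && c.invpcid; /* X86_FEATURE_INVPCID_SINGLE */

	printf("pti=%d -> global=%d pcide=%d invpcid_single=%d\n",
	       c.pti, global_pages, cr4_pcide, invpcid_single);
}

int main(void)
{
	decide((struct cpu){ .pge = 1, .pcid = 1, .invpcid = 1, .pti = 1 });
	decide((struct cpu){ .pge = 1, .pcid = 1, .invpcid = 1, .pti = 0 });
	return 0;
}

Compiled and run, the first line shows PTI suppressing global pages even though PGE is available, while PCID and INVPCID_SINGLE remain usable.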
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 96d456a94b03..004abf9ebf12 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd) | |||
355 | kmem_cache_free(pgd_cache, pgd); | 355 | kmem_cache_free(pgd_cache, pgd); |
356 | } | 356 | } |
357 | #else | 357 | #else |
358 | |||
358 | static inline pgd_t *_pgd_alloc(void) | 359 | static inline pgd_t *_pgd_alloc(void) |
359 | { | 360 | { |
360 | return (pgd_t *)__get_free_page(PGALLOC_GFP); | 361 | return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); |
361 | } | 362 | } |
362 | 363 | ||
363 | static inline void _pgd_free(pgd_t *pgd) | 364 | static inline void _pgd_free(pgd_t *pgd) |
364 | { | 365 | { |
365 | free_page((unsigned long)pgd); | 366 | free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); |
366 | } | 367 | } |
367 | #endif /* CONFIG_X86_PAE */ | 368 | #endif /* CONFIG_X86_PAE */ |
368 | 369 | ||
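The switch from __get_free_page() to __get_free_pages(..., PGD_ALLOCATION_ORDER) reflects that, with CONFIG_PAGE_TABLE_ISOLATION, the PGD becomes an order-1 (two-page) allocation: the kernel PGD occupies the low 4k and the user-visible copy the high 4k, so converting between the two pointers is a single address-bit flip. A sketch of that arithmetic, assuming the bit in question is PAGE_SHIFT (bit 12), modeled on the x86 kernel_to_user_pgdp()/user_to_kernel_pgdp() helpers:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT     12
#define PTI_SWITCH_BIT PAGE_SHIFT /* assumed: pair members differ in bit 12 */

static uint64_t kernel_to_user_pgdp(uint64_t kpgd)
{
	return kpgd | (1ULL << PTI_SWITCH_BIT);
}

static uint64_t user_to_kernel_pgdp(uint64_t upgd)
{
	return upgd & ~(1ULL << PTI_SWITCH_BIT);
}

int main(void)
{
	uint64_t kpgd = 0x1234000ULL; /* 8k-aligned order-1 allocation */

	printf("kernel pgd %#llx <-> user pgd %#llx\n",
	       (unsigned long long)kpgd,
	       (unsigned long long)kernel_to_user_pgdp(kpgd));
	return 0;
}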
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000000..bce8aea65606 --- /dev/null +++ b/arch/x86/mm/pti.c | |||
@@ -0,0 +1,387 @@ | |||
1 | /* | ||
2 | * Copyright(c) 2017 Intel Corporation. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of version 2 of the GNU General Public License as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * This code is based in part on work published here: | ||
14 | * | ||
15 | * https://github.com/IAIK/KAISER | ||
16 | * | ||
17 | * The original work was written and signed off for the Linux | ||
18 | * kernel by: | ||
19 | * | ||
20 | * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> | ||
21 | * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> | ||
22 | * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> | ||
23 | * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> | ||
24 | * | ||
25 | * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> | ||
26 | * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and | ||
27 | * Andy Lutomirski <luto@amacapital.net> | ||
28 | */ | ||
29 | #include <linux/kernel.h> | ||
30 | #include <linux/errno.h> | ||
31 | #include <linux/string.h> | ||
32 | #include <linux/types.h> | ||
33 | #include <linux/bug.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/spinlock.h> | ||
36 | #include <linux/mm.h> | ||
37 | #include <linux/uaccess.h> | ||
38 | |||
39 | #include <asm/cpufeature.h> | ||
40 | #include <asm/hypervisor.h> | ||
41 | #include <asm/vsyscall.h> | ||
42 | #include <asm/cmdline.h> | ||
43 | #include <asm/pti.h> | ||
44 | #include <asm/pgtable.h> | ||
45 | #include <asm/pgalloc.h> | ||
46 | #include <asm/tlbflush.h> | ||
47 | #include <asm/desc.h> | ||
48 | |||
49 | #undef pr_fmt | ||
50 | #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt | ||
51 | |||
52 | /* Backporting helper */ | ||
53 | #ifndef __GFP_NOTRACK | ||
54 | #define __GFP_NOTRACK 0 | ||
55 | #endif | ||
56 | |||
57 | static void __init pti_print_if_insecure(const char *reason) | ||
58 | { | ||
59 | if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) | ||
60 | pr_info("%s\n", reason); | ||
61 | } | ||
62 | |||
63 | static void __init pti_print_if_secure(const char *reason) | ||
64 | { | ||
65 | if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) | ||
66 | pr_info("%s\n", reason); | ||
67 | } | ||
68 | |||
69 | void __init pti_check_boottime_disable(void) | ||
70 | { | ||
71 | char arg[5]; | ||
72 | int ret; | ||
73 | |||
74 | if (hypervisor_is_type(X86_HYPER_XEN_PV)) { | ||
75 | pti_print_if_insecure("disabled on XEN PV."); | ||
76 | return; | ||
77 | } | ||
78 | |||
79 | ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); | ||
80 | if (ret > 0) { | ||
81 | if (ret == 3 && !strncmp(arg, "off", 3)) { | ||
82 | pti_print_if_insecure("disabled on command line."); | ||
83 | return; | ||
84 | } | ||
85 | if (ret == 2 && !strncmp(arg, "on", 2)) { | ||
86 | pti_print_if_secure("force enabled on command line."); | ||
87 | goto enable; | ||
88 | } | ||
89 | if (ret == 4 && !strncmp(arg, "auto", 4)) | ||
90 | goto autosel; | ||
91 | } | ||
92 | |||
93 | if (cmdline_find_option_bool(boot_command_line, "nopti")) { | ||
94 | pti_print_if_insecure("disabled on command line."); | ||
95 | return; | ||
96 | } | ||
97 | |||
98 | autosel: | ||
99 | if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) | ||
100 | return; | ||
101 | enable: | ||
102 | setup_force_cpu_cap(X86_FEATURE_PTI); | ||
103 | } | ||
104 | |||
105 | pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) | ||
106 | { | ||
107 | /* | ||
108 | * Changes to the high (kernel) portion of the kernelmode page | ||
109 | * tables are not automatically propagated to the usermode tables. | ||
110 | * | ||
111 | * Users should keep in mind that, unlike the kernelmode tables, | ||
112 | * there is no vmalloc_fault equivalent for the usermode tables. | ||
113 | * Top-level entries added to init_mm's usermode pgd after boot | ||
114 | * will not be automatically propagated to other mms. | ||
115 | */ | ||
116 | if (!pgdp_maps_userspace(pgdp)) | ||
117 | return pgd; | ||
118 | |||
119 | /* | ||
120 | * The user page tables get the full PGD, accessible from | ||
121 | * userspace: | ||
122 | */ | ||
123 | kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; | ||
124 | |||
125 | /* | ||
126 | * If this is normal user memory, make it NX in the kernel | ||
127 | * pagetables so that, if we somehow screw up and return to | ||
128 | * usermode with the kernel CR3 loaded, we'll get a page fault | ||
129 | * instead of allowing user code to execute with the wrong CR3. | ||
130 | * | ||
131 | * As exceptions, we don't set NX if: | ||
132 | * - _PAGE_USER is not set. This could be an executable | ||
133 | * EFI runtime mapping or something similar, and the kernel | ||
134 | * may execute from it | ||
135 | * - we don't have NX support | ||
136 | * - we're clearing the PGD (i.e. the new pgd is not present). | ||
137 | */ | ||
138 | if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && | ||
139 | (__supported_pte_mask & _PAGE_NX)) | ||
140 | pgd.pgd |= _PAGE_NX; | ||
141 | |||
142 | /* return the copy of the PGD we want the kernel to use: */ | ||
143 | return pgd; | ||
144 | } | ||
145 | |||
146 | /* | ||
147 | * Walk the user copy of the page tables (optionally) trying to allocate | ||
148 | * page table pages on the way down. | ||
149 | * | ||
150 | * Returns a pointer to a P4D on success, or NULL on failure. | ||
151 | */ | ||
152 | static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) | ||
153 | { | ||
154 | pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); | ||
155 | gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); | ||
156 | |||
157 | if (address < PAGE_OFFSET) { | ||
158 | WARN_ONCE(1, "attempt to walk user address\n"); | ||
159 | return NULL; | ||
160 | } | ||
161 | |||
162 | if (pgd_none(*pgd)) { | ||
163 | unsigned long new_p4d_page = __get_free_page(gfp); | ||
164 | if (!new_p4d_page) | ||
165 | return NULL; | ||
166 | |||
167 | if (pgd_none(*pgd)) { | ||
168 | set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); | ||
169 | new_p4d_page = 0; | ||
170 | } | ||
171 | if (new_p4d_page) | ||
172 | free_page(new_p4d_page); | ||
173 | } | ||
174 | BUILD_BUG_ON(pgd_large(*pgd) != 0); | ||
175 | |||
176 | return p4d_offset(pgd, address); | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * Walk the user copy of the page tables (optionally) trying to allocate | ||
181 | * page table pages on the way down. | ||
182 | * | ||
183 | * Returns a pointer to a PMD on success, or NULL on failure. | ||
184 | */ | ||
185 | static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) | ||
186 | { | ||
187 | gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); | ||
188 | p4d_t *p4d = pti_user_pagetable_walk_p4d(address); | ||
189 | pud_t *pud; | ||
190 | |||
191 | BUILD_BUG_ON(p4d_large(*p4d) != 0); | ||
192 | if (p4d_none(*p4d)) { | ||
193 | unsigned long new_pud_page = __get_free_page(gfp); | ||
194 | if (!new_pud_page) | ||
195 | return NULL; | ||
196 | |||
197 | if (p4d_none(*p4d)) { | ||
198 | set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); | ||
199 | new_pud_page = 0; | ||
200 | } | ||
201 | if (new_pud_page) | ||
202 | free_page(new_pud_page); | ||
203 | } | ||
204 | |||
205 | pud = pud_offset(p4d, address); | ||
206 | /* The user page tables do not use large mappings: */ | ||
207 | if (pud_large(*pud)) { | ||
208 | WARN_ON(1); | ||
209 | return NULL; | ||
210 | } | ||
211 | if (pud_none(*pud)) { | ||
212 | unsigned long new_pmd_page = __get_free_page(gfp); | ||
213 | if (!new_pmd_page) | ||
214 | return NULL; | ||
215 | |||
216 | if (pud_none(*pud)) { | ||
217 | set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); | ||
218 | new_pmd_page = 0; | ||
219 | } | ||
220 | if (new_pmd_page) | ||
221 | free_page(new_pmd_page); | ||
222 | } | ||
223 | |||
224 | return pmd_offset(pud, address); | ||
225 | } | ||
226 | |||
227 | #ifdef CONFIG_X86_VSYSCALL_EMULATION | ||
228 | /* | ||
229 | * Walk the shadow copy of the page tables (optionally) trying to allocate | ||
230 | * page table pages on the way down. Does not support large pages. | ||
231 | * | ||
232 | * Note: this is only used when mapping *new* kernel data into the | ||
233 | * user/shadow page tables. It is never used for userspace data. | ||
234 | * | ||
235 | * Returns a pointer to a PTE on success, or NULL on failure. | ||
236 | */ | ||
237 | static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) | ||
238 | { | ||
239 | gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); | ||
240 | pmd_t *pmd = pti_user_pagetable_walk_pmd(address); | ||
241 | pte_t *pte; | ||
242 | |||
243 | /* We can't do anything sensible if we hit a large mapping. */ | ||
244 | if (pmd_large(*pmd)) { | ||
245 | WARN_ON(1); | ||
246 | return NULL; | ||
247 | } | ||
248 | |||
249 | if (pmd_none(*pmd)) { | ||
250 | unsigned long new_pte_page = __get_free_page(gfp); | ||
251 | if (!new_pte_page) | ||
252 | return NULL; | ||
253 | |||
254 | if (pmd_none(*pmd)) { | ||
255 | set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); | ||
256 | new_pte_page = 0; | ||
257 | } | ||
258 | if (new_pte_page) | ||
259 | free_page(new_pte_page); | ||
260 | } | ||
261 | |||
262 | pte = pte_offset_kernel(pmd, address); | ||
263 | if (pte_flags(*pte) & _PAGE_USER) { | ||
264 | WARN_ONCE(1, "attempt to walk to user pte\n"); | ||
265 | return NULL; | ||
266 | } | ||
267 | return pte; | ||
268 | } | ||
269 | |||
270 | static void __init pti_setup_vsyscall(void) | ||
271 | { | ||
272 | pte_t *pte, *target_pte; | ||
273 | unsigned int level; | ||
274 | |||
275 | pte = lookup_address(VSYSCALL_ADDR, &level); | ||
276 | if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) | ||
277 | return; | ||
278 | |||
279 | target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); | ||
280 | if (WARN_ON(!target_pte)) | ||
281 | return; | ||
282 | |||
283 | *target_pte = *pte; | ||
284 | set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); | ||
285 | } | ||
286 | #else | ||
287 | static void __init pti_setup_vsyscall(void) { } | ||
288 | #endif | ||
289 | |||
290 | static void __init | ||
291 | pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) | ||
292 | { | ||
293 | unsigned long addr; | ||
294 | |||
295 | /* | ||
296 | * Clone the populated PMDs which cover start to end. These PMD areas | ||
297 | * can have holes. | ||
298 | */ | ||
299 | for (addr = start; addr < end; addr += PMD_SIZE) { | ||
300 | pmd_t *pmd, *target_pmd; | ||
301 | pgd_t *pgd; | ||
302 | p4d_t *p4d; | ||
303 | pud_t *pud; | ||
304 | |||
305 | pgd = pgd_offset_k(addr); | ||
306 | if (WARN_ON(pgd_none(*pgd))) | ||
307 | return; | ||
308 | p4d = p4d_offset(pgd, addr); | ||
309 | if (WARN_ON(p4d_none(*p4d))) | ||
310 | return; | ||
311 | pud = pud_offset(p4d, addr); | ||
312 | if (pud_none(*pud)) | ||
313 | continue; | ||
314 | pmd = pmd_offset(pud, addr); | ||
315 | if (pmd_none(*pmd)) | ||
316 | continue; | ||
317 | |||
318 | target_pmd = pti_user_pagetable_walk_pmd(addr); | ||
319 | if (WARN_ON(!target_pmd)) | ||
320 | return; | ||
321 | |||
322 | /* | ||
323 | * Copy the PMD. That is, the kernelmode and usermode | ||
324 | * tables will share the last-level page tables of this | ||
325 | * address range | ||
326 | */ | ||
327 | *target_pmd = pmd_clear_flags(*pmd, clear); | ||
328 | } | ||
329 | } | ||
330 | |||
331 | /* | ||
332 | * Clone a single p4d (i.e. a top-level entry on 4-level systems and a | ||
333 | * next-level entry on 5-level systems). | ||
334 | */ | ||
335 | static void __init pti_clone_p4d(unsigned long addr) | ||
336 | { | ||
337 | p4d_t *kernel_p4d, *user_p4d; | ||
338 | pgd_t *kernel_pgd; | ||
339 | |||
340 | user_p4d = pti_user_pagetable_walk_p4d(addr); | ||
341 | kernel_pgd = pgd_offset_k(addr); | ||
342 | kernel_p4d = p4d_offset(kernel_pgd, addr); | ||
343 | *user_p4d = *kernel_p4d; | ||
344 | } | ||
345 | |||
346 | /* | ||
347 | * Clone the CPU_ENTRY_AREA into the user space visible page table. | ||
348 | */ | ||
349 | static void __init pti_clone_user_shared(void) | ||
350 | { | ||
351 | pti_clone_p4d(CPU_ENTRY_AREA_BASE); | ||
352 | } | ||
353 | |||
354 | /* | ||
355 | * Clone the ESPFIX P4D into the user space visible page table. | ||
356 | */ | ||
357 | static void __init pti_setup_espfix64(void) | ||
358 | { | ||
359 | #ifdef CONFIG_X86_ESPFIX64 | ||
360 | pti_clone_p4d(ESPFIX_BASE_ADDR); | ||
361 | #endif | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Clone the populated PMDs of the entry and irqentry text and force them RO. | ||
366 | */ | ||
367 | static void __init pti_clone_entry_text(void) | ||
368 | { | ||
369 | pti_clone_pmds((unsigned long) __entry_text_start, | ||
370 | (unsigned long) __irqentry_text_end, _PAGE_RW); | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Initialize kernel page table isolation | ||
375 | */ | ||
376 | void __init pti_init(void) | ||
377 | { | ||
378 | if (!static_cpu_has(X86_FEATURE_PTI)) | ||
379 | return; | ||
380 | |||
381 | pr_info("enabled\n"); | ||
382 | |||
383 | pti_clone_user_shared(); | ||
384 | pti_clone_entry_text(); | ||
385 | pti_setup_espfix64(); | ||
386 | pti_setup_vsyscall(); | ||
387 | } | ||
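The subtlest part of the new file is the NX poisoning in __pti_set_user_pgd(): the user tables get the entry verbatim, while the kernel copy of any user-accessible, present entry is marked NX, so a botched return to usermode on the kernel CR3 page-faults instead of executing user code. A standalone sketch of just that rule (the _PAGE_* values match the x86 bit positions but are hard-coded here for illustration):

#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT (1ULL << 0)
#define _PAGE_USER    (1ULL << 2)
#define _PAGE_NX      (1ULL << 63)

static uint64_t supported_pte_mask = _PAGE_NX; /* pretend the CPU has NX */

/* Value the kernel-side tables keep for this PGD entry. */
static uint64_t poison_kernel_copy(uint64_t pgd)
{
	if ((pgd & (_PAGE_USER | _PAGE_PRESENT)) == (_PAGE_USER | _PAGE_PRESENT) &&
	    (supported_pte_mask & _PAGE_NX))
		pgd |= _PAGE_NX; /* user code must never run on the kernel CR3 */
	return pgd;
}

int main(void)
{
	uint64_t user_entry = 0x1234000ULL | _PAGE_PRESENT | _PAGE_USER;
	uint64_t efi_entry  = 0x5678000ULL | _PAGE_PRESENT; /* no _PAGE_USER */

	printf("user mapping:  %#llx\n",
	       (unsigned long long)poison_kernel_copy(user_entry));
	printf("kernel-owned:  %#llx\n",
	       (unsigned long long)poison_kernel_copy(efi_entry));
	return 0;
}

The second call shows the exception documented in the comment: entries without _PAGE_USER (e.g. executable EFI runtime mappings) are left alone so the kernel may still execute from them.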
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0a1be3adc97e..a1561957dccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -28,6 +28,38 @@ | |||
28 | * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi | 28 | * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi |
29 | */ | 29 | */ |
30 | 30 | ||
31 | /* | ||
32 | * We get here when we do something requiring a TLB invalidation | ||
33 | * but cannot invalidate all of the contexts. We do the | ||
34 | * necessary invalidation by clearing out the 'ctx_id' which | ||
35 | * forces a TLB flush when the context is loaded. | ||
36 | */ | ||
37 | void clear_asid_other(void) | ||
38 | { | ||
39 | u16 asid; | ||
40 | |||
41 | /* | ||
42 | * This is only expected to be set if we have disabled | ||
43 | * kernel _PAGE_GLOBAL pages. | ||
44 | */ | ||
45 | if (!static_cpu_has(X86_FEATURE_PTI)) { | ||
46 | WARN_ON_ONCE(1); | ||
47 | return; | ||
48 | } | ||
49 | |||
50 | for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { | ||
51 | /* Do not need to flush the current asid */ | ||
52 | if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) | ||
53 | continue; | ||
54 | /* | ||
55 | * Make sure the next time we go to switch to | ||
56 | * this asid, we do a flush: | ||
57 | */ | ||
58 | this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); | ||
59 | } | ||
60 | this_cpu_write(cpu_tlbstate.invalidate_other, false); | ||
61 | } | ||
62 | |||
31 | atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); | 63 | atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); |
32 | 64 | ||
33 | 65 | ||
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, | |||
42 | return; | 74 | return; |
43 | } | 75 | } |
44 | 76 | ||
77 | if (this_cpu_read(cpu_tlbstate.invalidate_other)) | ||
78 | clear_asid_other(); | ||
79 | |||
45 | for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { | 80 | for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { |
46 | if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != | 81 | if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != |
47 | next->context.ctx_id) | 82 | next->context.ctx_id) |
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, | |||
65 | *need_flush = true; | 100 | *need_flush = true; |
66 | } | 101 | } |
67 | 102 | ||
103 | static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) | ||
104 | { | ||
105 | unsigned long new_mm_cr3; | ||
106 | |||
107 | if (need_flush) { | ||
108 | invalidate_user_asid(new_asid); | ||
109 | new_mm_cr3 = build_cr3(pgdir, new_asid); | ||
110 | } else { | ||
111 | new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * Caution: many callers of this function expect | ||
116 | * that load_cr3() is serializing and orders TLB | ||
117 | * fills with respect to the mm_cpumask writes. | ||
118 | */ | ||
119 | write_cr3(new_mm_cr3); | ||
120 | } | ||
121 | |||
68 | void leave_mm(int cpu) | 122 | void leave_mm(int cpu) |
69 | { | 123 | { |
70 | struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); | 124 | struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); |
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | |||
195 | if (need_flush) { | 249 | if (need_flush) { |
196 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); | 250 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); |
197 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); | 251 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); |
198 | write_cr3(build_cr3(next->pgd, new_asid)); | 252 | load_new_mm_cr3(next->pgd, new_asid, true); |
199 | 253 | ||
200 | /* | 254 | /* |
201 | * NB: This gets called via leave_mm() in the idle path | 255 | * NB: This gets called via leave_mm() in the idle path |
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | |||
208 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | 262 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
209 | } else { | 263 | } else { |
210 | /* The new ASID is already up to date. */ | 264 | /* The new ASID is already up to date. */ |
211 | write_cr3(build_cr3_noflush(next->pgd, new_asid)); | 265 | load_new_mm_cr3(next->pgd, new_asid, false); |
212 | 266 | ||
213 | /* See above wrt _rcuidle. */ | 267 | /* See above wrt _rcuidle. */ |
214 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); | 268 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); |
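The new load_new_mm_cr3() relies on build_cr3()/build_cr3_noflush(), which pack the PGD's physical address together with a PCID derived from the ASID and, when no flush is wanted, the request bit in CR3 bit 63. A hedged sketch of that packing, using the scheme this series describes (kernel PCID = ASID + 1; the constants are assumptions for illustration and only apply with CR4.PCIDE set):

#include <stdint.h>
#include <stdio.h>

#define CR3_NOFLUSH (1ULL << 63) /* "don't flush this PCID" request bit */

static uint64_t kern_pcid(uint16_t asid)
{
	return (uint64_t)asid + 1; /* PCID 0 stays reserved */
}

static uint64_t build_cr3(uint64_t pgd_pa, uint16_t asid)
{
	return pgd_pa | kern_pcid(asid);
}

static uint64_t build_cr3_noflush(uint64_t pgd_pa, uint16_t asid)
{
	return build_cr3(pgd_pa, asid) | CR3_NOFLUSH;
}

int main(void)
{
	uint64_t pgd_pa = 0x1234000ULL;

	printf("need_flush=1: cr3=%#llx\n",
	       (unsigned long long)build_cr3(pgd_pa, 0));
	printf("need_flush=0: cr3=%#llx\n",
	       (unsigned long long)build_cr3_noflush(pgd_pa, 0));
	return 0;
}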
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6a151ce70e86..d87ac96e37ed 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c | |||
@@ -196,6 +196,9 @@ static pgd_t *efi_pgd; | |||
196 | * because we want to avoid inserting EFI region mappings (EFI_VA_END | 196 | * because we want to avoid inserting EFI region mappings (EFI_VA_END |
197 | * to EFI_VA_START) into the standard kernel page tables. Everything | 197 | * to EFI_VA_START) into the standard kernel page tables. Everything |
198 | * else can be shared, see efi_sync_low_kernel_mappings(). | 198 | * else can be shared, see efi_sync_low_kernel_mappings(). |
199 | * | ||
200 | * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the | ||
201 | * allocation. | ||
199 | */ | 202 | */ |
200 | int __init efi_alloc_page_tables(void) | 203 | int __init efi_alloc_page_tables(void) |
201 | { | 204 | { |
@@ -208,7 +211,7 @@ int __init efi_alloc_page_tables(void) | |||
208 | return 0; | 211 | return 0; |
209 | 212 | ||
210 | gfp_mask = GFP_KERNEL | __GFP_ZERO; | 213 | gfp_mask = GFP_KERNEL | __GFP_ZERO; |
211 | efi_pgd = (pgd_t *)__get_free_page(gfp_mask); | 214 | efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); |
212 | if (!efi_pgd) | 215 | if (!efi_pgd) |
213 | return -ENOMEM; | 216 | return -ENOMEM; |
214 | 217 | ||
diff --git a/include/linux/pti.h b/include/linux/pti.h new file mode 100644 index 000000000000..0174883a935a --- /dev/null +++ b/include/linux/pti.h | |||
@@ -0,0 +1,11 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #ifndef _INCLUDE_PTI_H | ||
3 | #define _INCLUDE_PTI_H | ||
4 | |||
5 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
6 | #include <asm/pti.h> | ||
7 | #else | ||
8 | static inline void pti_init(void) { } | ||
9 | #endif | ||
10 | |||
11 | #endif | ||
diff --git a/init/main.c b/init/main.c index 7b606fc48482..a8100b954839 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -75,6 +75,7 @@ | |||
75 | #include <linux/slab.h> | 75 | #include <linux/slab.h> |
76 | #include <linux/perf_event.h> | 76 | #include <linux/perf_event.h> |
77 | #include <linux/ptrace.h> | 77 | #include <linux/ptrace.h> |
78 | #include <linux/pti.h> | ||
78 | #include <linux/blkdev.h> | 79 | #include <linux/blkdev.h> |
79 | #include <linux/elevator.h> | 80 | #include <linux/elevator.h> |
80 | #include <linux/sched_clock.h> | 81 | #include <linux/sched_clock.h> |
@@ -506,6 +507,8 @@ static void __init mm_init(void) | |||
506 | ioremap_huge_init(); | 507 | ioremap_huge_init(); |
507 | /* Should be run before the first non-init thread is created */ | 508 | /* Should be run before the first non-init thread is created */ |
508 | init_espfix_bsp(); | 509 | init_espfix_bsp(); |
510 | /* Should be run after espfix64 is set up. */ | ||
511 | pti_init(); | ||
509 | } | 512 | } |
510 | 513 | ||
511 | asmlinkage __visible void __init start_kernel(void) | 514 | asmlinkage __visible void __init start_kernel(void) |
diff --git a/security/Kconfig b/security/Kconfig index e8e449444e65..a623d13bf288 100644 --- a/security/Kconfig +++ b/security/Kconfig | |||
@@ -54,6 +54,16 @@ config SECURITY_NETWORK | |||
54 | implement socket and networking access controls. | 54 | implement socket and networking access controls. |
55 | If you are unsure how to answer this question, answer N. | 55 | If you are unsure how to answer this question, answer N. |
56 | 56 | ||
57 | config PAGE_TABLE_ISOLATION | ||
58 | bool "Remove the kernel mapping in user mode" | ||
59 | depends on X86_64 && !UML | ||
60 | help | ||
61 | This feature reduces the number of hardware side channels by | ||
62 | ensuring that the majority of kernel addresses are not mapped | ||
63 | into userspace. | ||
64 | |||
65 | See Documentation/x86/pagetable-isolation.txt for more details. | ||
66 | |||
57 | config SECURITY_INFINIBAND | 67 | config SECURITY_INFINIBAND |
58 | bool "Infiniband Security Hooks" | 68 | bool "Infiniband Security Hooks" |
59 | depends on SECURITY && INFINIBAND | 69 | depends on SECURITY && INFINIBAND |
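For reference, how the new option is exercised in practice (the flags and strings are taken from the pti.c hunk above): build with CONFIG_PAGE_TABLE_ISOLATION=y, then choose the behaviour on the kernel command line:

    pti=on      force-enable even on CPUs without X86_BUG_CPU_INSECURE
    pti=auto    default: enable only on affected CPUs
    pti=off     disable ('nopti' has the same effect)

Given the pr_fmt() in pti.c, the outcome is visible in dmesg, e.g. "Kernel/User page tables isolation: enabled" or "Kernel/User page tables isolation: disabled on command line.".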
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 0304ffb714f2..1aef72df20a1 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c | |||
@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt, | |||
122 | * NB: Different Linux versions do different things with the | 122 | * NB: Different Linux versions do different things with the |
123 | * accessed bit in set_thread_area(). | 123 | * accessed bit in set_thread_area(). |
124 | */ | 124 | */ |
125 | if (ar != expected_ar && | 125 | if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) { |
126 | (ldt || ar != (expected_ar | AR_ACCESSED))) { | ||
127 | printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", | 126 | printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", |
128 | (ldt ? "LDT" : "GDT"), index, ar, expected_ar); | 127 | (ldt ? "LDT" : "GDT"), index, ar, expected_ar); |
129 | nerrs++; | 128 | nerrs++; |