author     Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 20:54:17 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 20:54:17 -0400
commit     eac341194426ba7ead3444923b9eba491ae4feeb (patch)
tree       7daa0ba9c1fc21b603dd45e738e5887f80db939b
parent     d191c82d4d9bd0bb3b945fc458cc65053ef868a0 (diff)
parent     d878efce73fe86db34ddb2013260adf571a701a7 (diff)
Merge branch 'x86/pti' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 PTI updates from Thomas Gleixner:
 "The Speck brigade sadly provides yet another large set of patches
  destroying the performance which we carefully built and preserved

   - PTI support for 32bit PAE. The missing counterpart to the 64bit
     PTI code implemented by Joerg.

   - A set of fixes for the Global Bit mechanics for non PCID CPUs
     which were setting the Global Bit too widely and therefore
     possibly exposing interesting memory needlessly.

   - Protection against userspace-userspace SpectreRSB

   - Support for the upcoming Enhanced IBRS mode, which is preferred
     over IBRS. Unfortunately we don't know the performance impact of
     this, but it's expected to be less horrible than the IBRS
     hammering.

   - Cleanups and simplifications"

* 'x86/pti' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  x86/mm/pti: Move user W+X check into pti_finalize()
  x86/relocs: Add __end_rodata_aligned to S_REL
  x86/mm/pti: Clone kernel-image on PTE level for 32 bit
  x86/mm/pti: Don't clear permissions in pti_clone_pmd()
  x86/mm/pti: Fix 32 bit PCID check
  x86/mm/init: Remove freed kernel image areas from alias mapping
  x86/mm/init: Add helper for freeing kernel image pages
  x86/mm/init: Pass unconverted symbol addresses to free_init_pages()
  mm: Allow non-direct-map arguments to free_reserved_area()
  x86/mm/pti: Clear Global bit more aggressively
  x86/speculation: Support Enhanced IBRS on future CPUs
  x86/speculation: Protect against userspace-userspace spectreRSB
  x86/kexec: Allocate 8k PGDs for PTI
  Revert "perf/core: Make sure the ring-buffer is mapped in all page-tables"
  x86/mm: Remove in_nmi() warning from vmalloc_fault()
  x86/entry/32: Check for VM86 mode in slow-path check
  perf/core: Make sure the ring-buffer is mapped in all page-tables
  x86/pti: Check the return value of pti_user_pagetable_walk_pmd()
  x86/pti: Check the return value of pti_user_pagetable_walk_p4d()
  x86/entry/32: Add debug code to check entry/exit CR3
  ...
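Editorial aside, not part of the pull request: the recurring idea in the entry-code changes below is that the kernel and user copies of the top-level page table share one order-1 (8k) allocation, so switching between them only flips bit 12 (PAGE_SHIFT) of the CR3 value. PTI_SWITCH_MASK in entry_32.S and PTI_PGTABLE_SWITCH_BIT in pgtable.h both encode this. The following is a minimal C sketch of that idea; the function names are invented for illustration and are not kernel APIs.

/*
 * Illustrative sketch only. The kernel PGD sits in the lower 4k of the
 * 8k allocation and the user PGD in the upper 4k, so the two CR3 values
 * differ only in bit 12.
 */
#include <stdint.h>

#define PAGE_SHIFT       12
#define PTI_SWITCH_MASK  (1UL << PAGE_SHIFT)

static inline uint32_t kernel_to_user_cr3(uint32_t cr3)
{
	return cr3 | PTI_SWITCH_MASK;	/* user copy: upper 4k */
}

static inline uint32_t user_to_kernel_cr3(uint32_t cr3)
{
	return cr3 & ~PTI_SWITCH_MASK;	/* kernel copy: lower 4k */
}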
-rw-r--r--  arch/x86/entry/entry_32.S | 632
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 5
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-2level_types.h | 3
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 7
-rw-r--r--  arch/x86/include/asm/pgtable-3level_types.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable.h | 94
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable_32_types.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 89
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 3
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 28
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 8
-rw-r--r--  arch/x86/include/asm/processor.h | 1
-rw-r--r--  arch/x86/include/asm/pti.h | 3
-rw-r--r--  arch/x86/include/asm/sections.h | 1
-rw-r--r--  arch/x86/include/asm/set_memory.h | 1
-rw-r--r--  arch/x86/include/asm/switch_to.h | 16
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 5
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 10
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 2
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 58
-rw-r--r--  arch/x86/kernel/cpu/common.c | 8
-rw-r--r--  arch/x86/kernel/head_32.S | 20
-rw-r--r--  arch/x86/kernel/ldt.c | 137
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 5
-rw-r--r--  arch/x86/kernel/process.c | 2
-rw-r--r--  arch/x86/kernel/process_32.c | 2
-rw-r--r--  arch/x86/kernel/process_64.c | 2
-rw-r--r--  arch/x86/kernel/vm86_32.c | 4
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 17
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 27
-rw-r--r--  arch/x86/mm/fault.c | 2
-rw-r--r--  arch/x86/mm/init.c | 37
-rw-r--r--  arch/x86/mm/init_64.c | 14
-rw-r--r--  arch/x86/mm/pageattr.c | 19
-rw-r--r--  arch/x86/mm/pgtable.c | 105
-rw-r--r--  arch/x86/mm/pti.c | 261
-rw-r--r--  arch/x86/tools/relocs.c | 1
-rw-r--r--  include/linux/pti.h | 1
-rw-r--r--  init/main.c | 7
-rw-r--r--  mm/page_alloc.c | 16
-rw-r--r--  security/Kconfig | 2
45 files changed, 1271 insertions(+), 413 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index c371bfee137a..2767c625a52c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -65,7 +65,7 @@
65# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 65# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
66#else 66#else
67# define preempt_stop(clobbers) 67# define preempt_stop(clobbers)
68# define resume_kernel restore_all 68# define resume_kernel restore_all_kernel
69#endif 69#endif
70 70
71.macro TRACE_IRQS_IRET 71.macro TRACE_IRQS_IRET
@@ -77,6 +77,8 @@
77#endif 77#endif
78.endm 78.endm
79 79
80#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
81
80/* 82/*
81 * User gs save/restore 83 * User gs save/restore
82 * 84 *
@@ -154,7 +156,52 @@
154 156
155#endif /* CONFIG_X86_32_LAZY_GS */ 157#endif /* CONFIG_X86_32_LAZY_GS */
156 158
157.macro SAVE_ALL pt_regs_ax=%eax 159/* Unconditionally switch to user cr3 */
160.macro SWITCH_TO_USER_CR3 scratch_reg:req
161 ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
162
163 movl %cr3, \scratch_reg
164 orl $PTI_SWITCH_MASK, \scratch_reg
165 movl \scratch_reg, %cr3
166.Lend_\@:
167.endm
168
169.macro BUG_IF_WRONG_CR3 no_user_check=0
170#ifdef CONFIG_DEBUG_ENTRY
171 ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
172 .if \no_user_check == 0
173 /* coming from usermode? */
174 testl $SEGMENT_RPL_MASK, PT_CS(%esp)
175 jz .Lend_\@
176 .endif
177 /* On user-cr3? */
178 movl %cr3, %eax
179 testl $PTI_SWITCH_MASK, %eax
180 jnz .Lend_\@
181 /* From userspace with kernel cr3 - BUG */
182 ud2
183.Lend_\@:
184#endif
185.endm
186
187/*
188 * Switch to kernel cr3 if not already loaded and return current cr3 in
189 * \scratch_reg
190 */
191.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
192 ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
193 movl %cr3, \scratch_reg
194 /* Test if we are already on kernel CR3 */
195 testl $PTI_SWITCH_MASK, \scratch_reg
196 jz .Lend_\@
197 andl $(~PTI_SWITCH_MASK), \scratch_reg
198 movl \scratch_reg, %cr3
199 /* Return original CR3 in \scratch_reg */
200 orl $PTI_SWITCH_MASK, \scratch_reg
201.Lend_\@:
202.endm
203
204.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
158 cld 205 cld
159 PUSH_GS 206 PUSH_GS
160 pushl %fs 207 pushl %fs
@@ -173,6 +220,29 @@
173 movl $(__KERNEL_PERCPU), %edx 220 movl $(__KERNEL_PERCPU), %edx
174 movl %edx, %fs 221 movl %edx, %fs
175 SET_KERNEL_GS %edx 222 SET_KERNEL_GS %edx
223
224 /* Switch to kernel stack if necessary */
225.if \switch_stacks > 0
226 SWITCH_TO_KERNEL_STACK
227.endif
228
229.endm
230
231.macro SAVE_ALL_NMI cr3_reg:req
232 SAVE_ALL
233
234 BUG_IF_WRONG_CR3
235
236 /*
237 * Now switch the CR3 when PTI is enabled.
238 *
239 * We can enter with either user or kernel cr3, the code will
240 * store the old cr3 in \cr3_reg and switches to the kernel cr3
241 * if necessary.
242 */
243 SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
244
245.Lend_\@:
176.endm 246.endm
177 247
178/* 248/*
@@ -221,6 +291,349 @@
221 POP_GS_EX 291 POP_GS_EX
222.endm 292.endm
223 293
294.macro RESTORE_ALL_NMI cr3_reg:req pop=0
295 /*
296 * Now switch the CR3 when PTI is enabled.
297 *
298 * We enter with kernel cr3 and switch the cr3 to the value
299 * stored on \cr3_reg, which is either a user or a kernel cr3.
300 */
301 ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
302
303 testl $PTI_SWITCH_MASK, \cr3_reg
304 jz .Lswitched_\@
305
306 /* User cr3 in \cr3_reg - write it to hardware cr3 */
307 movl \cr3_reg, %cr3
308
309.Lswitched_\@:
310
311 BUG_IF_WRONG_CR3
312
313 RESTORE_REGS pop=\pop
314.endm
315
316.macro CHECK_AND_APPLY_ESPFIX
317#ifdef CONFIG_X86_ESPFIX32
318#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
319
320 ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
321
322 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
323 /*
324 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
325 * are returning to the kernel.
326 * See comments in process.c:copy_thread() for details.
327 */
328 movb PT_OLDSS(%esp), %ah
329 movb PT_CS(%esp), %al
330 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
331 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
332 jne .Lend_\@ # returning to user-space with LDT SS
333
334 /*
335 * Setup and switch to ESPFIX stack
336 *
337 * We're returning to userspace with a 16 bit stack. The CPU will not
338 * restore the high word of ESP for us on executing iret... This is an
339 * "official" bug of all the x86-compatible CPUs, which we can work
340 * around to make dosemu and wine happy. We do this by preloading the
341 * high word of ESP with the high word of the userspace ESP while
342 * compensating for the offset by changing to the ESPFIX segment with
343 * a base address that matches for the difference.
344 */
345 mov %esp, %edx /* load kernel esp */
346 mov PT_OLDESP(%esp), %eax /* load userspace esp */
347 mov %dx, %ax /* eax: new kernel esp */
348 sub %eax, %edx /* offset (low word is 0) */
349 shr $16, %edx
350 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
351 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
352 pushl $__ESPFIX_SS
353 pushl %eax /* new kernel esp */
354 /*
355 * Disable interrupts, but do not irqtrace this section: we
356 * will soon execute iret and the tracer was already set to
357 * the irqstate after the IRET:
358 */
359 DISABLE_INTERRUPTS(CLBR_ANY)
360 lss (%esp), %esp /* switch to espfix segment */
361.Lend_\@:
362#endif /* CONFIG_X86_ESPFIX32 */
363.endm
364
365/*
366 * Called with pt_regs fully populated and kernel segments loaded,
367 * so we can access PER_CPU and use the integer registers.
368 *
369 * We need to be very careful here with the %esp switch, because an NMI
370 * can happen everywhere. If the NMI handler finds itself on the
371 * entry-stack, it will overwrite the task-stack and everything we
372 * copied there. So allocate the stack-frame on the task-stack and
373 * switch to it before we do any copying.
374 */
375
376#define CS_FROM_ENTRY_STACK (1 << 31)
377#define CS_FROM_USER_CR3 (1 << 30)
378
379.macro SWITCH_TO_KERNEL_STACK
380
381 ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
382
383 BUG_IF_WRONG_CR3
384
385 SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
386
387 /*
388 * %eax now contains the entry cr3 and we carry it forward in
389 * that register for the time this macro runs
390 */
391
392 /* Are we on the entry stack? Bail out if not! */
393 movl PER_CPU_VAR(cpu_entry_area), %ecx
394 addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
395 subl %esp, %ecx /* ecx = (end of entry_stack) - esp */
396 cmpl $SIZEOF_entry_stack, %ecx
397 jae .Lend_\@
398
399 /* Load stack pointer into %esi and %edi */
400 movl %esp, %esi
401 movl %esi, %edi
402
403 /* Move %edi to the top of the entry stack */
404 andl $(MASK_entry_stack), %edi
405 addl $(SIZEOF_entry_stack), %edi
406
407 /* Load top of task-stack into %edi */
408 movl TSS_entry2task_stack(%edi), %edi
409
410 /*
411 * Clear unused upper bits of the dword containing the word-sized CS
412 * slot in pt_regs in case hardware didn't clear it for us.
413 */
414 andl $(0x0000ffff), PT_CS(%esp)
415
416 /* Special case - entry from kernel mode via entry stack */
417#ifdef CONFIG_VM86
418 movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
419 movb PT_CS(%esp), %cl
420 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
421#else
422 movl PT_CS(%esp), %ecx
423 andl $SEGMENT_RPL_MASK, %ecx
424#endif
425 cmpl $USER_RPL, %ecx
426 jb .Lentry_from_kernel_\@
427
428 /* Bytes to copy */
429 movl $PTREGS_SIZE, %ecx
430
431#ifdef CONFIG_VM86
432 testl $X86_EFLAGS_VM, PT_EFLAGS(%esi)
433 jz .Lcopy_pt_regs_\@
434
435 /*
436 * Stack-frame contains 4 additional segment registers when
437 * coming from VM86 mode
438 */
439 addl $(4 * 4), %ecx
440
441#endif
442.Lcopy_pt_regs_\@:
443
444 /* Allocate frame on task-stack */
445 subl %ecx, %edi
446
447 /* Switch to task-stack */
448 movl %edi, %esp
449
450 /*
451 * We are now on the task-stack and can safely copy over the
452 * stack-frame
453 */
454 shrl $2, %ecx
455 cld
456 rep movsl
457
458 jmp .Lend_\@
459
460.Lentry_from_kernel_\@:
461
462 /*
463 * This handles the case when we enter the kernel from
464 * kernel-mode and %esp points to the entry-stack. When this
465 * happens we need to switch to the task-stack to run C code,
466 * but switch back to the entry-stack again when we approach
467 * iret and return to the interrupted code-path. This usually
468 * happens when we hit an exception while restoring user-space
469 * segment registers on the way back to user-space or when the
470 * sysenter handler runs with eflags.tf set.
471 *
472 * When we switch to the task-stack here, we can't trust the
473 * contents of the entry-stack anymore, as the exception handler
474 * might be scheduled out or moved to another CPU. Therefore we
475 * copy the complete entry-stack to the task-stack and set a
476 * marker in the iret-frame (bit 31 of the CS dword) to detect
477 * what we've done on the iret path.
478 *
479 * On the iret path we copy everything back and switch to the
480 * entry-stack, so that the interrupted kernel code-path
481 * continues on the same stack it was interrupted with.
482 *
483 * Be aware that an NMI can happen anytime in this code.
484 *
485 * %esi: Entry-Stack pointer (same as %esp)
486 * %edi: Top of the task stack
487 * %eax: CR3 on kernel entry
488 */
489
490 /* Calculate number of bytes on the entry stack in %ecx */
491 movl %esi, %ecx
492
493 /* %ecx to the top of entry-stack */
494 andl $(MASK_entry_stack), %ecx
495 addl $(SIZEOF_entry_stack), %ecx
496
497 /* Number of bytes on the entry stack to %ecx */
498 sub %esi, %ecx
499
500 /* Mark stackframe as coming from entry stack */
501 orl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
502
503 /*
504 * Test the cr3 used to enter the kernel and add a marker
505 * so that we can switch back to it before iret.
506 */
507 testl $PTI_SWITCH_MASK, %eax
508 jz .Lcopy_pt_regs_\@
509 orl $CS_FROM_USER_CR3, PT_CS(%esp)
510
511 /*
512 * %esi and %edi are unchanged, %ecx contains the number of
513 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
514 * the stack-frame on task-stack and copy everything over
515 */
516 jmp .Lcopy_pt_regs_\@
517
518.Lend_\@:
519.endm
520
521/*
522 * Switch back from the kernel stack to the entry stack.
523 *
524 * The %esp register must point to pt_regs on the task stack. It will
525 * first calculate the size of the stack-frame to copy, depending on
526 * whether we return to VM86 mode or not. With that it uses 'rep movsl'
527 * to copy the contents of the stack over to the entry stack.
528 *
529 * We must be very careful here, as we can't trust the contents of the
530 * task-stack once we switched to the entry-stack. When an NMI happens
531 * while on the entry-stack, the NMI handler will switch back to the top
532 * of the task stack, overwriting our stack-frame we are about to copy.
533 * Therefore we switch the stack only after everything is copied over.
534 */
535.macro SWITCH_TO_ENTRY_STACK
536
537 ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
538
539 /* Bytes to copy */
540 movl $PTREGS_SIZE, %ecx
541
542#ifdef CONFIG_VM86
543 testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp)
544 jz .Lcopy_pt_regs_\@
545
546 /* Additional 4 registers to copy when returning to VM86 mode */
547 addl $(4 * 4), %ecx
548
549.Lcopy_pt_regs_\@:
550#endif
551
552 /* Initialize source and destination for movsl */
553 movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
554 subl %ecx, %edi
555 movl %esp, %esi
556
557 /* Save future stack pointer in %ebx */
558 movl %edi, %ebx
559
560 /* Copy over the stack-frame */
561 shrl $2, %ecx
562 cld
563 rep movsl
564
565 /*
566 * Switch to entry-stack - needs to happen after everything is
567 * copied because the NMI handler will overwrite the task-stack
568 * when on entry-stack
569 */
570 movl %ebx, %esp
571
572.Lend_\@:
573.endm
574
575/*
576 * This macro handles the case when we return to kernel-mode on the iret
577 * path and have to switch back to the entry stack and/or user-cr3
578 *
579 * See the comments below the .Lentry_from_kernel_\@ label in the
580 * SWITCH_TO_KERNEL_STACK macro for more details.
581 */
582.macro PARANOID_EXIT_TO_KERNEL_MODE
583
584 /*
585 * Test if we entered the kernel with the entry-stack. Most
586 * likely we did not, because this code only runs on the
587 * return-to-kernel path.
588 */
589 testl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
590 jz .Lend_\@
591
592 /* Unlikely slow-path */
593
594 /* Clear marker from stack-frame */
595 andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
596
597 /* Copy the remaining task-stack contents to entry-stack */
598 movl %esp, %esi
599 movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
600
601 /* Bytes on the task-stack to ecx */
602 movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
603 subl %esi, %ecx
604
605 /* Allocate stack-frame on entry-stack */
606 subl %ecx, %edi
607
608 /*
609 * Save future stack-pointer, we must not switch until the
610 * copy is done, otherwise the NMI handler could destroy the
611 * contents of the task-stack we are about to copy.
612 */
613 movl %edi, %ebx
614
615 /* Do the copy */
616 shrl $2, %ecx
617 cld
618 rep movsl
619
620 /* Safe to switch to entry-stack now */
621 movl %ebx, %esp
622
623 /*
624 * We came from entry-stack and need to check if we also need to
625 * switch back to user cr3.
626 */
627 testl $CS_FROM_USER_CR3, PT_CS(%esp)
628 jz .Lend_\@
629
630 /* Clear marker from stack-frame */
631 andl $(~CS_FROM_USER_CR3), PT_CS(%esp)
632
633 SWITCH_TO_USER_CR3 scratch_reg=%eax
634
635.Lend_\@:
636.endm
224/* 637/*
225 * %eax: prev task 638 * %eax: prev task
226 * %edx: next task 639 * %edx: next task
@@ -351,9 +764,9 @@ ENTRY(resume_kernel)
351 DISABLE_INTERRUPTS(CLBR_ANY) 764 DISABLE_INTERRUPTS(CLBR_ANY)
352.Lneed_resched: 765.Lneed_resched:
353 cmpl $0, PER_CPU_VAR(__preempt_count) 766 cmpl $0, PER_CPU_VAR(__preempt_count)
354 jnz restore_all 767 jnz restore_all_kernel
355 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? 768 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
356 jz restore_all 769 jz restore_all_kernel
357 call preempt_schedule_irq 770 call preempt_schedule_irq
358 jmp .Lneed_resched 771 jmp .Lneed_resched
359END(resume_kernel) 772END(resume_kernel)
@@ -412,7 +825,21 @@ ENTRY(xen_sysenter_target)
412 * 0(%ebp) arg6 825 * 0(%ebp) arg6
413 */ 826 */
414ENTRY(entry_SYSENTER_32) 827ENTRY(entry_SYSENTER_32)
415 movl TSS_sysenter_sp0(%esp), %esp 828 /*
829 * On entry-stack with all userspace-regs live - save and
830 * restore eflags and %eax to use it as scratch-reg for the cr3
831 * switch.
832 */
833 pushfl
834 pushl %eax
835 BUG_IF_WRONG_CR3 no_user_check=1
836 SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
837 popl %eax
838 popfl
839
840 /* Stack empty again, switch to task stack */
841 movl TSS_entry2task_stack(%esp), %esp
842
416.Lsysenter_past_esp: 843.Lsysenter_past_esp:
417 pushl $__USER_DS /* pt_regs->ss */ 844 pushl $__USER_DS /* pt_regs->ss */
418 pushl %ebp /* pt_regs->sp (stashed in bp) */ 845 pushl %ebp /* pt_regs->sp (stashed in bp) */
@@ -421,7 +848,7 @@ ENTRY(entry_SYSENTER_32)
421 pushl $__USER_CS /* pt_regs->cs */ 848 pushl $__USER_CS /* pt_regs->cs */
422 pushl $0 /* pt_regs->ip = 0 (placeholder) */ 849 pushl $0 /* pt_regs->ip = 0 (placeholder) */
423 pushl %eax /* pt_regs->orig_ax */ 850 pushl %eax /* pt_regs->orig_ax */
424 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 851 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
425 852
426 /* 853 /*
427 * SYSENTER doesn't filter flags, so we need to clear NT, AC 854 * SYSENTER doesn't filter flags, so we need to clear NT, AC
@@ -460,25 +887,49 @@ ENTRY(entry_SYSENTER_32)
460 887
461/* Opportunistic SYSEXIT */ 888/* Opportunistic SYSEXIT */
462 TRACE_IRQS_ON /* User mode traces as IRQs on. */ 889 TRACE_IRQS_ON /* User mode traces as IRQs on. */
890
891 /*
892 * Setup entry stack - we keep the pointer in %eax and do the
893 * switch after almost all user-state is restored.
894 */
895
896 /* Load entry stack pointer and allocate frame for eflags/eax */
897 movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
898 subl $(2*4), %eax
899
900 /* Copy eflags and eax to entry stack */
901 movl PT_EFLAGS(%esp), %edi
902 movl PT_EAX(%esp), %esi
903 movl %edi, (%eax)
904 movl %esi, 4(%eax)
905
906 /* Restore user registers and segments */
463 movl PT_EIP(%esp), %edx /* pt_regs->ip */ 907 movl PT_EIP(%esp), %edx /* pt_regs->ip */
464 movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ 908 movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
4651: mov PT_FS(%esp), %fs 9091: mov PT_FS(%esp), %fs
466 PTGS_TO_GS 910 PTGS_TO_GS
911
467 popl %ebx /* pt_regs->bx */ 912 popl %ebx /* pt_regs->bx */
468 addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ 913 addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
469 popl %esi /* pt_regs->si */ 914 popl %esi /* pt_regs->si */
470 popl %edi /* pt_regs->di */ 915 popl %edi /* pt_regs->di */
471 popl %ebp /* pt_regs->bp */ 916 popl %ebp /* pt_regs->bp */
472 popl %eax /* pt_regs->ax */ 917
918 /* Switch to entry stack */
919 movl %eax, %esp
920
921 /* Now ready to switch the cr3 */
922 SWITCH_TO_USER_CR3 scratch_reg=%eax
473 923
474 /* 924 /*
475 * Restore all flags except IF. (We restore IF separately because 925 * Restore all flags except IF. (We restore IF separately because
476 * STI gives a one-instruction window in which we won't be interrupted, 926 * STI gives a one-instruction window in which we won't be interrupted,
477 * whereas POPF does not.) 927 * whereas POPF does not.)
478 */ 928 */
479 addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
480 btrl $X86_EFLAGS_IF_BIT, (%esp) 929 btrl $X86_EFLAGS_IF_BIT, (%esp)
930 BUG_IF_WRONG_CR3 no_user_check=1
481 popfl 931 popfl
932 popl %eax
482 933
483 /* 934 /*
484 * Return back to the vDSO, which will pop ecx and edx. 935 * Return back to the vDSO, which will pop ecx and edx.
@@ -532,7 +983,8 @@ ENDPROC(entry_SYSENTER_32)
532ENTRY(entry_INT80_32) 983ENTRY(entry_INT80_32)
533 ASM_CLAC 984 ASM_CLAC
534 pushl %eax /* pt_regs->orig_ax */ 985 pushl %eax /* pt_regs->orig_ax */
535 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 986
987 SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
536 988
537 /* 989 /*
538 * User mode is traced as though IRQs are on, and the interrupt gate 990 * User mode is traced as though IRQs are on, and the interrupt gate
@@ -546,24 +998,17 @@ ENTRY(entry_INT80_32)
546 998
547restore_all: 999restore_all:
548 TRACE_IRQS_IRET 1000 TRACE_IRQS_IRET
1001 SWITCH_TO_ENTRY_STACK
549.Lrestore_all_notrace: 1002.Lrestore_all_notrace:
550#ifdef CONFIG_X86_ESPFIX32 1003 CHECK_AND_APPLY_ESPFIX
551 ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX
552
553 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
554 /*
555 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
556 * are returning to the kernel.
557 * See comments in process.c:copy_thread() for details.
558 */
559 movb PT_OLDSS(%esp), %ah
560 movb PT_CS(%esp), %al
561 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
562 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
563 je .Lldt_ss # returning to user-space with LDT SS
564#endif
565.Lrestore_nocheck: 1004.Lrestore_nocheck:
566 RESTORE_REGS 4 # skip orig_eax/error_code 1005 /* Switch back to user CR3 */
1006 SWITCH_TO_USER_CR3 scratch_reg=%eax
1007
1008 BUG_IF_WRONG_CR3
1009
1010 /* Restore user state */
1011 RESTORE_REGS pop=4 # skip orig_eax/error_code
567.Lirq_return: 1012.Lirq_return:
568 /* 1013 /*
569 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization 1014 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
@@ -572,46 +1017,33 @@ restore_all:
572 */ 1017 */
573 INTERRUPT_RETURN 1018 INTERRUPT_RETURN
574 1019
1020restore_all_kernel:
1021 TRACE_IRQS_IRET
1022 PARANOID_EXIT_TO_KERNEL_MODE
1023 BUG_IF_WRONG_CR3
1024 RESTORE_REGS 4
1025 jmp .Lirq_return
1026
575.section .fixup, "ax" 1027.section .fixup, "ax"
576ENTRY(iret_exc ) 1028ENTRY(iret_exc )
577 pushl $0 # no error code 1029 pushl $0 # no error code
578 pushl $do_iret_error 1030 pushl $do_iret_error
579 jmp common_exception
580.previous
581 _ASM_EXTABLE(.Lirq_return, iret_exc)
582 1031
583#ifdef CONFIG_X86_ESPFIX32 1032#ifdef CONFIG_DEBUG_ENTRY
584.Lldt_ss:
585/*
586 * Setup and switch to ESPFIX stack
587 *
588 * We're returning to userspace with a 16 bit stack. The CPU will not
589 * restore the high word of ESP for us on executing iret... This is an
590 * "official" bug of all the x86-compatible CPUs, which we can work
591 * around to make dosemu and wine happy. We do this by preloading the
592 * high word of ESP with the high word of the userspace ESP while
593 * compensating for the offset by changing to the ESPFIX segment with
594 * a base address that matches for the difference.
595 */
596#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
597 mov %esp, %edx /* load kernel esp */
598 mov PT_OLDESP(%esp), %eax /* load userspace esp */
599 mov %dx, %ax /* eax: new kernel esp */
600 sub %eax, %edx /* offset (low word is 0) */
601 shr $16, %edx
602 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
603 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
604 pushl $__ESPFIX_SS
605 pushl %eax /* new kernel esp */
606 /* 1033 /*
607 * Disable interrupts, but do not irqtrace this section: we 1034 * The stack-frame here is the one that iret faulted on, so its a
608 * will soon execute iret and the tracer was already set to 1035 * return-to-user frame. We are on kernel-cr3 because we come here from
609 * the irqstate after the IRET: 1036 * the fixup code. This confuses the CR3 checker, so switch to user-cr3
1037 * as the checker expects it.
610 */ 1038 */
611 DISABLE_INTERRUPTS(CLBR_ANY) 1039 pushl %eax
612 lss (%esp), %esp /* switch to espfix segment */ 1040 SWITCH_TO_USER_CR3 scratch_reg=%eax
613 jmp .Lrestore_nocheck 1041 popl %eax
614#endif 1042#endif
1043
1044 jmp common_exception
1045.previous
1046 _ASM_EXTABLE(.Lirq_return, iret_exc)
615ENDPROC(entry_INT80_32) 1047ENDPROC(entry_INT80_32)
616 1048
617.macro FIXUP_ESPFIX_STACK 1049.macro FIXUP_ESPFIX_STACK
@@ -671,7 +1103,8 @@ END(irq_entries_start)
671common_interrupt: 1103common_interrupt:
672 ASM_CLAC 1104 ASM_CLAC
673 addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ 1105 addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
674 SAVE_ALL 1106
1107 SAVE_ALL switch_stacks=1
675 ENCODE_FRAME_POINTER 1108 ENCODE_FRAME_POINTER
676 TRACE_IRQS_OFF 1109 TRACE_IRQS_OFF
677 movl %esp, %eax 1110 movl %esp, %eax
@@ -679,16 +1112,16 @@ common_interrupt:
679 jmp ret_from_intr 1112 jmp ret_from_intr
680ENDPROC(common_interrupt) 1113ENDPROC(common_interrupt)
681 1114
682#define BUILD_INTERRUPT3(name, nr, fn) \ 1115#define BUILD_INTERRUPT3(name, nr, fn) \
683ENTRY(name) \ 1116ENTRY(name) \
684 ASM_CLAC; \ 1117 ASM_CLAC; \
685 pushl $~(nr); \ 1118 pushl $~(nr); \
686 SAVE_ALL; \ 1119 SAVE_ALL switch_stacks=1; \
687 ENCODE_FRAME_POINTER; \ 1120 ENCODE_FRAME_POINTER; \
688 TRACE_IRQS_OFF \ 1121 TRACE_IRQS_OFF \
689 movl %esp, %eax; \ 1122 movl %esp, %eax; \
690 call fn; \ 1123 call fn; \
691 jmp ret_from_intr; \ 1124 jmp ret_from_intr; \
692ENDPROC(name) 1125ENDPROC(name)
693 1126
694#define BUILD_INTERRUPT(name, nr) \ 1127#define BUILD_INTERRUPT(name, nr) \
@@ -920,16 +1353,20 @@ common_exception:
920 pushl %es 1353 pushl %es
921 pushl %ds 1354 pushl %ds
922 pushl %eax 1355 pushl %eax
1356 movl $(__USER_DS), %eax
1357 movl %eax, %ds
1358 movl %eax, %es
1359 movl $(__KERNEL_PERCPU), %eax
1360 movl %eax, %fs
923 pushl %ebp 1361 pushl %ebp
924 pushl %edi 1362 pushl %edi
925 pushl %esi 1363 pushl %esi
926 pushl %edx 1364 pushl %edx
927 pushl %ecx 1365 pushl %ecx
928 pushl %ebx 1366 pushl %ebx
1367 SWITCH_TO_KERNEL_STACK
929 ENCODE_FRAME_POINTER 1368 ENCODE_FRAME_POINTER
930 cld 1369 cld
931 movl $(__KERNEL_PERCPU), %ecx
932 movl %ecx, %fs
933 UNWIND_ESPFIX_STACK 1370 UNWIND_ESPFIX_STACK
934 GS_TO_REG %ecx 1371 GS_TO_REG %ecx
935 movl PT_GS(%esp), %edi # get the function address 1372 movl PT_GS(%esp), %edi # get the function address
@@ -937,9 +1374,6 @@ common_exception:
937 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart 1374 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
938 REG_TO_PTGS %ecx 1375 REG_TO_PTGS %ecx
939 SET_KERNEL_GS %ecx 1376 SET_KERNEL_GS %ecx
940 movl $(__USER_DS), %ecx
941 movl %ecx, %ds
942 movl %ecx, %es
943 TRACE_IRQS_OFF 1377 TRACE_IRQS_OFF
944 movl %esp, %eax # pt_regs pointer 1378 movl %esp, %eax # pt_regs pointer
945 CALL_NOSPEC %edi 1379 CALL_NOSPEC %edi
@@ -948,40 +1382,12 @@ END(common_exception)
948 1382
949ENTRY(debug) 1383ENTRY(debug)
950 /* 1384 /*
951 * #DB can happen at the first instruction of 1385 * Entry from sysenter is now handled in common_exception
952 * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
953 * happens, then we will be running on a very small stack. We
954 * need to detect this condition and switch to the thread
955 * stack before calling any C code at all.
956 *
957 * If you edit this code, keep in mind that NMIs can happen in here.
958 */ 1386 */
959 ASM_CLAC 1387 ASM_CLAC
960 pushl $-1 # mark this as an int 1388 pushl $-1 # mark this as an int
961 SAVE_ALL 1389 pushl $do_debug
962 ENCODE_FRAME_POINTER 1390 jmp common_exception
963 xorl %edx, %edx # error code 0
964 movl %esp, %eax # pt_regs pointer
965
966 /* Are we currently on the SYSENTER stack? */
967 movl PER_CPU_VAR(cpu_entry_area), %ecx
968 addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
969 subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
970 cmpl $SIZEOF_entry_stack, %ecx
971 jb .Ldebug_from_sysenter_stack
972
973 TRACE_IRQS_OFF
974 call do_debug
975 jmp ret_from_exception
976
977.Ldebug_from_sysenter_stack:
978 /* We're on the SYSENTER stack. Switch off. */
979 movl %esp, %ebx
980 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
981 TRACE_IRQS_OFF
982 call do_debug
983 movl %ebx, %esp
984 jmp ret_from_exception
985END(debug) 1391END(debug)
986 1392
987/* 1393/*
@@ -993,6 +1399,7 @@ END(debug)
993 */ 1399 */
994ENTRY(nmi) 1400ENTRY(nmi)
995 ASM_CLAC 1401 ASM_CLAC
1402
996#ifdef CONFIG_X86_ESPFIX32 1403#ifdef CONFIG_X86_ESPFIX32
997 pushl %eax 1404 pushl %eax
998 movl %ss, %eax 1405 movl %ss, %eax
@@ -1002,7 +1409,7 @@ ENTRY(nmi)
1002#endif 1409#endif
1003 1410
1004 pushl %eax # pt_regs->orig_ax 1411 pushl %eax # pt_regs->orig_ax
1005 SAVE_ALL 1412 SAVE_ALL_NMI cr3_reg=%edi
1006 ENCODE_FRAME_POINTER 1413 ENCODE_FRAME_POINTER
1007 xorl %edx, %edx # zero error code 1414 xorl %edx, %edx # zero error code
1008 movl %esp, %eax # pt_regs pointer 1415 movl %esp, %eax # pt_regs pointer
@@ -1016,7 +1423,7 @@ ENTRY(nmi)
1016 1423
1017 /* Not on SYSENTER stack. */ 1424 /* Not on SYSENTER stack. */
1018 call do_nmi 1425 call do_nmi
1019 jmp .Lrestore_all_notrace 1426 jmp .Lnmi_return
1020 1427
1021.Lnmi_from_sysenter_stack: 1428.Lnmi_from_sysenter_stack:
1022 /* 1429 /*
@@ -1027,7 +1434,11 @@ ENTRY(nmi)
1027 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp 1434 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1028 call do_nmi 1435 call do_nmi
1029 movl %ebx, %esp 1436 movl %ebx, %esp
1030 jmp .Lrestore_all_notrace 1437
1438.Lnmi_return:
1439 CHECK_AND_APPLY_ESPFIX
1440 RESTORE_ALL_NMI cr3_reg=%edi pop=4
1441 jmp .Lirq_return
1031 1442
1032#ifdef CONFIG_X86_ESPFIX32 1443#ifdef CONFIG_X86_ESPFIX32
1033.Lnmi_espfix_stack: 1444.Lnmi_espfix_stack:
@@ -1042,12 +1453,12 @@ ENTRY(nmi)
1042 pushl 16(%esp) 1453 pushl 16(%esp)
1043 .endr 1454 .endr
1044 pushl %eax 1455 pushl %eax
1045 SAVE_ALL 1456 SAVE_ALL_NMI cr3_reg=%edi
1046 ENCODE_FRAME_POINTER 1457 ENCODE_FRAME_POINTER
1047 FIXUP_ESPFIX_STACK # %eax == %esp 1458 FIXUP_ESPFIX_STACK # %eax == %esp
1048 xorl %edx, %edx # zero error code 1459 xorl %edx, %edx # zero error code
1049 call do_nmi 1460 call do_nmi
1050 RESTORE_REGS 1461 RESTORE_ALL_NMI cr3_reg=%edi
1051 lss 12+4(%esp), %esp # back to espfix stack 1462 lss 12+4(%esp), %esp # back to espfix stack
1052 jmp .Lirq_return 1463 jmp .Lirq_return
1053#endif 1464#endif
@@ -1056,7 +1467,8 @@ END(nmi)
1056ENTRY(int3) 1467ENTRY(int3)
1057 ASM_CLAC 1468 ASM_CLAC
1058 pushl $-1 # mark this as an int 1469 pushl $-1 # mark this as an int
1059 SAVE_ALL 1470
1471 SAVE_ALL switch_stacks=1
1060 ENCODE_FRAME_POINTER 1472 ENCODE_FRAME_POINTER
1061 TRACE_IRQS_OFF 1473 TRACE_IRQS_OFF
1062 xorl %edx, %edx # zero error code 1474 xorl %edx, %edx # zero error code
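Editorial aside, not part of the patch: the entry_32.S changes above record entry conditions in the unused upper bits of the 32-bit CS slot of pt_regs (CS itself is only 16 bits wide), using CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3, and clear them again on the iret path. The C sketch below restates that scheme; the helper names and the bool-based interface are illustrative assumptions, not kernel code.

/*
 * Illustrative sketch only: bits 30/31 of the CS dword are free, so the
 * entry code can use them as flags and must consume them before iret.
 */
#include <stdbool.h>
#include <stdint.h>

#define CS_FROM_ENTRY_STACK	(1u << 31)
#define CS_FROM_USER_CR3	(1u << 30)

static inline void mark_entry(uint32_t *cs_slot, bool on_entry_stack, bool user_cr3)
{
	*cs_slot &= 0x0000ffff;			/* hardware may leave junk in the high half */
	if (on_entry_stack)
		*cs_slot |= CS_FROM_ENTRY_STACK;
	if (user_cr3)
		*cs_slot |= CS_FROM_USER_CR3;
}

static inline bool need_switch_back_to_entry_stack(uint32_t *cs_slot)
{
	bool ret = *cs_slot & CS_FROM_ENTRY_STACK;

	*cs_slot &= ~CS_FROM_ENTRY_STACK;	/* consume the marker */
	return ret;
}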
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 7fff98fa5855..b5c60faf8429 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -219,6 +219,7 @@
219#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ 219#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
220#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ 220#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
221#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ 221#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
222#define X86_FEATURE_IBRS_ENHANCED ( 7*32+29) /* Enhanced IBRS */
222 223
223/* Virtualization flags: Linux defined, word 8 */ 224/* Virtualization flags: Linux defined, word 8 */
224#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ 225#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bbc796eb0a3b..eeeb9289c764 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -71,12 +71,7 @@ struct ldt_struct {
71 71
72static inline void *ldt_slot_va(int slot) 72static inline void *ldt_slot_va(int slot)
73{ 73{
74#ifdef CONFIG_X86_64
75 return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); 74 return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
76#else
77 BUG();
78 return (void *)fix_to_virt(FIX_HOLE);
79#endif
80} 75}
81 76
82/* 77/*
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f6f6c63da62f..fd2a8c1b88bc 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -214,7 +214,7 @@ enum spectre_v2_mitigation {
214 SPECTRE_V2_RETPOLINE_MINIMAL_AMD, 214 SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
215 SPECTRE_V2_RETPOLINE_GENERIC, 215 SPECTRE_V2_RETPOLINE_GENERIC,
216 SPECTRE_V2_RETPOLINE_AMD, 216 SPECTRE_V2_RETPOLINE_AMD,
217 SPECTRE_V2_IBRS, 217 SPECTRE_V2_IBRS_ENHANCED,
218}; 218};
219 219
220/* The Speculative Store Bypass disable variants */ 220/* The Speculative Store Bypass disable variants */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 685ffe8a0eaf..c399ea5eea41 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -19,6 +19,9 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte)
19 19
20static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 20static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
21{ 21{
22#ifdef CONFIG_PAGE_TABLE_ISOLATION
23 pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd);
24#endif
22 *pmdp = pmd; 25 *pmdp = pmd;
23} 26}
24 27
@@ -58,6 +61,9 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
58#ifdef CONFIG_SMP 61#ifdef CONFIG_SMP
59static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) 62static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
60{ 63{
64#ifdef CONFIG_PAGE_TABLE_ISOLATION
65 pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0));
66#endif
61 return __pmd(xchg((pmdval_t *)xp, 0)); 67 return __pmd(xchg((pmdval_t *)xp, 0));
62} 68}
63#else 69#else
@@ -67,6 +73,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
67#ifdef CONFIG_SMP 73#ifdef CONFIG_SMP
68static inline pud_t native_pudp_get_and_clear(pud_t *xp) 74static inline pud_t native_pudp_get_and_clear(pud_t *xp)
69{ 75{
76#ifdef CONFIG_PAGE_TABLE_ISOLATION
77 pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0));
78#endif
70 return __pud(xchg((pudval_t *)xp, 0)); 79 return __pud(xchg((pudval_t *)xp, 0));
71} 80}
72#else 81#else
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index f982ef808e7e..6deb6cd236e3 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -35,4 +35,7 @@ typedef union {
35 35
36#define PTRS_PER_PTE 1024 36#define PTRS_PER_PTE 1024
37 37
38/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
39#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
40
38#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */ 41#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index f24df59c40b2..f2ca3139ca22 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -98,6 +98,9 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
98 98
99static inline void native_set_pud(pud_t *pudp, pud_t pud) 99static inline void native_set_pud(pud_t *pudp, pud_t pud)
100{ 100{
101#ifdef CONFIG_PAGE_TABLE_ISOLATION
102 pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
103#endif
101 set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); 104 set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
102} 105}
103 106
@@ -229,6 +232,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
229{ 232{
230 union split_pud res, *orig = (union split_pud *)pudp; 233 union split_pud res, *orig = (union split_pud *)pudp;
231 234
235#ifdef CONFIG_PAGE_TABLE_ISOLATION
236 pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
237#endif
238
232 /* xchg acts as a barrier before setting of the high bits */ 239 /* xchg acts as a barrier before setting of the high bits */
233 res.pud_low = xchg(&orig->pud_low, 0); 240 res.pud_low = xchg(&orig->pud_low, 0);
234 res.pud_high = orig->pud_high; 241 res.pud_high = orig->pud_high;
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 6a59a6d0cc50..858358a82b14 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -21,9 +21,10 @@ typedef union {
21#endif /* !__ASSEMBLY__ */ 21#endif /* !__ASSEMBLY__ */
22 22
23#ifdef CONFIG_PARAVIRT 23#ifdef CONFIG_PARAVIRT
24#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) 24#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \
25 (pv_info.shared_kernel_pmd)))
25#else 26#else
26#define SHARED_KERNEL_PMD 1 27#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
27#endif 28#endif
28 29
29/* 30/*
@@ -45,5 +46,6 @@ typedef union {
45#define PTRS_PER_PTE 512 46#define PTRS_PER_PTE 512
46 47
47#define MAX_POSSIBLE_PHYSMEM_BITS 36 48#define MAX_POSSIBLE_PHYSMEM_BITS 36
49#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
48 50
49#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ 51#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5715647fc4fe..a1cb3339da8d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -30,11 +30,14 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
30void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); 30void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
31void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); 31void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
32void ptdump_walk_pgd_level_checkwx(void); 32void ptdump_walk_pgd_level_checkwx(void);
33void ptdump_walk_user_pgd_level_checkwx(void);
33 34
34#ifdef CONFIG_DEBUG_WX 35#ifdef CONFIG_DEBUG_WX
35#define debug_checkwx() ptdump_walk_pgd_level_checkwx() 36#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
37#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
36#else 38#else
37#define debug_checkwx() do { } while (0) 39#define debug_checkwx() do { } while (0)
40#define debug_checkwx_user() do { } while (0)
38#endif 41#endif
39 42
40/* 43/*
@@ -640,8 +643,31 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
640 643
641pmd_t *populate_extra_pmd(unsigned long vaddr); 644pmd_t *populate_extra_pmd(unsigned long vaddr);
642pte_t *populate_extra_pte(unsigned long vaddr); 645pte_t *populate_extra_pte(unsigned long vaddr);
646
647#ifdef CONFIG_PAGE_TABLE_ISOLATION
648pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);
649
650/*
651 * Take a PGD location (pgdp) and a pgd value that needs to be set there.
652 * Populates the user and returns the resulting PGD that must be set in
653 * the kernel copy of the page tables.
654 */
655static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
656{
657 if (!static_cpu_has(X86_FEATURE_PTI))
658 return pgd;
659 return __pti_set_user_pgtbl(pgdp, pgd);
660}
661#else /* CONFIG_PAGE_TABLE_ISOLATION */
662static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
663{
664 return pgd;
665}
666#endif /* CONFIG_PAGE_TABLE_ISOLATION */
667
643#endif /* __ASSEMBLY__ */ 668#endif /* __ASSEMBLY__ */
644 669
670
645#ifdef CONFIG_X86_32 671#ifdef CONFIG_X86_32
646# include <asm/pgtable_32.h> 672# include <asm/pgtable_32.h>
647#else 673#else
@@ -1154,6 +1180,70 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
1154 } 1180 }
1155} 1181}
1156#endif 1182#endif
1183/*
1184 * Page table pages are page-aligned. The lower half of the top
1185 * level is used for userspace and the top half for the kernel.
1186 *
1187 * Returns true for parts of the PGD that map userspace and
1188 * false for the parts that map the kernel.
1189 */
1190static inline bool pgdp_maps_userspace(void *__ptr)
1191{
1192 unsigned long ptr = (unsigned long)__ptr;
1193
1194 return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
1195}
1196
1197static inline int pgd_large(pgd_t pgd) { return 0; }
1198
1199#ifdef CONFIG_PAGE_TABLE_ISOLATION
1200/*
1201 * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
1202 * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
1203 * the user one is in the last 4k. To switch between them, you
1204 * just need to flip the 12th bit in their addresses.
1205 */
1206#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
1207
1208/*
1209 * This generates better code than the inline assembly in
1210 * __set_bit().
1211 */
1212static inline void *ptr_set_bit(void *ptr, int bit)
1213{
1214 unsigned long __ptr = (unsigned long)ptr;
1215
1216 __ptr |= BIT(bit);
1217 return (void *)__ptr;
1218}
1219static inline void *ptr_clear_bit(void *ptr, int bit)
1220{
1221 unsigned long __ptr = (unsigned long)ptr;
1222
1223 __ptr &= ~BIT(bit);
1224 return (void *)__ptr;
1225}
1226
1227static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
1228{
1229 return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1230}
1231
1232static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
1233{
1234 return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1235}
1236
1237static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
1238{
1239 return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1240}
1241
1242static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
1243{
1244 return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1245}
1246#endif /* CONFIG_PAGE_TABLE_ISOLATION */
1157 1247
1158/* 1248/*
1159 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 1249 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
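Editorial aside, not part of the patch: pgdp_maps_userspace(), added above, decides from a PGD pointer's offset within its page whether the entry maps user or kernel space, with PGD_KERNEL_START supplied per page-table layout. The sketch below redoes that arithmetic outside the kernel with assumed values (a PAE layout: PGDIR_SHIFT = 30, 8-byte entries, and the common CONFIG_PAGE_OFFSET of 0xC0000000); all macros here are local stand-ins.

/* Illustrative sketch only: with these assumptions PGD_KERNEL_START == 3. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK		(~0xfffUL)
#define PGDIR_SHIFT		30
#define PAGE_OFFSET		0xC0000000UL
#define PGD_ENTRY_SIZE		8	/* assumed sizeof(pgd_t) under PAE */
#define PGD_KERNEL_START	(PAGE_OFFSET >> PGDIR_SHIFT)

static int pgdp_maps_userspace(uintptr_t pgdp)
{
	return ((pgdp & ~PAGE_MASK) / PGD_ENTRY_SIZE) < PGD_KERNEL_START;
}

int main(void)
{
	uintptr_t pgd_page = 0x1000;	/* any page-aligned PGD address */

	/* Prints "user" for entries 0..2 and "kernel" for entry 3. */
	for (int i = 0; i < 4; i++)
		printf("pgd[%d]: %s\n", i,
		       pgdp_maps_userspace(pgd_page + i * PGD_ENTRY_SIZE) ?
		       "user" : "kernel");
	return 0;
}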
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 88a056b01db4..b3ec519e3982 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -34,8 +34,6 @@ static inline void check_pgt_cache(void) { }
34void paging_init(void); 34void paging_init(void);
35void sync_initial_page_table(void); 35void sync_initial_page_table(void);
36 36
37static inline int pgd_large(pgd_t pgd) { return 0; }
38
39/* 37/*
40 * Define this if things work differently on an i386 and an i486: 38 * Define this if things work differently on an i386 and an i486:
41 * it will (on an i486) warn about kernel memory accesses that are 39 * it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index d9a001a4a872..b0bc0fff5f1f 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -50,13 +50,18 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
50 ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ 50 ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
51 & PMD_MASK) 51 & PMD_MASK)
52 52
53#define PKMAP_BASE \ 53#define LDT_BASE_ADDR \
54 ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) 54 ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
55 55
56#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE)
57
58#define PKMAP_BASE \
59 ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK)
60
56#ifdef CONFIG_HIGHMEM 61#ifdef CONFIG_HIGHMEM
57# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) 62# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
58#else 63#else
59# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) 64# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE)
60#endif 65#endif
61 66
62#define MODULES_VADDR VMALLOC_START 67#define MODULES_VADDR VMALLOC_START
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 3c5385f9a88f..acb6970e7bcf 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -132,90 +132,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
132#endif 132#endif
133} 133}
134 134
135#ifdef CONFIG_PAGE_TABLE_ISOLATION
136/*
137 * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
138 * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
139 * the user one is in the last 4k. To switch between them, you
140 * just need to flip the 12th bit in their addresses.
141 */
142#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
143
144/*
145 * This generates better code than the inline assembly in
146 * __set_bit().
147 */
148static inline void *ptr_set_bit(void *ptr, int bit)
149{
150 unsigned long __ptr = (unsigned long)ptr;
151
152 __ptr |= BIT(bit);
153 return (void *)__ptr;
154}
155static inline void *ptr_clear_bit(void *ptr, int bit)
156{
157 unsigned long __ptr = (unsigned long)ptr;
158
159 __ptr &= ~BIT(bit);
160 return (void *)__ptr;
161}
162
163static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
164{
165 return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
166}
167
168static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
169{
170 return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
171}
172
173static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
174{
175 return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
176}
177
178static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
179{
180 return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
181}
182#endif /* CONFIG_PAGE_TABLE_ISOLATION */
183
184/*
185 * Page table pages are page-aligned. The lower half of the top
186 * level is used for userspace and the top half for the kernel.
187 *
188 * Returns true for parts of the PGD that map userspace and
189 * false for the parts that map the kernel.
190 */
191static inline bool pgdp_maps_userspace(void *__ptr)
192{
193 unsigned long ptr = (unsigned long)__ptr;
194
195 return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
196}
197
198#ifdef CONFIG_PAGE_TABLE_ISOLATION
199pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
200
201/*
202 * Take a PGD location (pgdp) and a pgd value that needs to be set there.
203 * Populates the user and returns the resulting PGD that must be set in
204 * the kernel copy of the page tables.
205 */
206static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
207{
208 if (!static_cpu_has(X86_FEATURE_PTI))
209 return pgd;
210 return __pti_set_user_pgd(pgdp, pgd);
211}
212#else
213static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
214{
215 return pgd;
216}
217#endif
218
219static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) 135static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
220{ 136{
221 pgd_t pgd; 137 pgd_t pgd;
@@ -226,7 +142,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
226 } 142 }
227 143
228 pgd = native_make_pgd(native_p4d_val(p4d)); 144 pgd = native_make_pgd(native_p4d_val(p4d));
229 pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); 145 pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
230 *p4dp = native_make_p4d(native_pgd_val(pgd)); 146 *p4dp = native_make_p4d(native_pgd_val(pgd));
231} 147}
232 148
@@ -237,7 +153,7 @@ static inline void native_p4d_clear(p4d_t *p4d)
237 153
238static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) 154static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
239{ 155{
240 *pgdp = pti_set_user_pgd(pgdp, pgd); 156 *pgdp = pti_set_user_pgtbl(pgdp, pgd);
241} 157}
242 158
243static inline void native_pgd_clear(pgd_t *pgd) 159static inline void native_pgd_clear(pgd_t *pgd)
@@ -255,7 +171,6 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
255/* 171/*
256 * Level 4 access. 172 * Level 4 access.
257 */ 173 */
258static inline int pgd_large(pgd_t pgd) { return 0; }
259#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) 174#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
260 175
261/* PUD - Level3 access */ 176/* PUD - Level3 access */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 054765ab2da2..04edd2d58211 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -115,6 +115,7 @@ extern unsigned int ptrs_per_p4d;
115#define LDT_PGD_ENTRY_L5 -112UL 115#define LDT_PGD_ENTRY_L5 -112UL
116#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) 116#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
117#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) 117#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
118#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE)
118 119
119#define __VMALLOC_BASE_L4 0xffffc90000000000UL 120#define __VMALLOC_BASE_L4 0xffffc90000000000UL
120#define __VMALLOC_BASE_L5 0xffa0000000000000UL 121#define __VMALLOC_BASE_L5 0xffa0000000000000UL
@@ -153,4 +154,6 @@ extern unsigned int ptrs_per_p4d;
153 154
154#define EARLY_DYNAMIC_PAGE_TABLES 64 155#define EARLY_DYNAMIC_PAGE_TABLES 64
155 156
157#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
158
156#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ 159#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 99fff853c944..b64acb08a62b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -50,6 +50,7 @@
50#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 50#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
51#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) 51#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
52#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) 52#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
53#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
53#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 54#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
54#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 55#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
55#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 56#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
@@ -266,14 +267,37 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
266 267
267typedef struct { pgdval_t pgd; } pgd_t; 268typedef struct { pgdval_t pgd; } pgd_t;
268 269
270#ifdef CONFIG_X86_PAE
271
272/*
273 * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
274 * use it here.
275 */
276
277#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK)
278#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)
279
280/*
281 * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
282 * All other bits are Reserved MBZ
283 */
284#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
285 _PAGE_PWT | _PAGE_PCD | \
286 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)
287
288#else
289/* No need to mask any bits for !PAE */
290#define PGD_ALLOWED_BITS (~0ULL)
291#endif
292
269static inline pgd_t native_make_pgd(pgdval_t val) 293static inline pgd_t native_make_pgd(pgdval_t val)
270{ 294{
271 return (pgd_t) { val }; 295 return (pgd_t) { val & PGD_ALLOWED_BITS };
272} 296}
273 297
274static inline pgdval_t native_pgd_val(pgd_t pgd) 298static inline pgdval_t native_pgd_val(pgd_t pgd)
275{ 299{
276 return pgd.pgd; 300 return pgd.pgd & PGD_ALLOWED_BITS;
277} 301}
278 302
279static inline pgdval_t pgd_flags(pgd_t pgd) 303static inline pgdval_t pgd_flags(pgd_t pgd)
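Editorial aside, not part of the patch: under PAE the top-level entries reserve most flag bits, so native_make_pgd()/native_pgd_val() above mask everything except the physical address, Present, PWT, PCD and the three software bits. The sketch below shows the effect of that filtering with assumed constants (bit positions and a __PHYSICAL_MASK_SHIFT of 44 are assumptions made for illustration, not taken from this patch).

/* Illustrative sketch only: reserved bits such as Accessed (5) and NX (63) are dropped. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK		(~0xfffULL)
#define __PHYSICAL_MASK_SHIFT	44	/* assumed 32-bit PAE value */

#define _PAGE_PRESENT	(1ULL << 0)
#define _PAGE_PWT	(1ULL << 3)
#define _PAGE_PCD	(1ULL << 4)
#define _PAGE_ACCESSED	(1ULL << 5)
#define _PAGE_SOFTW1	(1ULL << 9)
#define _PAGE_SOFTW2	(1ULL << 10)
#define _PAGE_SOFTW3	(1ULL << 11)
#define _PAGE_NX	(1ULL << 63)

#define PGD_PAE_PHYS_MASK  (((1ULL << __PHYSICAL_MASK_SHIFT) - 1) & PAGE_MASK)
#define PGD_ALLOWED_BITS   (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | _PAGE_PWT | \
			    _PAGE_PCD | _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

int main(void)
{
	uint64_t raw = 0x12345000ULL | _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NX;
	uint64_t pgd = raw & PGD_ALLOWED_BITS;	/* what the make/val helpers keep */

	printf("raw %#llx -> stored %#llx\n",
	       (unsigned long long)raw, (unsigned long long)pgd);
	return 0;
}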
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 625a52a5594f..02c2cbda4a74 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -39,10 +39,6 @@
39#define CR3_PCID_MASK 0xFFFull 39#define CR3_PCID_MASK 0xFFFull
40#define CR3_NOFLUSH BIT_ULL(63) 40#define CR3_NOFLUSH BIT_ULL(63)
41 41
42#ifdef CONFIG_PAGE_TABLE_ISOLATION
43# define X86_CR3_PTI_PCID_USER_BIT 11
44#endif
45
46#else 42#else
47/* 43/*
48 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save 44 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -53,4 +49,8 @@
53#define CR3_NOFLUSH 0 49#define CR3_NOFLUSH 0
54#endif 50#endif
55 51
52#ifdef CONFIG_PAGE_TABLE_ISOLATION
53# define X86_CR3_PTI_PCID_USER_BIT 11
54#endif
55
56#endif /* _ASM_X86_PROCESSOR_FLAGS_H */ 56#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cfd29ee8c3da..59663c08c949 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -966,6 +966,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
966 966
967extern unsigned long arch_align_stack(unsigned long sp); 967extern unsigned long arch_align_stack(unsigned long sp);
968extern void free_init_pages(char *what, unsigned long begin, unsigned long end); 968extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
969extern void free_kernel_image_pages(void *begin, void *end);
969 970
970void default_idle(void); 971void default_idle(void);
971#ifdef CONFIG_XEN 972#ifdef CONFIG_XEN
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
index 38a17f1d5c9d..5df09a0b80b8 100644
--- a/arch/x86/include/asm/pti.h
+++ b/arch/x86/include/asm/pti.h
@@ -6,10 +6,9 @@
6#ifdef CONFIG_PAGE_TABLE_ISOLATION 6#ifdef CONFIG_PAGE_TABLE_ISOLATION
7extern void pti_init(void); 7extern void pti_init(void);
8extern void pti_check_boottime_disable(void); 8extern void pti_check_boottime_disable(void);
9extern void pti_clone_kernel_text(void); 9extern void pti_finalize(void);
10#else 10#else
11static inline void pti_check_boottime_disable(void) { } 11static inline void pti_check_boottime_disable(void) { }
12static inline void pti_clone_kernel_text(void) { }
13#endif 12#endif
14 13
15#endif /* __ASSEMBLY__ */ 14#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 5c019d23d06b..4a911a382ade 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -7,6 +7,7 @@
7 7
8extern char __brk_base[], __brk_limit[]; 8extern char __brk_base[], __brk_limit[];
9extern struct exception_table_entry __stop___ex_table[]; 9extern struct exception_table_entry __stop___ex_table[];
10extern char __end_rodata_aligned[];
10 11
11#if defined(CONFIG_X86_64) 12#if defined(CONFIG_X86_64)
12extern char __end_rodata_hpage_align[]; 13extern char __end_rodata_hpage_align[];
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index bd090367236c..34cffcef7375 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages);
46int set_memory_4k(unsigned long addr, int numpages); 46int set_memory_4k(unsigned long addr, int numpages);
47int set_memory_encrypted(unsigned long addr, int numpages); 47int set_memory_encrypted(unsigned long addr, int numpages);
48int set_memory_decrypted(unsigned long addr, int numpages); 48int set_memory_decrypted(unsigned long addr, int numpages);
49int set_memory_np_noalias(unsigned long addr, int numpages);
49 50
50int set_memory_array_uc(unsigned long *addr, int addrinarray); 51int set_memory_array_uc(unsigned long *addr, int addrinarray);
51int set_memory_array_wc(unsigned long *addr, int addrinarray); 52int set_memory_array_wc(unsigned long *addr, int addrinarray);
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index eb5f7999a893..36bd243843d6 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -87,15 +87,25 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
87#endif 87#endif
88 88
89/* This is used when switching tasks or entering/exiting vm86 mode. */ 89/* This is used when switching tasks or entering/exiting vm86 mode. */
90static inline void update_sp0(struct task_struct *task) 90static inline void update_task_stack(struct task_struct *task)
91{ 91{
92 /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ 92 /* sp0 always points to the entry trampoline stack, which is constant: */
93#ifdef CONFIG_X86_32 93#ifdef CONFIG_X86_32
94 load_sp0(task->thread.sp0); 94 if (static_cpu_has(X86_FEATURE_XENPV))
95 load_sp0(task->thread.sp0);
96 else
97 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
95#else 98#else
99 /*
100 * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
101 * doesn't work on x86-32 because sp1 and
102 * cpu_current_top_of_stack have different values (because of
103 * the non-zero stack-padding on 32bit).
104 */
96 if (static_cpu_has(X86_FEATURE_XENPV)) 105 if (static_cpu_has(X86_FEATURE_XENPV))
97 load_sp0(task_top_of_stack(task)); 106 load_sp0(task_top_of_stack(task));
98#endif 107#endif
108
99} 109}
100 110
101#endif /* _ASM_X86_SWITCH_TO_H */ 111#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index dcb008c320fe..01de31db300d 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -103,4 +103,9 @@ void common(void) {
103 OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); 103 OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
104 OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); 104 OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
105 DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); 105 DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
106 DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
107
108 /* Offset for sp0 and sp1 into the tss_struct */
109 OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
110 OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
106} 111}
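
MASK_entry_stack is ~(sizeof(struct entry_stack) - 1): with a power-of-two sized and size-aligned entry stack, masking any stack pointer with it recovers the stack base. A minimal sketch, using a stand-in struct entry_stack rather than the kernel's definition:

/* "sp & ~(size - 1)" yields the base of a power-of-two sized, aligned
 * stack. struct entry_stack here is a 4 KiB stand-in. */
#include <stdint.h>
#include <stdio.h>

struct entry_stack { uint32_t words[1024]; };   /* 4 KiB stand-in */

#define MASK_entry_stack (~(sizeof(struct entry_stack) - 1))

int main(void)
{
        uintptr_t base = 0xa000;                /* hypothetical, size-aligned base */
        uintptr_t sp   = base + 0x7f8;          /* somewhere inside the stack */

        printf("sp   = %#lx\n", (unsigned long)sp);
        printf("base = %#lx\n", (unsigned long)(sp & MASK_entry_stack));
        return 0;
}
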
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index a4a3be399f4b..82826f2275cc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -46,8 +46,14 @@ void foo(void)
46 OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); 46 OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
47 BLANK(); 47 BLANK();
48 48
49 /* Offset from the sysenter stack to tss.sp0 */ 49 /*
50 DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - 50 * Offset from the entry stack to task stack stored in TSS. Kernel entry
51 * happens on the per-cpu entry-stack, and the asm code switches to the
52 * task-stack pointer stored in x86_tss.sp1, which is a copy of
53 * task->thread.sp0 where entry code can find it.
54 */
55 DEFINE(TSS_entry2task_stack,
56 offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
51 offsetofend(struct cpu_entry_area, entry_stack_page.stack)); 57 offsetofend(struct cpu_entry_area, entry_stack_page.stack));
52 58
53#ifdef CONFIG_STACKPROTECTOR 59#ifdef CONFIG_STACKPROTECTOR
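
TSS_entry2task_stack is the distance from the end of the per-cpu entry stack to x86_tss.sp1 within the cpu_entry_area, so the 32-bit entry asm can locate the saved task-stack pointer from the entry-stack top with one addition. A standalone model, with made-up stand-in structs and the kernel-style offsetofend() macro:

/* Model of how TSS_entry2task_stack is derived. The structs below are
 * stand-ins with made-up sizes, not the real struct cpu_entry_area. */
#include <stddef.h>
#include <stdio.h>

/* Same shape as the kernel's offsetofend() */
#define offsetofend(TYPE, MEMBER) \
        (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct entry_stack_page { char stack[4096]; };
struct x86_hw_tss       { unsigned long sp0, sp1, sp2; };
struct tss_struct       { struct x86_hw_tss x86_tss; };

struct cpu_entry_area {                         /* simplified layout */
        struct entry_stack_page entry_stack_page;
        struct tss_struct       tss;
};

int main(void)
{
        long delta = (long)offsetof(struct cpu_entry_area, tss.x86_tss.sp1)
                   - (long)offsetofend(struct cpu_entry_area, entry_stack_page.stack);

        /* Entry asm can then do: task_sp_ptr = entry_stack_top + delta */
        printf("TSS_entry2task_stack (model) = %ld\n", delta);
        return 0;
}
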
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index b2dcd161f514..3b9405e7ba2b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -65,8 +65,6 @@ int main(void)
65#undef ENTRY 65#undef ENTRY
66 66
67 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 67 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
68 OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
69 OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
70 BLANK(); 68 BLANK();
71 69
72#ifdef CONFIG_STACKPROTECTOR 70#ifdef CONFIG_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 5c0ea39311fe..405a9a61bb89 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -130,6 +130,7 @@ static const char *spectre_v2_strings[] = {
130 [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", 130 [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
131 [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", 131 [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
132 [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", 132 [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
133 [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
133}; 134};
134 135
135#undef pr_fmt 136#undef pr_fmt
@@ -313,23 +314,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
313 return cmd; 314 return cmd;
314} 315}
315 316
316/* Check for Skylake-like CPUs (for RSB handling) */
317static bool __init is_skylake_era(void)
318{
319 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
320 boot_cpu_data.x86 == 6) {
321 switch (boot_cpu_data.x86_model) {
322 case INTEL_FAM6_SKYLAKE_MOBILE:
323 case INTEL_FAM6_SKYLAKE_DESKTOP:
324 case INTEL_FAM6_SKYLAKE_X:
325 case INTEL_FAM6_KABYLAKE_MOBILE:
326 case INTEL_FAM6_KABYLAKE_DESKTOP:
327 return true;
328 }
329 }
330 return false;
331}
332
333static void __init spectre_v2_select_mitigation(void) 317static void __init spectre_v2_select_mitigation(void)
334{ 318{
335 enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); 319 enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -349,6 +333,13 @@ static void __init spectre_v2_select_mitigation(void)
349 333
350 case SPECTRE_V2_CMD_FORCE: 334 case SPECTRE_V2_CMD_FORCE:
351 case SPECTRE_V2_CMD_AUTO: 335 case SPECTRE_V2_CMD_AUTO:
336 if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
337 mode = SPECTRE_V2_IBRS_ENHANCED;
338 /* Force it so VMEXIT will restore correctly */
339 x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
340 wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
341 goto specv2_set_mode;
342 }
352 if (IS_ENABLED(CONFIG_RETPOLINE)) 343 if (IS_ENABLED(CONFIG_RETPOLINE))
353 goto retpoline_auto; 344 goto retpoline_auto;
354 break; 345 break;
@@ -386,26 +377,20 @@ retpoline_auto:
386 setup_force_cpu_cap(X86_FEATURE_RETPOLINE); 377 setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
387 } 378 }
388 379
380specv2_set_mode:
389 spectre_v2_enabled = mode; 381 spectre_v2_enabled = mode;
390 pr_info("%s\n", spectre_v2_strings[mode]); 382 pr_info("%s\n", spectre_v2_strings[mode]);
391 383
392 /* 384 /*
393 * If neither SMEP nor PTI are available, there is a risk of 385 * If spectre v2 protection has been enabled, unconditionally fill
394 * hitting userspace addresses in the RSB after a context switch 386 * RSB during a context switch; this protects against two independent
395 * from a shallow call stack to a deeper one. To prevent this fill 387 * issues:
396 * the entire RSB, even when using IBRS.
397 * 388 *
398 * Skylake era CPUs have a separate issue with *underflow* of the 389 * - RSB underflow (and switch to BTB) on Skylake+
399 * RSB, when they will predict 'ret' targets from the generic BTB. 390 * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
400 * The proper mitigation for this is IBRS. If IBRS is not supported
401 * or deactivated in favour of retpolines the RSB fill on context
402 * switch is required.
403 */ 391 */
404 if ((!boot_cpu_has(X86_FEATURE_PTI) && 392 setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
405 !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { 393 pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
406 setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
407 pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
408 }
409 394
410 /* Initialize Indirect Branch Prediction Barrier if supported */ 395 /* Initialize Indirect Branch Prediction Barrier if supported */
411 if (boot_cpu_has(X86_FEATURE_IBPB)) { 396 if (boot_cpu_has(X86_FEATURE_IBPB)) {
@@ -415,9 +400,16 @@ retpoline_auto:
415 400
416 /* 401 /*
417 * Retpoline means the kernel is safe because it has no indirect 402 * Retpoline means the kernel is safe because it has no indirect
418 * branches. But firmware isn't, so use IBRS to protect that. 403 * branches. Enhanced IBRS protects firmware too, so enable restricted
404 * speculation around firmware calls only when Enhanced IBRS isn't
405 * supported.
406 *
407 * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
408 * the user might select retpoline on the kernel command line and if
409 * the CPU supports Enhanced IBRS, the kernel might unintentionally not
410 * enable IBRS around firmware calls.
419 */ 411 */
420 if (boot_cpu_has(X86_FEATURE_IBRS)) { 412 if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
421 setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); 413 setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
422 pr_info("Enabling Restricted Speculation for firmware calls\n"); 414 pr_info("Enabling Restricted Speculation for firmware calls\n");
423 } 415 }
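
Putting the bugs.c changes together: Enhanced IBRS wins under the auto/force selection, the RSB refill on context switch becomes unconditional once a spectre v2 mitigation is active, and IBRS around firmware calls is only forced when Enhanced IBRS is not the chosen mode. A condensed model of that control flow (just its shape, not the kernel function):

/* Sketch of the post-patch spectre_v2 selection flow. */
#include <stdbool.h>
#include <stdio.h>

enum mode { RETPOLINE, IBRS_ENHANCED };

static void select_mitigation(bool cpu_has_enhanced_ibrs, bool cpu_has_ibrs)
{
        enum mode mode;

        if (cpu_has_enhanced_ibrs)
                mode = IBRS_ENHANCED;   /* set SPEC_CTRL.IBRS once and keep it set */
        else
                mode = RETPOLINE;       /* fall back to the retpoline path */

        /* Unconditional with either mode, per the reworked comment above. */
        puts("Filling RSB on context switch");

        if (cpu_has_ibrs && mode != IBRS_ENHANCED)
                puts("Enabling Restricted Speculation for firmware calls");
}

int main(void)
{
        select_mitigation(true,  true);
        select_mitigation(false, true);
        return 0;
}
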
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index eb4cb3efd20e..df28e931d732 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1005,6 +1005,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
1005 !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) 1005 !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
1006 setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); 1006 setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
1007 1007
1008 if (ia32_cap & ARCH_CAP_IBRS_ALL)
1009 setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
1010
1008 if (x86_match_cpu(cpu_no_meltdown)) 1011 if (x86_match_cpu(cpu_no_meltdown))
1009 return; 1012 return;
1010 1013
@@ -1804,11 +1807,12 @@ void cpu_init(void)
1804 enter_lazy_tlb(&init_mm, curr); 1807 enter_lazy_tlb(&init_mm, curr);
1805 1808
1806 /* 1809 /*
1807 * Initialize the TSS. Don't bother initializing sp0, as the initial 1810 * Initialize the TSS. sp0 points to the entry trampoline stack
1808 * task never enters user mode. 1811 * regardless of what task is running.
1809 */ 1812 */
1810 set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); 1813 set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
1811 load_TR_desc(); 1814 load_TR_desc();
1815 load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
1812 1816
1813 load_mm_ldt(&init_mm); 1817 load_mm_ldt(&init_mm);
1814 1818
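
X86_FEATURE_IBRS_ENHANCED is now derived from the IBRS_ALL bit in IA32_ARCH_CAPABILITIES. A rough userspace probe for that bit, assuming MSR 0x10a with IBRS_ALL at bit 1 (the msr-index.h values of this era), the msr module loaded, and root privileges:

/* Check the bit that cpu_set_bug_bits() keys X86_FEATURE_IBRS_ENHANCED off.
 * MSR number and bit position are assumptions as noted above. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
#define ARCH_CAP_IBRS_ALL          (1ULL << 1)

int main(void)
{
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_ARCH_CAPABILITIES) != sizeof(val)) {
                perror("reading IA32_ARCH_CAPABILITIES");
                return 1;
        }
        printf("IBRS_ALL (Enhanced IBRS) %s\n",
               (val & ARCH_CAP_IBRS_ALL) ? "advertised" : "not advertised");
        close(fd);
        return 0;
}
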
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index abe6df15a8fb..30f9cb2c0b55 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -512,11 +512,18 @@ ENTRY(initial_code)
512ENTRY(setup_once_ref) 512ENTRY(setup_once_ref)
513 .long setup_once 513 .long setup_once
514 514
515#ifdef CONFIG_PAGE_TABLE_ISOLATION
516#define PGD_ALIGN (2 * PAGE_SIZE)
517#define PTI_USER_PGD_FILL 1024
518#else
519#define PGD_ALIGN (PAGE_SIZE)
520#define PTI_USER_PGD_FILL 0
521#endif
515/* 522/*
516 * BSS section 523 * BSS section
517 */ 524 */
518__PAGE_ALIGNED_BSS 525__PAGE_ALIGNED_BSS
519 .align PAGE_SIZE 526 .align PGD_ALIGN
520#ifdef CONFIG_X86_PAE 527#ifdef CONFIG_X86_PAE
521.globl initial_pg_pmd 528.globl initial_pg_pmd
522initial_pg_pmd: 529initial_pg_pmd:
@@ -526,14 +533,17 @@ initial_pg_pmd:
526initial_page_table: 533initial_page_table:
527 .fill 1024,4,0 534 .fill 1024,4,0
528#endif 535#endif
536 .align PGD_ALIGN
529initial_pg_fixmap: 537initial_pg_fixmap:
530 .fill 1024,4,0 538 .fill 1024,4,0
531.globl empty_zero_page
532empty_zero_page:
533 .fill 4096,1,0
534.globl swapper_pg_dir 539.globl swapper_pg_dir
540 .align PGD_ALIGN
535swapper_pg_dir: 541swapper_pg_dir:
536 .fill 1024,4,0 542 .fill 1024,4,0
543 .fill PTI_USER_PGD_FILL,4,0
544.globl empty_zero_page
545empty_zero_page:
546 .fill 4096,1,0
537EXPORT_SYMBOL(empty_zero_page) 547EXPORT_SYMBOL(empty_zero_page)
538 548
539/* 549/*
@@ -542,7 +552,7 @@ EXPORT_SYMBOL(empty_zero_page)
542#ifdef CONFIG_X86_PAE 552#ifdef CONFIG_X86_PAE
543__PAGE_ALIGNED_DATA 553__PAGE_ALIGNED_DATA
544 /* Page-aligned for the benefit of paravirt? */ 554 /* Page-aligned for the benefit of paravirt? */
545 .align PAGE_SIZE 555 .align PGD_ALIGN
546ENTRY(initial_page_table) 556ENTRY(initial_page_table)
547 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 557 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
548# if KPMDS == 3 558# if KPMDS == 3
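
With PTI, swapper_pg_dir becomes an 8 KiB, 8 KiB-aligned pair: 1024 kernel entries in the first page and 1024 user entries (PTI_USER_PGD_FILL) in the second. A simplified sketch of that layout; switching between the halves by toggling a single address bit mirrors what the kernel_to_user_pgdp() helper does elsewhere in the series, and is an assumption here:

/* Model of the 8 KiB kernel/user PGD pair set up by PGD_ALIGN and
 * PTI_USER_PGD_FILL on 32 bit. Sketch only, not kernel code. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE   4096u
#define PGD_ENTRIES 1024u                       /* .fill 1024,4,0 */

int main(void)
{
        uint32_t *kernel_pgd, *user_pgd;

        /* 8 KiB allocation, 8 KiB aligned, mirroring ".align PGD_ALIGN" */
        if (posix_memalign((void **)&kernel_pgd, 2 * PAGE_SIZE, 2 * PAGE_SIZE))
                return 1;

        /* The user half lives one page above the kernel half. */
        user_pgd = (uint32_t *)((uintptr_t)kernel_pgd ^ PAGE_SIZE);

        printf("kernel PGD at %p, user PGD at %p, %u entries each\n",
               (void *)kernel_pgd, (void *)user_pgd, PGD_ENTRIES);
        free(kernel_pgd);
        return 0;
}
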
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index c9b14020f4dd..733e6ace0fa4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -100,6 +100,102 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
100 return new_ldt; 100 return new_ldt;
101} 101}
102 102
103#ifdef CONFIG_PAGE_TABLE_ISOLATION
104
105static void do_sanity_check(struct mm_struct *mm,
106 bool had_kernel_mapping,
107 bool had_user_mapping)
108{
109 if (mm->context.ldt) {
110 /*
111 * We already had an LDT. The top-level entry should already
112 * have been allocated and synchronized with the usermode
113 * tables.
114 */
115 WARN_ON(!had_kernel_mapping);
116 if (static_cpu_has(X86_FEATURE_PTI))
117 WARN_ON(!had_user_mapping);
118 } else {
119 /*
120 * This is the first time we're mapping an LDT for this process.
121 * Sync the pgd to the usermode tables.
122 */
123 WARN_ON(had_kernel_mapping);
124 if (static_cpu_has(X86_FEATURE_PTI))
125 WARN_ON(had_user_mapping);
126 }
127}
128
129#ifdef CONFIG_X86_PAE
130
131static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
132{
133 p4d_t *p4d;
134 pud_t *pud;
135
136 if (pgd->pgd == 0)
137 return NULL;
138
139 p4d = p4d_offset(pgd, va);
140 if (p4d_none(*p4d))
141 return NULL;
142
143 pud = pud_offset(p4d, va);
144 if (pud_none(*pud))
145 return NULL;
146
147 return pmd_offset(pud, va);
148}
149
150static void map_ldt_struct_to_user(struct mm_struct *mm)
151{
152 pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
153 pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
154 pmd_t *k_pmd, *u_pmd;
155
156 k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
157 u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
158
159 if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
160 set_pmd(u_pmd, *k_pmd);
161}
162
163static void sanity_check_ldt_mapping(struct mm_struct *mm)
164{
165 pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
166 pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
167 bool had_kernel, had_user;
168 pmd_t *k_pmd, *u_pmd;
169
170 k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
171 u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
172 had_kernel = (k_pmd->pmd != 0);
173 had_user = (u_pmd->pmd != 0);
174
175 do_sanity_check(mm, had_kernel, had_user);
176}
177
178#else /* !CONFIG_X86_PAE */
179
180static void map_ldt_struct_to_user(struct mm_struct *mm)
181{
182 pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
183
184 if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
185 set_pgd(kernel_to_user_pgdp(pgd), *pgd);
186}
187
188static void sanity_check_ldt_mapping(struct mm_struct *mm)
189{
190 pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
191 bool had_kernel = (pgd->pgd != 0);
192 bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0);
193
194 do_sanity_check(mm, had_kernel, had_user);
195}
196
197#endif /* CONFIG_X86_PAE */
198
103/* 199/*
104 * If PTI is enabled, this maps the LDT into the kernelmode and 200 * If PTI is enabled, this maps the LDT into the kernelmode and
105 * usermode tables for the given mm. 201 * usermode tables for the given mm.
@@ -115,9 +211,8 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
115static int 211static int
116map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) 212map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
117{ 213{
118#ifdef CONFIG_PAGE_TABLE_ISOLATION
119 bool is_vmalloc, had_top_level_entry;
120 unsigned long va; 214 unsigned long va;
215 bool is_vmalloc;
121 spinlock_t *ptl; 216 spinlock_t *ptl;
122 pgd_t *pgd; 217 pgd_t *pgd;
123 int i; 218 int i;
@@ -131,13 +226,15 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
131 */ 226 */
132 WARN_ON(ldt->slot != -1); 227 WARN_ON(ldt->slot != -1);
133 228
229 /* Check if the current mappings are sane */
230 sanity_check_ldt_mapping(mm);
231
134 /* 232 /*
135 * Did we already have the top level entry allocated? We can't 233 * Did we already have the top level entry allocated? We can't
136 * use pgd_none() for this because it doesn't do anything on 234 * use pgd_none() for this because it doesn't do anything on
137 * 4-level page table kernels. 235 * 4-level page table kernels.
138 */ 236 */
139 pgd = pgd_offset(mm, LDT_BASE_ADDR); 237 pgd = pgd_offset(mm, LDT_BASE_ADDR);
140 had_top_level_entry = (pgd->pgd != 0);
141 238
142 is_vmalloc = is_vmalloc_addr(ldt->entries); 239 is_vmalloc = is_vmalloc_addr(ldt->entries);
143 240
@@ -172,41 +269,31 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
172 pte_unmap_unlock(ptep, ptl); 269 pte_unmap_unlock(ptep, ptl);
173 } 270 }
174 271
175 if (mm->context.ldt) { 272 /* Propagate LDT mapping to the user page-table */
176 /* 273 map_ldt_struct_to_user(mm);
177 * We already had an LDT. The top-level entry should already
178 * have been allocated and synchronized with the usermode
179 * tables.
180 */
181 WARN_ON(!had_top_level_entry);
182 if (static_cpu_has(X86_FEATURE_PTI))
183 WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
184 } else {
185 /*
186 * This is the first time we're mapping an LDT for this process.
187 * Sync the pgd to the usermode tables.
188 */
189 WARN_ON(had_top_level_entry);
190 if (static_cpu_has(X86_FEATURE_PTI)) {
191 WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
192 set_pgd(kernel_to_user_pgdp(pgd), *pgd);
193 }
194 }
195 274
196 va = (unsigned long)ldt_slot_va(slot); 275 va = (unsigned long)ldt_slot_va(slot);
197 flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); 276 flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
198 277
199 ldt->slot = slot; 278 ldt->slot = slot;
200#endif
201 return 0; 279 return 0;
202} 280}
203 281
282#else /* !CONFIG_PAGE_TABLE_ISOLATION */
283
284static int
285map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
286{
287 return 0;
288}
289#endif /* CONFIG_PAGE_TABLE_ISOLATION */
290
204static void free_ldt_pgtables(struct mm_struct *mm) 291static void free_ldt_pgtables(struct mm_struct *mm)
205{ 292{
206#ifdef CONFIG_PAGE_TABLE_ISOLATION 293#ifdef CONFIG_PAGE_TABLE_ISOLATION
207 struct mmu_gather tlb; 294 struct mmu_gather tlb;
208 unsigned long start = LDT_BASE_ADDR; 295 unsigned long start = LDT_BASE_ADDR;
209 unsigned long end = start + (1UL << PGDIR_SHIFT); 296 unsigned long end = LDT_END_ADDR;
210 297
211 if (!static_cpu_has(X86_FEATURE_PTI)) 298 if (!static_cpu_has(X86_FEATURE_PTI))
212 return; 299 return;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d1ab07ec8c9a..5409c2800ab5 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -56,7 +56,7 @@ static void load_segments(void)
56 56
57static void machine_kexec_free_page_tables(struct kimage *image) 57static void machine_kexec_free_page_tables(struct kimage *image)
58{ 58{
59 free_page((unsigned long)image->arch.pgd); 59 free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
60 image->arch.pgd = NULL; 60 image->arch.pgd = NULL;
61#ifdef CONFIG_X86_PAE 61#ifdef CONFIG_X86_PAE
62 free_page((unsigned long)image->arch.pmd0); 62 free_page((unsigned long)image->arch.pmd0);
@@ -72,7 +72,8 @@ static void machine_kexec_free_page_tables(struct kimage *image)
72 72
73static int machine_kexec_alloc_page_tables(struct kimage *image) 73static int machine_kexec_alloc_page_tables(struct kimage *image)
74{ 74{
75 image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); 75 image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
76 PGD_ALLOCATION_ORDER);
76#ifdef CONFIG_X86_PAE 77#ifdef CONFIG_X86_PAE
77 image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); 78 image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
78 image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); 79 image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 30ca2d1a9231..c93fcfdf1673 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -57,14 +57,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
57 */ 57 */
58 .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, 58 .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
59 59
60#ifdef CONFIG_X86_64
61 /* 60 /*
62 * .sp1 is cpu_current_top_of_stack. The init task never 61 * .sp1 is cpu_current_top_of_stack. The init task never
63 * runs user code, but cpu_current_top_of_stack should still 62 * runs user code, but cpu_current_top_of_stack should still
64 * be well defined before the first context switch. 63 * be well defined before the first context switch.
65 */ 64 */
66 .sp1 = TOP_OF_INIT_STACK, 65 .sp1 = TOP_OF_INIT_STACK,
67#endif
68 66
69#ifdef CONFIG_X86_32 67#ifdef CONFIG_X86_32
70 .ss0 = __KERNEL_DS, 68 .ss0 = __KERNEL_DS,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0ae659de21eb..2924fd447e61 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -285,7 +285,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
285 * current_thread_info(). Refresh the SYSENTER configuration in 285 * current_thread_info(). Refresh the SYSENTER configuration in
286 * case prev or next is vm86. 286 * case prev or next is vm86.
287 */ 287 */
288 update_sp0(next_p); 288 update_task_stack(next_p);
289 refresh_sysenter_cs(next); 289 refresh_sysenter_cs(next);
290 this_cpu_write(cpu_current_top_of_stack, 290 this_cpu_write(cpu_current_top_of_stack,
291 (unsigned long)task_stack_page(next_p) + 291 (unsigned long)task_stack_page(next_p) +
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 12bb445fb98d..476e3ddf8890 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -478,7 +478,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
478 this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); 478 this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
479 479
480 /* Reload sp0. */ 480 /* Reload sp0. */
481 update_sp0(next_p); 481 update_task_stack(next_p);
482 482
483 /* 483 /*
484 * Now maybe reload the debug registers and handle I/O bitmaps 484 * Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9d0b5af7db91..1c03e4aa6474 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
149 preempt_disable(); 149 preempt_disable();
150 tsk->thread.sp0 = vm86->saved_sp0; 150 tsk->thread.sp0 = vm86->saved_sp0;
151 tsk->thread.sysenter_cs = __KERNEL_CS; 151 tsk->thread.sysenter_cs = __KERNEL_CS;
152 update_sp0(tsk); 152 update_task_stack(tsk);
153 refresh_sysenter_cs(&tsk->thread); 153 refresh_sysenter_cs(&tsk->thread);
154 vm86->saved_sp0 = 0; 154 vm86->saved_sp0 = 0;
155 preempt_enable(); 155 preempt_enable();
@@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
374 refresh_sysenter_cs(&tsk->thread); 374 refresh_sysenter_cs(&tsk->thread);
375 } 375 }
376 376
377 update_sp0(tsk); 377 update_task_stack(tsk);
378 preempt_enable(); 378 preempt_enable();
379 379
380 if (vm86->flags & VM86_SCREEN_BITMAP) 380 if (vm86->flags & VM86_SCREEN_BITMAP)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 5e1458f609a1..8bde0a419f86 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -55,19 +55,22 @@ jiffies_64 = jiffies;
55 * so we can enable protection checks as well as retain 2MB large page 55 * so we can enable protection checks as well as retain 2MB large page
56 * mappings for kernel text. 56 * mappings for kernel text.
57 */ 57 */
58#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); 58#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
59 59
60#define X64_ALIGN_RODATA_END \ 60#define X86_ALIGN_RODATA_END \
61 . = ALIGN(HPAGE_SIZE); \ 61 . = ALIGN(HPAGE_SIZE); \
62 __end_rodata_hpage_align = .; 62 __end_rodata_hpage_align = .; \
63 __end_rodata_aligned = .;
63 64
64#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); 65#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
65#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); 66#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
66 67
67#else 68#else
68 69
69#define X64_ALIGN_RODATA_BEGIN 70#define X86_ALIGN_RODATA_BEGIN
70#define X64_ALIGN_RODATA_END 71#define X86_ALIGN_RODATA_END \
72 . = ALIGN(PAGE_SIZE); \
73 __end_rodata_aligned = .;
71 74
72#define ALIGN_ENTRY_TEXT_BEGIN 75#define ALIGN_ENTRY_TEXT_BEGIN
73#define ALIGN_ENTRY_TEXT_END 76#define ALIGN_ENTRY_TEXT_END
@@ -141,9 +144,9 @@ SECTIONS
141 144
142 /* .text should occupy whole number of pages */ 145 /* .text should occupy whole number of pages */
143 . = ALIGN(PAGE_SIZE); 146 . = ALIGN(PAGE_SIZE);
144 X64_ALIGN_RODATA_BEGIN 147 X86_ALIGN_RODATA_BEGIN
145 RO_DATA(PAGE_SIZE) 148 RO_DATA(PAGE_SIZE)
146 X64_ALIGN_RODATA_END 149 X86_ALIGN_RODATA_END
147 150
148 /* Data */ 151 /* Data */
149 .data : AT(ADDR(.data) - LOAD_OFFSET) { 152 .data : AT(ADDR(.data) - LOAD_OFFSET) {
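
__end_rodata_aligned gives both 32-bit and 64-bit kernels a single symbol that ends the rodata clone range, rounded up to HPAGE_SIZE (2 MiB) or PAGE_SIZE respectively. The linker-script ALIGN() is ordinary power-of-two round-up, for example:

/* Round-up model of the linker script's ALIGN(). */
#include <stdio.h>

static unsigned long align_up(unsigned long addr, unsigned long align)
{
        return (addr + align - 1) & ~(align - 1);
}

int main(void)
{
        unsigned long end_rodata = 0xc1234567UL;        /* hypothetical */

        printf("ALIGN(4K) -> %#lx\n", align_up(end_rodata, 0x1000UL));
        printf("ALIGN(2M) -> %#lx\n", align_up(end_rodata, 0x200000UL));
        return 0;
}
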
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2f3c9196b834..a12afff146d1 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -111,6 +111,8 @@ static struct addr_marker address_markers[] = {
111 [END_OF_SPACE_NR] = { -1, NULL } 111 [END_OF_SPACE_NR] = { -1, NULL }
112}; 112};
113 113
114#define INIT_PGD ((pgd_t *) &init_top_pgt)
115
114#else /* CONFIG_X86_64 */ 116#else /* CONFIG_X86_64 */
115 117
116enum address_markers_idx { 118enum address_markers_idx {
@@ -121,6 +123,9 @@ enum address_markers_idx {
121#ifdef CONFIG_HIGHMEM 123#ifdef CONFIG_HIGHMEM
122 PKMAP_BASE_NR, 124 PKMAP_BASE_NR,
123#endif 125#endif
126#ifdef CONFIG_MODIFY_LDT_SYSCALL
127 LDT_NR,
128#endif
124 CPU_ENTRY_AREA_NR, 129 CPU_ENTRY_AREA_NR,
125 FIXADDR_START_NR, 130 FIXADDR_START_NR,
126 END_OF_SPACE_NR, 131 END_OF_SPACE_NR,
@@ -134,11 +139,16 @@ static struct addr_marker address_markers[] = {
134#ifdef CONFIG_HIGHMEM 139#ifdef CONFIG_HIGHMEM
135 [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, 140 [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
136#endif 141#endif
142#ifdef CONFIG_MODIFY_LDT_SYSCALL
143 [LDT_NR] = { 0UL, "LDT remap" },
144#endif
137 [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, 145 [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
138 [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, 146 [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
139 [END_OF_SPACE_NR] = { -1, NULL } 147 [END_OF_SPACE_NR] = { -1, NULL }
140}; 148};
141 149
150#define INIT_PGD (swapper_pg_dir)
151
142#endif /* !CONFIG_X86_64 */ 152#endif /* !CONFIG_X86_64 */
143 153
144/* Multipliers for offsets within the PTEs */ 154/* Multipliers for offsets within the PTEs */
@@ -496,11 +506,7 @@ static inline bool is_hypervisor_range(int idx)
496static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 506static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
497 bool checkwx, bool dmesg) 507 bool checkwx, bool dmesg)
498{ 508{
499#ifdef CONFIG_X86_64 509 pgd_t *start = INIT_PGD;
500 pgd_t *start = (pgd_t *) &init_top_pgt;
501#else
502 pgd_t *start = swapper_pg_dir;
503#endif
504 pgprotval_t prot, eff; 510 pgprotval_t prot, eff;
505 int i; 511 int i;
506 struct pg_state st = {}; 512 struct pg_state st = {};
@@ -563,12 +569,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
563} 569}
564EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); 570EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
565 571
566static void ptdump_walk_user_pgd_level_checkwx(void) 572void ptdump_walk_user_pgd_level_checkwx(void)
567{ 573{
568#ifdef CONFIG_PAGE_TABLE_ISOLATION 574#ifdef CONFIG_PAGE_TABLE_ISOLATION
569 pgd_t *pgd = (pgd_t *) &init_top_pgt; 575 pgd_t *pgd = INIT_PGD;
570 576
571 if (!static_cpu_has(X86_FEATURE_PTI)) 577 if (!(__supported_pte_mask & _PAGE_NX) ||
578 !static_cpu_has(X86_FEATURE_PTI))
572 return; 579 return;
573 580
574 pr_info("x86/mm: Checking user space page tables\n"); 581 pr_info("x86/mm: Checking user space page tables\n");
@@ -580,7 +587,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void)
580void ptdump_walk_pgd_level_checkwx(void) 587void ptdump_walk_pgd_level_checkwx(void)
581{ 588{
582 ptdump_walk_pgd_level_core(NULL, NULL, true, false); 589 ptdump_walk_pgd_level_core(NULL, NULL, true, false);
583 ptdump_walk_user_pgd_level_checkwx();
584} 590}
585 591
586static int __init pt_dump_init(void) 592static int __init pt_dump_init(void)
@@ -609,6 +615,9 @@ static int __init pt_dump_init(void)
609# endif 615# endif
610 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; 616 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
611 address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; 617 address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
618# ifdef CONFIG_MODIFY_LDT_SYSCALL
619 address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
620# endif
612#endif 621#endif
613 return 0; 622 return 0;
614} 623}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2aafa6ab6103..db1c042e9853 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -317,8 +317,6 @@ static noinline int vmalloc_fault(unsigned long address)
317 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 317 if (!(address >= VMALLOC_START && address < VMALLOC_END))
318 return -1; 318 return -1;
319 319
320 WARN_ON_ONCE(in_nmi());
321
322 /* 320 /*
323 * Synchronize this task's top level page-table 321 * Synchronize this task's top level page-table
324 * with the 'reference' page table. 322 * with the 'reference' page table.
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cee58a972cb2..74b157ac078d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -773,13 +773,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
773 } 773 }
774} 774}
775 775
776/*
777 * begin/end can be in the direct map or the "high kernel mapping"
778 * used for the kernel image only. free_init_pages() will do the
779 * right thing for either kind of address.
780 */
781void free_kernel_image_pages(void *begin, void *end)
782{
783 unsigned long begin_ul = (unsigned long)begin;
784 unsigned long end_ul = (unsigned long)end;
785 unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
786
787
788 free_init_pages("unused kernel image", begin_ul, end_ul);
789
790 /*
791 * PTI maps some of the kernel into userspace. For performance,
792 * this includes some kernel areas that do not contain secrets.
793 * Those areas might be adjacent to the parts of the kernel image
794 * being freed, which may contain secrets. Remove the "high kernel
795 * image mapping" for these freed areas, ensuring they are not even
796 * potentially vulnerable to Meltdown regardless of the specific
797 * optimizations PTI is currently using.
798 *
799 * The "noalias" prevents unmapping the direct map alias which is
800 * needed to access the freed pages.
801 *
802 * This is only valid for 64bit kernels. 32bit has only one mapping
803 * which can't be treated in this way for obvious reasons.
804 */
805 if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
806 set_memory_np_noalias(begin_ul, len_pages);
807}
808
776void __ref free_initmem(void) 809void __ref free_initmem(void)
777{ 810{
778 e820__reallocate_tables(); 811 e820__reallocate_tables();
779 812
780 free_init_pages("unused kernel", 813 free_kernel_image_pages(&__init_begin, &__init_end);
781 (unsigned long)(&__init_begin),
782 (unsigned long)(&__init_end));
783} 814}
784 815
785#ifdef CONFIG_BLK_DEV_INITRD 816#ifdef CONFIG_BLK_DEV_INITRD
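
free_kernel_image_pages() frees (end - begin) >> PAGE_SHIFT pages and, on 64-bit with PTI, additionally drops the high kernel-image alias while leaving the direct map intact so the freed pages stay usable. A conceptual model of that bookkeeping, with hypothetical addresses and a made-up two-alias structure:

/* Conceptual model only; the two-alias struct and addresses are invented. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct image_range {
        unsigned long begin, end;
        bool himap_present;        /* high kernel-image mapping */
        bool direct_map_present;   /* direct-map alias, used to reuse the pages */
};

static void free_kernel_image_pages_model(struct image_range *r, bool x86_64_pti)
{
        unsigned long len_pages = (r->end - r->begin) >> PAGE_SHIFT;

        printf("freeing %lu pages at [%#lx, %#lx)\n", len_pages, r->begin, r->end);

        if (x86_64_pti)
                r->himap_present = false;       /* set_memory_np_noalias() */
        /* r->direct_map_present stays true either way */
}

int main(void)
{
        struct image_range init_area = { 0xc2000000UL, 0xc2200000UL, true, true };

        free_kernel_image_pages_model(&init_area, true);
        printf("himap present: %d, direct map present: %d\n",
               init_area.himap_present, init_area.direct_map_present);
        return 0;
}
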
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a688617c727e..dd519f372169 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1283,20 +1283,10 @@ void mark_rodata_ro(void)
1283 set_memory_ro(start, (end-start) >> PAGE_SHIFT); 1283 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
1284#endif 1284#endif
1285 1285
1286 free_init_pages("unused kernel", 1286 free_kernel_image_pages((void *)text_end, (void *)rodata_start);
1287 (unsigned long) __va(__pa_symbol(text_end)), 1287 free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
1288 (unsigned long) __va(__pa_symbol(rodata_start)));
1289 free_init_pages("unused kernel",
1290 (unsigned long) __va(__pa_symbol(rodata_end)),
1291 (unsigned long) __va(__pa_symbol(_sdata)));
1292 1288
1293 debug_checkwx(); 1289 debug_checkwx();
1294
1295 /*
1296 * Do this after all of the manipulation of the
1297 * kernel text page tables are complete.
1298 */
1299 pti_clone_kernel_text();
1300} 1290}
1301 1291
1302int kern_addr_valid(unsigned long addr) 1292int kern_addr_valid(unsigned long addr)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3bded76e8d5c..0a74996a1149 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock);
53#define CPA_FLUSHTLB 1 53#define CPA_FLUSHTLB 1
54#define CPA_ARRAY 2 54#define CPA_ARRAY 2
55#define CPA_PAGES_ARRAY 4 55#define CPA_PAGES_ARRAY 4
56#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
56 57
57#ifdef CONFIG_PROC_FS 58#ifdef CONFIG_PROC_FS
58static unsigned long direct_pages_count[PG_LEVEL_NUM]; 59static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1486 1487
1487 /* No alias checking for _NX bit modifications */ 1488 /* No alias checking for _NX bit modifications */
1488 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 1489 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1490 /* Has caller explicitly disabled alias checking? */
1491 if (in_flag & CPA_NO_CHECK_ALIAS)
1492 checkalias = 0;
1489 1493
1490 ret = __change_page_attr_set_clr(&cpa, checkalias); 1494 ret = __change_page_attr_set_clr(&cpa, checkalias);
1491 1495
@@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages)
1772 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); 1776 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1773} 1777}
1774 1778
1779int set_memory_np_noalias(unsigned long addr, int numpages)
1780{
1781 int cpa_flags = CPA_NO_CHECK_ALIAS;
1782
1783 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1784 __pgprot(_PAGE_PRESENT), 0,
1785 cpa_flags, NULL);
1786}
1787
1775int set_memory_4k(unsigned long addr, int numpages) 1788int set_memory_4k(unsigned long addr, int numpages)
1776{ 1789{
1777 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 1790 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
@@ -1784,6 +1797,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages)
1784 __pgprot(_PAGE_GLOBAL), 0); 1797 __pgprot(_PAGE_GLOBAL), 0);
1785} 1798}
1786 1799
1800int set_memory_global(unsigned long addr, int numpages)
1801{
1802 return change_page_attr_set(&addr, numpages,
1803 __pgprot(_PAGE_GLOBAL), 0);
1804}
1805
1787static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) 1806static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
1788{ 1807{
1789 struct cpa_data cpa; 1808 struct cpa_data cpa;
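
The alias-checking decision now has an explicit override: aliases are skipped for pure _PAGE_NX changes as before, and CPA_NO_CHECK_ALIAS lets callers such as set_memory_np_noalias() skip them for other changes too. The same logic pulled out as a tiny predicate (the _PAGE_NX value follows the usual bit-63 definition and is a stand-in here):

/* Stand-alone version of the checkalias decision in this hunk. */
#include <stdbool.h>
#include <stdio.h>

#define _PAGE_NX            (1ULL << 63)        /* stand-in value */
#define _PAGE_PRESENT       (1ULL << 0)         /* stand-in value */
#define CPA_NO_CHECK_ALIAS  8

static bool check_alias(unsigned long long mask_set, unsigned long long mask_clr,
                        int in_flag)
{
        bool checkalias = (mask_set | mask_clr) != _PAGE_NX;

        if (in_flag & CPA_NO_CHECK_ALIAS)
                checkalias = false;
        return checkalias;
}

int main(void)
{
        printf("NX-only change:     %d\n", check_alias(0, _PAGE_NX, 0));
        printf("NP change, noalias: %d\n", check_alias(0, _PAGE_PRESENT, CPA_NO_CHECK_ALIAS));
        printf("NP change, default: %d\n", check_alias(0, _PAGE_PRESENT, 0));
        return 0;
}
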
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0f1683fcb196..3ef095c70ae3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -182,6 +182,14 @@ static void pgd_dtor(pgd_t *pgd)
182 */ 182 */
183#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD 183#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
184 184
185/*
186 * We allocate separate PMDs for the kernel part of the user page-table
187 * when PTI is enabled. We need them to map the per-process LDT into the
188 * user-space page-table.
189 */
190#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \
191 KERNEL_PGD_PTRS : 0)
192
185void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 193void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
186{ 194{
187 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 195 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
@@ -202,14 +210,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
202 210
203/* No need to prepopulate any pagetable entries in non-PAE modes. */ 211/* No need to prepopulate any pagetable entries in non-PAE modes. */
204#define PREALLOCATED_PMDS 0 212#define PREALLOCATED_PMDS 0
205 213#define PREALLOCATED_USER_PMDS 0
206#endif /* CONFIG_X86_PAE */ 214#endif /* CONFIG_X86_PAE */
207 215
208static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) 216static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
209{ 217{
210 int i; 218 int i;
211 219
212 for(i = 0; i < PREALLOCATED_PMDS; i++) 220 for (i = 0; i < count; i++)
213 if (pmds[i]) { 221 if (pmds[i]) {
214 pgtable_pmd_page_dtor(virt_to_page(pmds[i])); 222 pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
215 free_page((unsigned long)pmds[i]); 223 free_page((unsigned long)pmds[i]);
@@ -217,7 +225,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
217 } 225 }
218} 226}
219 227
220static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) 228static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
221{ 229{
222 int i; 230 int i;
223 bool failed = false; 231 bool failed = false;
@@ -226,7 +234,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
226 if (mm == &init_mm) 234 if (mm == &init_mm)
227 gfp &= ~__GFP_ACCOUNT; 235 gfp &= ~__GFP_ACCOUNT;
228 236
229 for(i = 0; i < PREALLOCATED_PMDS; i++) { 237 for (i = 0; i < count; i++) {
230 pmd_t *pmd = (pmd_t *)__get_free_page(gfp); 238 pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
231 if (!pmd) 239 if (!pmd)
232 failed = true; 240 failed = true;
@@ -241,7 +249,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
241 } 249 }
242 250
243 if (failed) { 251 if (failed) {
244 free_pmds(mm, pmds); 252 free_pmds(mm, pmds, count);
245 return -ENOMEM; 253 return -ENOMEM;
246 } 254 }
247 255
@@ -254,23 +262,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
254 * preallocate which never got a corresponding vma will need to be 262 * preallocate which never got a corresponding vma will need to be
255 * freed manually. 263 * freed manually.
256 */ 264 */
265static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
266{
267 pgd_t pgd = *pgdp;
268
269 if (pgd_val(pgd) != 0) {
270 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
271
272 *pgdp = native_make_pgd(0);
273
274 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
275 pmd_free(mm, pmd);
276 mm_dec_nr_pmds(mm);
277 }
278}
279
257static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) 280static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
258{ 281{
259 int i; 282 int i;
260 283
261 for(i = 0; i < PREALLOCATED_PMDS; i++) { 284 for (i = 0; i < PREALLOCATED_PMDS; i++)
262 pgd_t pgd = pgdp[i]; 285 mop_up_one_pmd(mm, &pgdp[i]);
263 286
264 if (pgd_val(pgd) != 0) { 287#ifdef CONFIG_PAGE_TABLE_ISOLATION
265 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
266 288
267 pgdp[i] = native_make_pgd(0); 289 if (!static_cpu_has(X86_FEATURE_PTI))
290 return;
268 291
269 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); 292 pgdp = kernel_to_user_pgdp(pgdp);
270 pmd_free(mm, pmd); 293
271 mm_dec_nr_pmds(mm); 294 for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
272 } 295 mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
273 } 296#endif
274} 297}
275 298
276static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) 299static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
@@ -296,6 +319,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
296 } 319 }
297} 320}
298 321
322#ifdef CONFIG_PAGE_TABLE_ISOLATION
323static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
324 pgd_t *k_pgd, pmd_t *pmds[])
325{
326 pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
327 pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
328 p4d_t *u_p4d;
329 pud_t *u_pud;
330 int i;
331
332 u_p4d = p4d_offset(u_pgd, 0);
333 u_pud = pud_offset(u_p4d, 0);
334
335 s_pgd += KERNEL_PGD_BOUNDARY;
336 u_pud += KERNEL_PGD_BOUNDARY;
337
338 for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
339 pmd_t *pmd = pmds[i];
340
341 memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
342 sizeof(pmd_t) * PTRS_PER_PMD);
343
344 pud_populate(mm, u_pud, pmd);
345 }
346
347}
348#else
349static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
350 pgd_t *k_pgd, pmd_t *pmds[])
351{
352}
353#endif
299/* 354/*
300 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also 355 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
301 * assumes that pgd should be in one page. 356 * assumes that pgd should be in one page.
@@ -340,7 +395,8 @@ static inline pgd_t *_pgd_alloc(void)
340 * We allocate one page for pgd. 395 * We allocate one page for pgd.
341 */ 396 */
342 if (!SHARED_KERNEL_PMD) 397 if (!SHARED_KERNEL_PMD)
343 return (pgd_t *)__get_free_page(PGALLOC_GFP); 398 return (pgd_t *)__get_free_pages(PGALLOC_GFP,
399 PGD_ALLOCATION_ORDER);
344 400
345 /* 401 /*
346 * Now PAE kernel is not running as a Xen domain. We can allocate 402 * Now PAE kernel is not running as a Xen domain. We can allocate
@@ -352,7 +408,7 @@ static inline pgd_t *_pgd_alloc(void)
352static inline void _pgd_free(pgd_t *pgd) 408static inline void _pgd_free(pgd_t *pgd)
353{ 409{
354 if (!SHARED_KERNEL_PMD) 410 if (!SHARED_KERNEL_PMD)
355 free_page((unsigned long)pgd); 411 free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
356 else 412 else
357 kmem_cache_free(pgd_cache, pgd); 413 kmem_cache_free(pgd_cache, pgd);
358} 414}
@@ -372,6 +428,7 @@ static inline void _pgd_free(pgd_t *pgd)
372pgd_t *pgd_alloc(struct mm_struct *mm) 428pgd_t *pgd_alloc(struct mm_struct *mm)
373{ 429{
374 pgd_t *pgd; 430 pgd_t *pgd;
431 pmd_t *u_pmds[PREALLOCATED_USER_PMDS];
375 pmd_t *pmds[PREALLOCATED_PMDS]; 432 pmd_t *pmds[PREALLOCATED_PMDS];
376 433
377 pgd = _pgd_alloc(); 434 pgd = _pgd_alloc();
@@ -381,12 +438,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
381 438
382 mm->pgd = pgd; 439 mm->pgd = pgd;
383 440
384 if (preallocate_pmds(mm, pmds) != 0) 441 if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
385 goto out_free_pgd; 442 goto out_free_pgd;
386 443
387 if (paravirt_pgd_alloc(mm) != 0) 444 if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
388 goto out_free_pmds; 445 goto out_free_pmds;
389 446
447 if (paravirt_pgd_alloc(mm) != 0)
448 goto out_free_user_pmds;
449
390 /* 450 /*
391 * Make sure that pre-populating the pmds is atomic with 451 * Make sure that pre-populating the pmds is atomic with
392 * respect to anything walking the pgd_list, so that they 452 * respect to anything walking the pgd_list, so that they
@@ -396,13 +456,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
396 456
397 pgd_ctor(mm, pgd); 457 pgd_ctor(mm, pgd);
398 pgd_prepopulate_pmd(mm, pgd, pmds); 458 pgd_prepopulate_pmd(mm, pgd, pmds);
459 pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
399 460
400 spin_unlock(&pgd_lock); 461 spin_unlock(&pgd_lock);
401 462
402 return pgd; 463 return pgd;
403 464
465out_free_user_pmds:
466 free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
404out_free_pmds: 467out_free_pmds:
405 free_pmds(mm, pmds); 468 free_pmds(mm, pmds, PREALLOCATED_PMDS);
406out_free_pgd: 469out_free_pgd:
407 _pgd_free(pgd); 470 _pgd_free(pgd);
408out: 471out:
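
pgd_alloc() now preallocates two counted PMD sets, the usual kernel ones and, under PTI, the user-page-table ones, and unwinds them in reverse order on failure. A userspace model of that allocate/unwind shape, with malloc() standing in for page allocation and arbitrary counts:

/* Error-unwinding shape of the new pgd_alloc(); not kernel code. */
#include <stdio.h>
#include <stdlib.h>

#define PREALLOCATED_PMDS       4
#define PREALLOCATED_USER_PMDS  4

static int preallocate(void *bufs[], int count)
{
        for (int i = 0; i < count; i++) {
                bufs[i] = calloc(1, 4096);
                if (!bufs[i]) {
                        while (i--)             /* free what we already got */
                                free(bufs[i]);
                        return -1;
                }
        }
        return 0;
}

static void free_bufs(void *bufs[], int count)
{
        for (int i = 0; i < count; i++)
                free(bufs[i]);
}

int main(void)
{
        void *pmds[PREALLOCATED_PMDS];
        void *u_pmds[PREALLOCATED_USER_PMDS];

        if (preallocate(pmds, PREALLOCATED_PMDS))
                goto out;
        if (preallocate(u_pmds, PREALLOCATED_USER_PMDS))
                goto out_free_pmds;

        puts("both PMD sets preallocated");
        free_bufs(u_pmds, PREALLOCATED_USER_PMDS);
        free_bufs(pmds, PREALLOCATED_PMDS);
        return 0;

out_free_pmds:
        free_bufs(pmds, PREALLOCATED_PMDS);
out:
        return 1;
}
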
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e705878..d58b4aba9510 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -54,6 +54,16 @@
54#define __GFP_NOTRACK 0 54#define __GFP_NOTRACK 0
55#endif 55#endif
56 56
57/*
58 * Define the page-table levels we clone for user-space on 32
59 * and 64 bit.
60 */
61#ifdef CONFIG_X86_64
62#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD
63#else
64#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE
65#endif
66
57static void __init pti_print_if_insecure(const char *reason) 67static void __init pti_print_if_insecure(const char *reason)
58{ 68{
59 if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) 69 if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
@@ -117,7 +127,7 @@ enable:
117 setup_force_cpu_cap(X86_FEATURE_PTI); 127 setup_force_cpu_cap(X86_FEATURE_PTI);
118} 128}
119 129
120pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) 130pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
121{ 131{
122 /* 132 /*
123 * Changes to the high (kernel) portion of the kernelmode page 133 * Changes to the high (kernel) portion of the kernelmode page
@@ -176,7 +186,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
176 186
177 if (pgd_none(*pgd)) { 187 if (pgd_none(*pgd)) {
178 unsigned long new_p4d_page = __get_free_page(gfp); 188 unsigned long new_p4d_page = __get_free_page(gfp);
179 if (!new_p4d_page) 189 if (WARN_ON_ONCE(!new_p4d_page))
180 return NULL; 190 return NULL;
181 191
182 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); 192 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,13 +205,17 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
195static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) 205static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
196{ 206{
197 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 207 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
198 p4d_t *p4d = pti_user_pagetable_walk_p4d(address); 208 p4d_t *p4d;
199 pud_t *pud; 209 pud_t *pud;
200 210
211 p4d = pti_user_pagetable_walk_p4d(address);
212 if (!p4d)
213 return NULL;
214
201 BUILD_BUG_ON(p4d_large(*p4d) != 0); 215 BUILD_BUG_ON(p4d_large(*p4d) != 0);
202 if (p4d_none(*p4d)) { 216 if (p4d_none(*p4d)) {
203 unsigned long new_pud_page = __get_free_page(gfp); 217 unsigned long new_pud_page = __get_free_page(gfp);
204 if (!new_pud_page) 218 if (WARN_ON_ONCE(!new_pud_page))
205 return NULL; 219 return NULL;
206 220
207 set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); 221 set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -215,7 +229,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
215 } 229 }
216 if (pud_none(*pud)) { 230 if (pud_none(*pud)) {
217 unsigned long new_pmd_page = __get_free_page(gfp); 231 unsigned long new_pmd_page = __get_free_page(gfp);
218 if (!new_pmd_page) 232 if (WARN_ON_ONCE(!new_pmd_page))
219 return NULL; 233 return NULL;
220 234
221 set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); 235 set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -224,7 +238,6 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
224 return pmd_offset(pud, address); 238 return pmd_offset(pud, address);
225} 239}
226 240
227#ifdef CONFIG_X86_VSYSCALL_EMULATION
228/* 241/*
229 * Walk the shadow copy of the page tables (optionally) trying to allocate 242 * Walk the shadow copy of the page tables (optionally) trying to allocate
230 * page table pages on the way down. Does not support large pages. 243 * page table pages on the way down. Does not support large pages.
@@ -237,9 +250,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
237static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) 250static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
238{ 251{
239 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 252 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
240 pmd_t *pmd = pti_user_pagetable_walk_pmd(address); 253 pmd_t *pmd;
241 pte_t *pte; 254 pte_t *pte;
242 255
256 pmd = pti_user_pagetable_walk_pmd(address);
257 if (!pmd)
258 return NULL;
259
243 /* We can't do anything sensible if we hit a large mapping. */ 260 /* We can't do anything sensible if we hit a large mapping. */
244 if (pmd_large(*pmd)) { 261 if (pmd_large(*pmd)) {
245 WARN_ON(1); 262 WARN_ON(1);
@@ -262,6 +279,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
262 return pte; 279 return pte;
263} 280}
264 281
282#ifdef CONFIG_X86_VSYSCALL_EMULATION
265static void __init pti_setup_vsyscall(void) 283static void __init pti_setup_vsyscall(void)
266{ 284{
267 pte_t *pte, *target_pte; 285 pte_t *pte, *target_pte;
@@ -282,8 +300,14 @@ static void __init pti_setup_vsyscall(void)
282static void __init pti_setup_vsyscall(void) { } 300static void __init pti_setup_vsyscall(void) { }
283#endif 301#endif
284 302
303enum pti_clone_level {
304 PTI_CLONE_PMD,
305 PTI_CLONE_PTE,
306};
307
285static void 308static void
286pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) 309pti_clone_pgtable(unsigned long start, unsigned long end,
310 enum pti_clone_level level)
287{ 311{
288 unsigned long addr; 312 unsigned long addr;
289 313
@@ -291,59 +315,105 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
291 * Clone the populated PMDs which cover start to end. These PMD areas 315 * Clone the populated PMDs which cover start to end. These PMD areas
292 * can have holes. 316 * can have holes.
293 */ 317 */
294 for (addr = start; addr < end; addr += PMD_SIZE) { 318 for (addr = start; addr < end;) {
319 pte_t *pte, *target_pte;
295 pmd_t *pmd, *target_pmd; 320 pmd_t *pmd, *target_pmd;
296 pgd_t *pgd; 321 pgd_t *pgd;
297 p4d_t *p4d; 322 p4d_t *p4d;
298 pud_t *pud; 323 pud_t *pud;
299 324
325 /* Overflow check */
326 if (addr < start)
327 break;
328
300 pgd = pgd_offset_k(addr); 329 pgd = pgd_offset_k(addr);
301 if (WARN_ON(pgd_none(*pgd))) 330 if (WARN_ON(pgd_none(*pgd)))
302 return; 331 return;
303 p4d = p4d_offset(pgd, addr); 332 p4d = p4d_offset(pgd, addr);
304 if (WARN_ON(p4d_none(*p4d))) 333 if (WARN_ON(p4d_none(*p4d)))
305 return; 334 return;
335
306 pud = pud_offset(p4d, addr); 336 pud = pud_offset(p4d, addr);
307 if (pud_none(*pud)) 337 if (pud_none(*pud)) {
338 addr += PUD_SIZE;
308 continue; 339 continue;
340 }
341
309 pmd = pmd_offset(pud, addr); 342 pmd = pmd_offset(pud, addr);
310 if (pmd_none(*pmd)) 343 if (pmd_none(*pmd)) {
344 addr += PMD_SIZE;
311 continue; 345 continue;
346 }
312 347
313 target_pmd = pti_user_pagetable_walk_pmd(addr); 348 if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
314 if (WARN_ON(!target_pmd)) 349 target_pmd = pti_user_pagetable_walk_pmd(addr);
315 return; 350 if (WARN_ON(!target_pmd))
316 351 return;
317 /* 352
318 * Only clone present PMDs. This ensures only setting 353 /*
319 * _PAGE_GLOBAL on present PMDs. This should only be 354 * Only clone present PMDs. This ensures only setting
320 * called on well-known addresses anyway, so a non- 355 * _PAGE_GLOBAL on present PMDs. This should only be
321 * present PMD would be a surprise. 356 * called on well-known addresses anyway, so a non-
322 */ 357 * present PMD would be a surprise.
323 if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) 358 */
324 return; 359 if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
325 360 return;
326 /* 361
327 * Setting 'target_pmd' below creates a mapping in both 362 /*
328 * the user and kernel page tables. It is effectively 363 * Setting 'target_pmd' below creates a mapping in both
329 * global, so set it as global in both copies. Note: 364 * the user and kernel page tables. It is effectively
330 * the X86_FEATURE_PGE check is not _required_ because 365 * global, so set it as global in both copies. Note:
331 * the CPU ignores _PAGE_GLOBAL when PGE is not 366 * the X86_FEATURE_PGE check is not _required_ because
332 * supported. The check keeps consistentency with 367 * the CPU ignores _PAGE_GLOBAL when PGE is not
333 * code that only set this bit when supported. 368 * supported. The check keeps consistentency with
334 */ 369 * code that only set this bit when supported.
335 if (boot_cpu_has(X86_FEATURE_PGE)) 370 */
336 *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); 371 if (boot_cpu_has(X86_FEATURE_PGE))
337 372 *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);
338 /* 373
339 * Copy the PMD. That is, the kernelmode and usermode 374 /*
340 * tables will share the last-level page tables of this 375 * Copy the PMD. That is, the kernelmode and usermode
341 * address range 376 * tables will share the last-level page tables of this
342 */ 377 * address range
343 *target_pmd = pmd_clear_flags(*pmd, clear); 378 */
379 *target_pmd = *pmd;
380
381 addr += PMD_SIZE;
382
383 } else if (level == PTI_CLONE_PTE) {
384
385 /* Walk the page-table down to the pte level */
386 pte = pte_offset_kernel(pmd, addr);
387 if (pte_none(*pte)) {
388 addr += PAGE_SIZE;
389 continue;
390 }
391
392 /* Only clone present PTEs */
393 if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT)))
394 return;
395
396 /* Allocate PTE in the user page-table */
397 target_pte = pti_user_pagetable_walk_pte(addr);
398 if (WARN_ON(!target_pte))
399 return;
400
401 /* Set GLOBAL bit in both PTEs */
402 if (boot_cpu_has(X86_FEATURE_PGE))
403 *pte = pte_set_flags(*pte, _PAGE_GLOBAL);
404
405 /* Clone the PTE */
406 *target_pte = *pte;
407
408 addr += PAGE_SIZE;
409
410 } else {
411 BUG();
412 }
344 } 413 }
345} 414}
346 415
416#ifdef CONFIG_X86_64
347/* 417/*
348 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a 418 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
349 * next-level entry on 5-level systems). 419 * next-level entry on 5-level systems).
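
pti_clone_pgtable() now advances at whatever granularity it finds: a whole PUD when nothing is mapped there, a PMD when a large page (or the PTI_CLONE_PMD level) is being cloned, and a single page in the 32-bit PTI_CLONE_PTE case. A skeleton of that stepping logic with fake presence predicates, just to show how the address advances:

/* Skeleton of the variable-granularity walk; predicates are fake. */
#include <stdio.h>

#define PAGE_SIZE 0x1000UL
#define PMD_SIZE  0x200000UL
#define PUD_SIZE  0x40000000UL

enum pti_clone_level { PTI_CLONE_PMD, PTI_CLONE_PTE };

static int pud_present_at(unsigned long addr) { return addr >= PUD_SIZE; }
static int pmd_large_at(unsigned long addr)   { return (addr >> 21) & 1; }

static unsigned long skipped_puds, cloned_pmds, cloned_ptes;

static void clone_range(unsigned long start, unsigned long end,
                        enum pti_clone_level level)
{
        for (unsigned long addr = start; addr < end; ) {
                if (addr < start)               /* overflow check, as in the patch */
                        break;
                if (!pud_present_at(addr)) {
                        addr += PUD_SIZE;       /* nothing mapped here */
                        skipped_puds++;
                        continue;
                }
                if (pmd_large_at(addr) || level == PTI_CLONE_PMD) {
                        cloned_pmds++;          /* share the whole last-level table */
                        addr += PMD_SIZE;
                } else {
                        cloned_ptes++;          /* clone one 4 KiB mapping */
                        addr += PAGE_SIZE;
                }
        }
}

int main(void)
{
        clone_range(0, PUD_SIZE + 2 * PMD_SIZE, PTI_CLONE_PTE);
        printf("skipped %lu PUDs, cloned %lu PMDs, cloned %lu PTEs\n",
               skipped_puds, cloned_pmds, cloned_ptes);
        return 0;
}
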
@@ -354,6 +424,9 @@ static void __init pti_clone_p4d(unsigned long addr)
354 pgd_t *kernel_pgd; 424 pgd_t *kernel_pgd;
355 425
356 user_p4d = pti_user_pagetable_walk_p4d(addr); 426 user_p4d = pti_user_pagetable_walk_p4d(addr);
427 if (!user_p4d)
428 return;
429
357 kernel_pgd = pgd_offset_k(addr); 430 kernel_pgd = pgd_offset_k(addr);
358 kernel_p4d = p4d_offset(kernel_pgd, addr); 431 kernel_p4d = p4d_offset(kernel_pgd, addr);
359 *user_p4d = *kernel_p4d; 432 *user_p4d = *kernel_p4d;
@@ -367,6 +440,25 @@ static void __init pti_clone_user_shared(void)
367 pti_clone_p4d(CPU_ENTRY_AREA_BASE); 440 pti_clone_p4d(CPU_ENTRY_AREA_BASE);
368} 441}
369 442
443#else /* CONFIG_X86_64 */
444
445/*
446 * On 32 bit PAE systems with 1GB of Kernel address space there is only
447 * one pgd/p4d for the whole kernel. Cloning that would map the whole
448 * address space into the user page-tables, making PTI useless. So clone
449 * the page-table on the PMD level to prevent that.
450 */
451static void __init pti_clone_user_shared(void)
452{
453 unsigned long start, end;
454
455 start = CPU_ENTRY_AREA_BASE;
456 end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
457
458 pti_clone_pgtable(start, end, PTI_CLONE_PMD);
459}
460#endif /* CONFIG_X86_64 */
461
370/* 462/*
371 * Clone the ESPFIX P4D into the user space visible page table 463 * Clone the ESPFIX P4D into the user space visible page table
372 */ 464 */
@@ -380,11 +472,11 @@ static void __init pti_setup_espfix64(void)
380/* 472/*
381 * Clone the populated PMDs of the entry and irqentry text and force it RO. 473 * Clone the populated PMDs of the entry and irqentry text and force it RO.
382 */ 474 */
383static void __init pti_clone_entry_text(void) 475static void pti_clone_entry_text(void)
384{ 476{
385 pti_clone_pmds((unsigned long) __entry_text_start, 477 pti_clone_pgtable((unsigned long) __entry_text_start,
386 (unsigned long) __irqentry_text_end, 478 (unsigned long) __irqentry_text_end,
387 _PAGE_RW); 479 PTI_CLONE_PMD);
388} 480}
389 481
390/* 482/*
@@ -435,10 +527,17 @@ static inline bool pti_kernel_image_global_ok(void)
435} 527}
436 528
437/* 529/*
530 * This is the only user for these and it is not arch-generic
531 * like the other set_memory.h functions. Just extern them.
532 */
533extern int set_memory_nonglobal(unsigned long addr, int numpages);
534extern int set_memory_global(unsigned long addr, int numpages);
535
536/*
438 * For some configurations, map all of kernel text into the user page 537 * For some configurations, map all of kernel text into the user page
439 * tables. This reduces TLB misses, especially on non-PCID systems. 538 * tables. This reduces TLB misses, especially on non-PCID systems.
440 */ 539 */
441void pti_clone_kernel_text(void) 540static void pti_clone_kernel_text(void)
442{ 541{
443 /* 542 /*
444 * rodata is part of the kernel image and is normally 543 * rodata is part of the kernel image and is normally
@@ -446,7 +545,8 @@ void pti_clone_kernel_text(void)
446 * clone the areas past rodata, they might contain secrets. 545 * clone the areas past rodata, they might contain secrets.
447 */ 546 */
448 unsigned long start = PFN_ALIGN(_text); 547 unsigned long start = PFN_ALIGN(_text);
449 unsigned long end = (unsigned long)__end_rodata_hpage_align; 548 unsigned long end_clone = (unsigned long)__end_rodata_aligned;
549 unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
450 550
451 if (!pti_kernel_image_global_ok()) 551 if (!pti_kernel_image_global_ok())
452 return; 552 return;
@@ -458,14 +558,18 @@ void pti_clone_kernel_text(void)
458 * pti_set_kernel_image_nonglobal() did to clear the 558 * pti_set_kernel_image_nonglobal() did to clear the
459 * global bit. 559 * global bit.
460 */ 560 */
461 pti_clone_pmds(start, end, _PAGE_RW); 561 pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
562
563 /*
564 * pti_clone_pgtable() will set the global bit in any PMDs
565 * that it clones, but we also need to get any PTEs in
566 * the last level for areas that are not huge-page-aligned.
567 */
568
569 /* Set the global bit for normal non-__init kernel text: */
570 set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
462} 571}
463 572
464/*
465 * This is the only user for it and it is not arch-generic like
466 * the other set_memory.h functions. Just extern it.
467 */
468extern int set_memory_nonglobal(unsigned long addr, int numpages);
469void pti_set_kernel_image_nonglobal(void) 573void pti_set_kernel_image_nonglobal(void)
470{ 574{
471 /* 575 /*
@@ -477,9 +581,11 @@ void pti_set_kernel_image_nonglobal(void)
477 unsigned long start = PFN_ALIGN(_text); 581 unsigned long start = PFN_ALIGN(_text);
478 unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); 582 unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
479 583
-        if (pti_kernel_image_global_ok())
-                return;
-
+        /*
+         * This clears _PAGE_GLOBAL from the entire kernel image.
+         * pti_clone_kernel_text() may put _PAGE_GLOBAL back for
+         * areas that are mapped to userspace.
+         */
         set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
 }
 
@@ -493,6 +599,28 @@ void __init pti_init(void)
 
         pr_info("enabled\n");
 
+#ifdef CONFIG_X86_32
+        /*
+         * We check for X86_FEATURE_PCID here. But the init-code will
+         * clear the feature flag on 32 bit because the feature is not
+         * supported on 32 bit anyway. To print the warning we need to
+         * check with cpuid directly again.
+         */
+        if (cpuid_ecx(0x1) & BIT(17)) {
+                /* Use printk to work around pr_fmt() */
+                printk(KERN_WARNING "\n");
+                printk(KERN_WARNING "************************************************************\n");
+                printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!  **\n");
+                printk(KERN_WARNING "**                                                        **\n");
+                printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n");
+                printk(KERN_WARNING "** Your performance will increase dramatically if you     **\n");
+                printk(KERN_WARNING "** switch to a 64-bit kernel!                             **\n");
+                printk(KERN_WARNING "**                                                        **\n");
+                printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!  **\n");
+                printk(KERN_WARNING "************************************************************\n");
+        }
+#endif
+
         pti_clone_user_shared();
 
         /* Undo all global bits from the init pagetables in head_64.S: */
@@ -502,3 +630,22 @@ void __init pti_init(void)
         pti_setup_espfix64();
         pti_setup_vsyscall();
 }
+
+/*
+ * Finalize the kernel mappings in the userspace page-table. Some of the
+ * mappings for the kernel image might have changed since pti_init()
+ * cloned them. This is because parts of the kernel image have been
+ * mapped RO and/or NX. These changes need to be cloned again to the
+ * userspace page-table.
+ */
+void pti_finalize(void)
+{
+        /*
+         * We need to clone everything (again) that maps parts of the
+         * kernel image.
+         */
+        pti_clone_entry_text();
+        pti_clone_kernel_text();
+
+        debug_checkwx_user();
+}
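
The PTI_CLONE_PMD and PTI_LEVEL_KERNEL_IMAGE arguments used above select the granularity at which pti_clone_pgtable() walks and clones the kernel page-table. As a minimal sketch only (the enum and macro are introduced in earlier hunks of this file and are assumed to look roughly like this), the selection boils down to:

enum pti_clone_level {
        PTI_CLONE_PMD,          /* clone 2MB-aligned regions at PMD granularity */
        PTI_CLONE_PTE,          /* walk down to 4kB PTEs for unaligned regions  */
};

/*
 * Assumption for illustration: on 64 bit the kernel image sections are
 * 2MB-aligned and can be cloned on PMD level; on 32 bit they are only
 * page-aligned, so a PMD-level clone would expose adjacent kernel data
 * and the image has to be cloned on PTE level instead.
 */
#ifdef CONFIG_X86_64
# define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD
#else
# define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE
#endif
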
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 220e97841e49..3a6c8ebc8032 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -67,6 +67,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
67 "__tracedata_(start|end)|" 67 "__tracedata_(start|end)|"
68 "__(start|stop)_notes|" 68 "__(start|stop)_notes|"
69 "__end_rodata|" 69 "__end_rodata|"
70 "__end_rodata_aligned|"
70 "__initramfs_start|" 71 "__initramfs_start|"
71 "(jiffies|jiffies_64)|" 72 "(jiffies|jiffies_64)|"
72#if ELF_BITS == 64 73#if ELF_BITS == 64
diff --git a/include/linux/pti.h b/include/linux/pti.h
index 0174883a935a..1a941efcaa62 100644
--- a/include/linux/pti.h
+++ b/include/linux/pti.h
@@ -6,6 +6,7 @@
 #include <asm/pti.h>
 #else
 static inline void pti_init(void) { }
+static inline void pti_finalize(void) { }
 #endif
 
 #endif
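
The stub added here mirrors the arch-side declaration: when CONFIG_PAGE_TABLE_ISOLATION is enabled, <asm/pti.h> is expected to carry the real prototype. A hedged sketch of that counterpart (illustrative only; the exact header contents are not shown in this diff):

/* arch/x86/include/asm/pti.h - sketch, not part of this hunk */
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern void pti_init(void);
extern void pti_finalize(void);
extern void pti_check_boottime_disable(void);
#else
static inline void pti_check_boottime_disable(void) { }
#endif
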
diff --git a/init/main.c b/init/main.c
index 5e13c544bbf4..f86b64c35d38 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1065,6 +1065,13 @@ static int __ref kernel_init(void *unused)
         jump_label_invalidate_initmem();
         free_initmem();
         mark_readonly();
+
+        /*
+         * Kernel mappings are now finalized - update the userspace page-table
+         * to finalize PTI.
+         */
+        pti_finalize();
+
         system_state = SYSTEM_RUNNING;
         numa_default_policy();
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a790ef4be74e..3222193c46c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6939,9 +6939,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
         start = (void *)PAGE_ALIGN((unsigned long)start);
         end = (void *)((unsigned long)end & PAGE_MASK);
         for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+                struct page *page = virt_to_page(pos);
+                void *direct_map_addr;
+
+                /*
+                 * 'direct_map_addr' might be different from 'pos'
+                 * because some architectures' virt_to_page()
+                 * work with aliases. Getting the direct map
+                 * address ensures that we get a _writeable_
+                 * alias for the memset().
+                 */
+                direct_map_addr = page_address(page);
                 if ((unsigned int)poison <= 0xFF)
-                        memset(pos, poison, PAGE_SIZE);
-                free_reserved_page(virt_to_page(pos));
+                        memset(direct_map_addr, poison, PAGE_SIZE);
+
+                free_reserved_page(page);
         }
 
         if (pages && s)
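
For context, callers typically hand free_reserved_area() kernel-image (alias) addresses rather than direct-map addresses; the generic free_initmem_default() helper is roughly of this shape (sketch for illustration, not part of this patch):

static inline unsigned long sketch_free_initmem_default(int poison)
{
        extern char __init_begin[], __init_end[];

        /* __init_begin/__init_end are kernel-image aliases, not direct-map addresses */
        return free_reserved_area(&__init_begin, &__init_end,
                                  poison, "unused kernel");
}

Because the image alias of such pages may already be mapped read-only, the poisoning memset above has to go through page_address(virt_to_page(pos)), which yields the always-writeable direct-map address.
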
diff --git a/security/Kconfig b/security/Kconfig
index c4302067a3ad..afa91c6f06bb 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -57,7 +57,7 @@ config SECURITY_NETWORK
 config PAGE_TABLE_ISOLATION
         bool "Remove the kernel mapping in user mode"
         default y
-        depends on X86_64 && !UML
+        depends on X86 && !UML
         help
           This feature reduces the number of hardware side channels by
           ensuring that the majority of kernel addresses are not mapped