-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 9
-rw-r--r--  Documentation/x86/x86_64/5level-paging.txt | 9
-rw-r--r--  arch/x86/Kconfig | 17
-rw-r--r--  arch/x86/boot/compressed/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 168
-rw-r--r--  arch/x86/boot/compressed/kaslr.c | 14
-rw-r--r--  arch/x86/boot/compressed/kaslr_64.c (renamed from arch/x86/boot/compressed/pagetable.c) | 14
-rw-r--r--  arch/x86/boot/compressed/mem_encrypt.S | 17
-rw-r--r--  arch/x86/boot/compressed/misc.c | 22
-rw-r--r--  arch/x86/boot/compressed/misc.h | 7
-rw-r--r--  arch/x86/boot/compressed/pgtable.h | 20
-rw-r--r--  arch/x86/boot/compressed/pgtable_64.c | 148
-rw-r--r--  arch/x86/entry/entry_64.S | 5
-rw-r--r--  arch/x86/include/asm/acpi.h | 11
-rw-r--r--  arch/x86/include/asm/intel_pconfig.h | 65
-rw-r--r--  arch/x86/include/asm/kaslr.h | 4
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h | 1
-rw-r--r--  arch/x86/include/asm/page_64.h | 4
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 20
-rw-r--r--  arch/x86/include/asm/paravirt.h | 21
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable-3level_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable.h | 11
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable_32_types.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 70
-rw-r--r--  arch/x86/include/asm/required-features.h | 8
-rw-r--r--  arch/x86/include/asm/sparsemem.h | 9
-rw-r--r--  arch/x86/include/asm/x86_init.h | 11
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 22
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 87
-rw-r--r--  arch/x86/kernel/cpu/intel_pconfig.c | 82
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 18
-rw-r--r--  arch/x86/kernel/e820.c | 18
-rw-r--r--  arch/x86/kernel/head64.c | 81
-rw-r--r--  arch/x86/kernel/head_64.S | 22
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 1
-rw-r--r--  arch/x86/kernel/setup.c | 5
-rw-r--r--  arch/x86/kernel/x86_init.c | 15
-rw-r--r--  arch/x86/mm/Makefile | 15
-rw-r--r--  arch/x86/mm/debug_pagetables.c | 32
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 125
-rw-r--r--  arch/x86/mm/fault.c | 60
-rw-r--r--  arch/x86/mm/ident_map.c | 2
-rw-r--r--  arch/x86/mm/init_64.c | 32
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 20
-rw-r--r--  arch/x86/mm/kaslr.c | 29
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 578
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c | 564
-rw-r--r--  arch/x86/mm/numa_32.c | 11
-rw-r--r--  arch/x86/mm/tlb.c | 4
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 10
-rw-r--r--  arch/x86/platform/intel-mid/intel-mid.c | 6
-rw-r--r--  arch/x86/power/hibernate_64.c | 6
-rw-r--r--  arch/x86/xen/Kconfig | 5
-rw-r--r--  arch/x86/xen/enlighten_pvh.c | 14
-rw-r--r--  arch/x86/xen/mmu_pv.c | 21
-rw-r--r--  drivers/acpi/osl.c | 5
-rw-r--r--  include/asm-generic/5level-fixup.h | 1
-rw-r--r--  include/asm-generic/pgtable-nop4d.h | 9
-rw-r--r--  include/linux/acpi.h | 7
-rw-r--r--  include/linux/kasan.h | 2
-rw-r--r--  include/linux/mmzone.h | 5
-rw-r--r--  mm/kasan/kasan_init.c | 2
-rw-r--r--  mm/sparse.c | 22
-rw-r--r--  mm/zsmalloc.c | 13
68 files changed, 1657 insertions(+), 1016 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 50b9837e985b..b37c1c30c16f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2248,6 +2248,15 @@
 			The memory region may be marked as e820 type 12 (0xc)
 			and is NVDIMM or ADR memory.
 
+	memmap=<size>%<offset>-<oldtype>+<newtype>
+			[KNL,ACPI] Convert memory within the specified region
+			from <oldtype> to <newtype>. If "-<oldtype>" is left
+			out, the whole region will be marked as <newtype>,
+			even if previously unavailable. If "+<newtype>" is left
+			out, matching memory will be removed. Types are
+			specified as e820 types, e.g., 1 = RAM, 2 = reserved,
+			3 = ACPI, 12 = PRAM.
+
 	memory_corruption_check=0/1 [X86]
 			Some BIOSes seem to corrupt the first 64k of
 			memory when doing things like suspend/resume.
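
(Editor's illustration, not part of the patch: with the syntax documented above, a hypothetical boot parameter such as

	memmap=100M%0x10000000-1+12

would ask the kernel to convert 100M of e820 type 1 (RAM) starting at offset 0x10000000 into type 12 (PRAM). The exact size and offset here are made up for the example.)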
diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
index 087251a0d99c..2432a5ef86d9 100644
--- a/Documentation/x86/x86_64/5level-paging.txt
+++ b/Documentation/x86/x86_64/5level-paging.txt
@@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt
 
 CONFIG_X86_5LEVEL=y enables the feature.
 
-So far, a kernel compiled with the option enabled will be able to boot
-only on machines that supports the feature -- see for 'la57' flag in
-/proc/cpuinfo.
-
-The plan is to implement boot-time switching between 4- and 5-level paging
-in the future.
+Kernel with CONFIG_X86_5LEVEL=y still able to boot on 4-level hardware.
+In this case additional page table level -- p4d -- will be folded at
+runtime.
 
 == User-space and large virtual address space ==
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb5b5907dbd6..518b41b097dc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1461,6 +1461,8 @@ config X86_PAE
 
 config X86_5LEVEL
 	bool "Enable 5-level page tables support"
+	select DYNAMIC_MEMORY_LAYOUT
+	select SPARSEMEM_VMEMMAP
 	depends on X86_64
 	---help---
 	  5-level paging enables access to larger address space:
@@ -1469,8 +1471,8 @@ config X86_5LEVEL
 
 	  It will be supported by future Intel CPUs.
 
-	  Note: a kernel with this option enabled can only be booted
-	  on machines that support the feature.
+	  A kernel with the option enabled can be booted on machines that
+	  support 4- or 5-level paging.
 
 	  See Documentation/x86/x86_64/5level-paging.txt for more
 	  information.
@@ -1595,10 +1597,6 @@ config ARCH_HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on X86_32 && DISCONTIGMEM
 
-config NEED_NODE_MEMMAP_SIZE
-	def_bool y
-	depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
-
 config ARCH_FLATMEM_ENABLE
 	def_bool y
 	depends on X86_32 && !NUMA
@@ -2174,10 +2172,17 @@ config PHYSICAL_ALIGN
 
 	  Don't change this unless you know what you are doing.
 
+config DYNAMIC_MEMORY_LAYOUT
+	bool
+	---help---
+	  This option makes base addresses of vmalloc and vmemmap as well as
+	  __PAGE_OFFSET movable during boot.
+
 config RANDOMIZE_MEMORY
 	bool "Randomize the kernel memory sections"
 	depends on X86_64
 	depends on RANDOMIZE_BASE
+	select DYNAMIC_MEMORY_LAYOUT
 	default RANDOMIZE_BASE
 	---help---
 	  Randomizes the base virtual address of kernel memory sections
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f484ae0ece93..fa42f895fdde 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
 vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
 ifdef CONFIG_X86_64
-	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
+	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
 	vmlinux-objs-y += $(obj)/mem_encrypt.o
 	vmlinux-objs-y += $(obj)/pgtable_64.o
 endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fc313e29fe2c..fca012baba19 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -33,6 +33,7 @@
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
 #include <asm/bootparam.h>
+#include "pgtable.h"
 
 /*
  * Locally defined symbols should be marked hidden:
@@ -304,55 +305,77 @@ ENTRY(startup_64)
 	/* Set up the stack */
 	leaq	boot_stack_end(%rbx), %rsp
 
-#ifdef CONFIG_X86_5LEVEL
 	/*
-	 * Check if we need to enable 5-level paging.
-	 * RSI holds real mode data and need to be preserved across
-	 * a function call.
+	 * At this point we are in long mode with 4-level paging enabled,
+	 * but we might want to enable 5-level paging or vice versa.
+	 *
+	 * The problem is that we cannot do it directly. Setting or clearing
+	 * CR4.LA57 in long mode would trigger #GP. So we need to switch off
+	 * long mode and paging first.
+	 *
+	 * We also need a trampoline in lower memory to switch over from
+	 * 4- to 5-level paging for cases when the bootloader puts the kernel
+	 * above 4G, but didn't enable 5-level paging for us.
+	 *
+	 * The same trampoline can be used to switch from 5- to 4-level paging
+	 * mode, like when starting 4-level paging kernel via kexec() when
+	 * original kernel worked in 5-level paging mode.
+	 *
+	 * For the trampoline, we need the top page table to reside in lower
+	 * memory as we don't have a way to load 64-bit values into CR3 in
+	 * 32-bit mode.
+	 *
+	 * We go though the trampoline even if we don't have to: if we're
+	 * already in a desired paging mode. This way the trampoline code gets
+	 * tested on every boot.
 	 */
-	pushq	%rsi
-	call	l5_paging_required
-	popq	%rsi
 
-	/* If l5_paging_required() returned zero, we're done here. */
-	cmpq	$0, %rax
-	je	lvl5
+	/* Make sure we have GDT with 32-bit code segment */
+	leaq	gdt(%rip), %rax
+	movq	%rax, gdt64+2(%rip)
+	lgdt	gdt64(%rip)
 
 	/*
-	 * At this point we are in long mode with 4-level paging enabled,
-	 * but we want to enable 5-level paging.
+	 * paging_prepare() sets up the trampoline and checks if we need to
+	 * enable 5-level paging.
 	 *
-	 * The problem is that we cannot do it directly. Setting LA57 in
-	 * long mode would trigger #GP. So we need to switch off long mode
-	 * first.
+	 * Address of the trampoline is returned in RAX.
+	 * Non zero RDX on return means we need to enable 5-level paging.
 	 *
-	 * NOTE: This is not going to work if bootloader put us above 4G
-	 * limit.
-	 *
-	 * The first step is go into compatibility mode.
+	 * RSI holds real mode data and needs to be preserved across
+	 * this function call.
 	 */
+	pushq	%rsi
+	call	paging_prepare
+	popq	%rsi
 
-	/* Clear additional page table */
-	leaq	lvl5_pgtable(%rbx), %rdi
-	xorq	%rax, %rax
-	movq	$(PAGE_SIZE/8), %rcx
-	rep	stosq
+	/* Save the trampoline address in RCX */
+	movq	%rax, %rcx
 
 	/*
-	 * Setup current CR3 as the first and only entry in a new top level
-	 * page table.
+	 * Load the address of trampoline_return() into RDI.
+	 * It will be used by the trampoline to return to the main code.
 	 */
-	movq	%cr3, %rdi
-	leaq	0x7 (%rdi), %rax
-	movq	%rax, lvl5_pgtable(%rbx)
+	leaq	trampoline_return(%rip), %rdi
 
 	/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
 	pushq	$__KERNEL32_CS
-	leaq	compatible_mode(%rip), %rax
+	leaq	TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
 	pushq	%rax
 	lretq
-lvl5:
-#endif
+trampoline_return:
+	/* Restore the stack, the 32-bit trampoline uses its own stack */
+	leaq	boot_stack_end(%rbx), %rsp
+
+	/*
+	 * cleanup_trampoline() would restore trampoline memory.
+	 *
+	 * RSI holds real mode data and needs to be preserved across
+	 * this function call.
+	 */
+	pushq	%rsi
+	call	cleanup_trampoline
+	popq	%rsi
 
 	/* Zero EFLAGS */
 	pushq	$0
@@ -490,46 +513,82 @@ relocated:
 	jmp	*%rax
 
 	.code32
-#ifdef CONFIG_X86_5LEVEL
-compatible_mode:
-	/* Setup data and stack segments */
+/*
+ * This is the 32-bit trampoline that will be copied over to low memory.
+ *
+ * RDI contains the return address (might be above 4G).
+ * ECX contains the base address of the trampoline memory.
+ * Non zero RDX on return means we need to enable 5-level paging.
+ */
+ENTRY(trampoline_32bit_src)
+	/* Set up data and stack segments */
 	movl	$__KERNEL_DS, %eax
 	movl	%eax, %ds
 	movl	%eax, %ss
 
+	/* Set up new stack */
+	leal	TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
+
 	/* Disable paging */
 	movl	%cr0, %eax
 	btrl	$X86_CR0_PG_BIT, %eax
 	movl	%eax, %cr0
 
-	/* Point CR3 to 5-level paging */
-	leal	lvl5_pgtable(%ebx), %eax
-	movl	%eax, %cr3
+	/* Check what paging mode we want to be in after the trampoline */
+	cmpl	$0, %edx
+	jz	1f
 
-	/* Enable PAE and LA57 mode */
+	/* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
+	movl	%cr4, %eax
+	testl	$X86_CR4_LA57, %eax
+	jnz	3f
+	jmp	2f
+1:
+	/* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
 	movl	%cr4, %eax
-	orl	$(X86_CR4_PAE | X86_CR4_LA57), %eax
+	testl	$X86_CR4_LA57, %eax
+	jz	3f
+2:
+	/* Point CR3 to the trampoline's new top level page table */
+	leal	TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
+	movl	%eax, %cr3
+3:
+	/* Enable PAE and LA57 (if required) paging modes */
+	movl	$X86_CR4_PAE, %eax
+	cmpl	$0, %edx
+	jz	1f
+	orl	$X86_CR4_LA57, %eax
+1:
 	movl	%eax, %cr4
 
-	/* Calculate address we are running at */
-	call	1f
-1:	popl	%edi
-	subl	$1b, %edi
+	/* Calculate address of paging_enabled() once we are executing in the trampoline */
+	leal	paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
 
-	/* Prepare stack for far return to Long Mode */
+	/* Prepare the stack for far return to Long Mode */
 	pushl	$__KERNEL_CS
-	leal	lvl5(%edi), %eax
-	push	%eax
+	pushl	%eax
 
-	/* Enable paging back */
+	/* Enable paging again */
 	movl	$(X86_CR0_PG | X86_CR0_PE), %eax
 	movl	%eax, %cr0
 
 	lret
-#endif
 
+	.code64
+paging_enabled:
+	/* Return from the trampoline */
+	jmp	*%rdi
+
+	/*
+	 * The trampoline code has a size limit.
+	 * Make sure we fail to compile if the trampoline code grows
+	 * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
+	 */
+	.org	trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
+
+	.code32
 no_longmode:
-	/* This isn't an x86-64 CPU so hang */
+	/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
 1:
 	hlt
 	jmp	1b
@@ -537,6 +596,11 @@ no_longmode:
 #include "../../kernel/verify_cpu.S"
 
 	.data
+gdt64:
+	.word	gdt_end - gdt
+	.long	0
+	.word	0
+	.quad   0
 gdt:
 	.word	gdt_end - gdt
 	.long	gdt
@@ -585,7 +649,3 @@ boot_stack_end:
 	.balign 4096
 pgtable:
 	.fill BOOT_PGT_SIZE, 1, 0
-#ifdef CONFIG_X86_5LEVEL
-lvl5_pgtable:
-	.fill PAGE_SIZE, 1, 0
-#endif
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 8199a6187251..66e42a098d70 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -46,6 +46,12 @@
 #define STATIC
 #include <linux/decompress/mm.h>
 
+#ifdef CONFIG_X86_5LEVEL
+unsigned int pgtable_l5_enabled __ro_after_init;
+unsigned int pgdir_shift __ro_after_init = 39;
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+#endif
+
 extern unsigned long get_cmd_line_ptr(void);
 
 /* Simplified build-specific string for starting entropy. */
@@ -723,6 +729,14 @@ void choose_random_location(unsigned long input,
 		return;
 	}
 
+#ifdef CONFIG_X86_5LEVEL
+	if (__read_cr4() & X86_CR4_LA57) {
+		pgtable_l5_enabled = 1;
+		pgdir_shift = 48;
+		ptrs_per_p4d = 512;
+	}
+#endif
+
 	boot_params->hdr.loadflags |= KASLR_FLAG;
 
 	/* Prepare to add new identity pagetables on demand. */
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/kaslr_64.c
index b5e5e02f8cde..522d11431433 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -16,13 +16,6 @@
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
-/*
- * The pgtable.h and mm/ident_map.c includes make use of the SME related
- * information which is not used in the compressed image support. Un-define
- * the SME support to avoid any compile and link errors.
- */
-#undef CONFIG_AMD_MEM_ENCRYPT
-
 /* No PAGE_TABLE_ISOLATION support needed either: */
 #undef CONFIG_PAGE_TABLE_ISOLATION
 
@@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info;
 /* Locates and clears a region for a new top level page table. */
 void initialize_identity_maps(void)
 {
-	unsigned long sev_me_mask = get_sev_encryption_mask();
+	/* If running as an SEV guest, the encryption mask is required. */
+	set_sev_encryption_mask();
 
 	/* Init mapping_info with run-time function/buffer pointers. */
 	mapping_info.alloc_pgt_page = alloc_pgt_page;
 	mapping_info.context = &pgt_data;
-	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
-	mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
+	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
+	mapping_info.kernpg_flag = _KERNPG_TABLE;
 
 	/*
 	 * It should be impossible for this not to already be true,
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index 54f5f6625a73..eaa843a52907 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit)
 ENDPROC(get_sev_encryption_bit)
 
 	.code64
-ENTRY(get_sev_encryption_mask)
-	xor	%rax, %rax
-
+ENTRY(set_sev_encryption_mask)
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 	push	%rbp
 	push	%rdx
@@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask)
 	testl	%eax, %eax
 	jz	.Lno_sev_mask
 
-	xor	%rdx, %rdx
-	bts	%rax, %rdx		/* Create the encryption mask */
-	mov	%rdx, %rax		/* ... and return it */
+	bts	%rax, sme_me_mask(%rip)	/* Create the encryption mask */
 
 .Lno_sev_mask:
 	movq	%rbp, %rsp		/* Restore original stack pointer */
@@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask)
 	pop	%rbp
 #endif
 
+	xor	%rax, %rax
 	ret
-ENDPROC(get_sev_encryption_mask)
+ENDPROC(set_sev_encryption_mask)
 
 	.data
 enc_bit:
 	.int	0xffffffff
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	.balign	8
+GLOBAL(sme_me_mask)
+	.quad	0
+#endif
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 252fee320816..8dd1d5ccae58 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -14,6 +14,7 @@
 
 #include "misc.h"
 #include "error.h"
+#include "pgtable.h"
 #include "../string.h"
 #include "../voffset.h"
 
@@ -169,16 +170,6 @@ void __puthex(unsigned long value)
 	}
 }
 
-static bool l5_supported(void)
-{
-	/* Check if leaf 7 is supported. */
-	if (native_cpuid_eax(0) < 7)
-		return 0;
-
-	/* Check if la57 is supported. */
-	return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
-}
-
 #if CONFIG_X86_NEED_RELOCS
 static void handle_relocations(void *output, unsigned long output_len,
 			       unsigned long virt_addr)
@@ -376,12 +367,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 	console_init();
 	debug_putstr("early console in extract_kernel\n");
 
-	if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
-		error("This linux kernel as configured requires 5-level paging\n"
-			"This CPU does not support the required 'cr4.la57' feature\n"
-			"Unable to boot - please use a kernel appropriate for your CPU\n");
-	}
-
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
@@ -392,6 +377,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 	debug_putaddr(output_len);
 	debug_putaddr(kernel_total_size);
 
+#ifdef CONFIG_X86_64
+	/* Report address of 32-bit trampoline */
+	debug_putaddr(trampoline_32bit);
+#endif
+
 	/*
 	 * The memory hole needed for the kernel is the larger of either
 	 * the entire decompressed kernel plus relocation table, or the
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 9d323dc6b159..9e11be4cae19 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,6 +12,11 @@
 #undef CONFIG_PARAVIRT_SPINLOCKS
 #undef CONFIG_KASAN
 
+#ifdef CONFIG_X86_5LEVEL
+/* cpu_feature_enabled() cannot be used that early */
+#define pgtable_l5_enabled __pgtable_l5_enabled
+#endif
+
 #include <linux/linkage.h>
 #include <linux/screen_info.h>
 #include <linux/elf.h>
@@ -109,6 +114,6 @@ static inline void console_init(void)
 { }
 #endif
 
-unsigned long get_sev_encryption_mask(void);
+void set_sev_encryption_mask(void);
 
 #endif
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
new file mode 100644
index 000000000000..91f75638f6e6
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -0,0 +1,20 @@
+#ifndef BOOT_COMPRESSED_PAGETABLE_H
+#define BOOT_COMPRESSED_PAGETABLE_H
+
+#define TRAMPOLINE_32BIT_SIZE		(2 * PAGE_SIZE)
+
+#define TRAMPOLINE_32BIT_PGTABLE_OFFSET	0
+
+#define TRAMPOLINE_32BIT_CODE_OFFSET	PAGE_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE	0x60
+
+#define TRAMPOLINE_32BIT_STACK_END	TRAMPOLINE_32BIT_SIZE
+
+#ifndef __ASSEMBLER__
+
+extern unsigned long *trampoline_32bit;
+
+extern void trampoline_32bit_src(void *return_ptr);
+
+#endif /* __ASSEMBLER__ */
+#endif /* BOOT_COMPRESSED_PAGETABLE_H */
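
(Editor's illustration, not part of the patch: a minimal user-space C sketch of the two-page trampoline buffer that the constants above describe, assuming 4K pages and a made-up base address in low memory. Only the constant names come from the header; everything else is hypothetical.)

	#include <stdio.h>

	#define PAGE_SIZE				4096UL
	#define TRAMPOLINE_32BIT_SIZE			(2 * PAGE_SIZE)
	#define TRAMPOLINE_32BIT_PGTABLE_OFFSET		0
	#define TRAMPOLINE_32BIT_CODE_OFFSET		PAGE_SIZE
	#define TRAMPOLINE_32BIT_CODE_SIZE		0x60
	#define TRAMPOLINE_32BIT_STACK_END		TRAMPOLINE_32BIT_SIZE

	int main(void)
	{
		unsigned long base = 0x9d000;	/* hypothetical spot below the EBDA */

		/* Page 0: top level page table used while switching paging mode */
		printf("pgtable: 0x%lx\n", base + TRAMPOLINE_32BIT_PGTABLE_OFFSET);
		/* Page 1: up to 0x60 bytes of 32-bit trampoline code */
		printf("code:    0x%lx..0x%lx\n",
		       base + TRAMPOLINE_32BIT_CODE_OFFSET,
		       base + TRAMPOLINE_32BIT_CODE_OFFSET + TRAMPOLINE_32BIT_CODE_SIZE);
		/* The trampoline stack grows down from the end of the buffer */
		printf("stack:   0x%lx (top)\n", base + TRAMPOLINE_32BIT_STACK_END);
		return 0;
	}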
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index b4469a37e9a1..32af1cbcd903 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,4 +1,6 @@
 #include <asm/processor.h>
+#include "pgtable.h"
+#include "../string.h"
 
 /*
  * __force_order is used by special_insns.h asm code to force instruction
@@ -9,20 +11,144 @@
  */
 unsigned long __force_order;
 
-int l5_paging_required(void)
+#define BIOS_START_MIN		0x20000U	/* 128K, less than this is insane */
+#define BIOS_START_MAX		0x9f000U	/* 640K, absolute maximum */
+
+struct paging_config {
+	unsigned long trampoline_start;
+	unsigned long l5_required;
+};
+
+/* Buffer to preserve trampoline memory */
+static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
+
+/*
+ * The page table is going to be used instead of page table in the trampoline
+ * memory.
+ *
+ * It must not be in BSS as BSS is cleared after cleanup_trampoline().
+ */
+static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
+
+/*
+ * Trampoline address will be printed by extract_kernel() for debugging
+ * purposes.
+ *
+ * Avoid putting the pointer into .bss as it will be cleared between
+ * paging_prepare() and extract_kernel().
+ */
+unsigned long *trampoline_32bit __section(.data);
+
+struct paging_config paging_prepare(void)
 {
-	/* Check if leaf 7 is supported. */
+	struct paging_config paging_config = {};
+	unsigned long bios_start, ebda_start;
+
+	/*
+	 * Check if LA57 is desired and supported.
+	 *
+	 * There are two parts to the check:
+	 *   - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
+	 *   - if the machine supports 5-level paging:
+	 *     + CPUID leaf 7 is supported
+	 *     + the leaf has the feature bit set
+	 *
+	 * That's substitute for boot_cpu_has() in early boot code.
+	 */
+	if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
+			native_cpuid_eax(0) >= 7 &&
+			(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
+		paging_config.l5_required = 1;
+	}
+
+	/*
+	 * Find a suitable spot for the trampoline.
+	 * This code is based on reserve_bios_regions().
+	 */
+
+	ebda_start = *(unsigned short *)0x40e << 4;
+	bios_start = *(unsigned short *)0x413 << 10;
 
-	if (native_cpuid_eax(0) < 7)
-		return 0;
+	if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+		bios_start = BIOS_START_MAX;
+
+	if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
+		bios_start = ebda_start;
+
+	/* Place the trampoline just below the end of low memory, aligned to 4k */
+	paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE;
+	paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE);
+
+	trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
+
+	/* Preserve trampoline memory */
+	memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
+
+	/* Clear trampoline memory first */
+	memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
+
+	/* Copy trampoline code in place */
+	memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
+			&trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
+
+	/*
+	 * The code below prepares page table in trampoline memory.
+	 *
+	 * The new page table will be used by trampoline code for switching
+	 * from 4- to 5-level paging or vice versa.
+	 *
+	 * If switching is not required, the page table is unused: trampoline
+	 * code wouldn't touch CR3.
+	 */
+
+	/*
+	 * We are not going to use the page table in trampoline memory if we
+	 * are already in the desired paging mode.
+	 */
+	if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57))
+		goto out;
+
+	if (paging_config.l5_required) {
+		/*
+		 * For 4- to 5-level paging transition, set up current CR3 as
+		 * the first and the only entry in a new top-level page table.
+		 */
+		trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC;
+	} else {
+		unsigned long src;
+
+		/*
+		 * For 5- to 4-level paging transition, copy page table pointed
+		 * by first entry in the current top-level page table as our
+		 * new top-level page table.
+		 *
+		 * We cannot just point to the page table from trampoline as it
+		 * may be above 4G.
+		 */
+		src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
+		memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long),
+		       (void *)src, PAGE_SIZE);
+	}
+
+out:
+	return paging_config;
+}
+
+void cleanup_trampoline(void)
+{
+	void *trampoline_pgtable;
 
-	/* Check if la57 is supported. */
-	if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
-		return 0;
+	trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET;
 
-	/* Check if 5-level paging has already been enabled. */
-	if (native_read_cr4() & X86_CR4_LA57)
-		return 0;
+	/*
+	 * Move the top level page table out of trampoline memory,
+	 * if it's there.
+	 */
+	if ((void *)__native_read_cr3() == trampoline_pgtable) {
+		memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE);
+		native_write_cr3((unsigned long)top_pgtable);
+	}
 
-	return 1;
+	/* Restore trampoline memory */
+	memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
 }
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 18ed349b4f83..936e19642eab 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -260,8 +260,13 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 	 * Change top bits to match most significant bit (47th or 56th bit
 	 * depending on paging mode) in the address.
 	 */
+#ifdef CONFIG_X86_5LEVEL
+	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
 	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
 
 	/* If this changed %rcx, it was not canonical */
 	cmpq	%rcx, %r11
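
(Editor's illustration, not part of the patch: the ALTERNATIVE above picks the shift width at boot, and the shl/sar pair sign-extends the address from bit 47 (4-level) or bit 56 (5-level); comparing the result with the original value is the canonical-address check. A user-space C sketch of the same arithmetic, with made-up test values:)

	#include <stdio.h>
	#include <stdint.h>

	/* va_bits is 48 for 4-level paging, 57 for 5-level paging */
	static int is_canonical(uint64_t addr, int va_bits)
	{
		int64_t sext = (int64_t)(addr << (64 - va_bits)) >> (64 - va_bits);

		return (uint64_t)sext == addr;
	}

	int main(void)
	{
		printf("%d\n", is_canonical(0x00007fffffffffffULL, 48));	/* 1 */
		printf("%d\n", is_canonical(0x0000800000000000ULL, 48));	/* 0 */
		printf("%d\n", is_canonical(0x0000800000000000ULL, 57));	/* 1 */
		return 0;
	}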
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 11881726ed37..a303d7b7d763 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -31,6 +31,7 @@
 #include <asm/mmu.h>
 #include <asm/mpspec.h>
 #include <asm/realmode.h>
+#include <asm/x86_init.h>
 
 #ifdef CONFIG_ACPI_APEI
 # include <asm/pgtable_types.h>
@@ -133,6 +134,14 @@ static inline bool acpi_has_cpu_in_madt(void)
 	return !!acpi_lapic;
 }
 
+#define ACPI_HAVE_ARCH_GET_ROOT_POINTER
+static inline u64 acpi_arch_get_root_pointer(void)
+{
+	return x86_init.acpi.get_root_pointer();
+}
+
+void acpi_generic_reduced_hw_init(void);
+
 #else /* !CONFIG_ACPI */
 
 #define acpi_lapic 0
@@ -142,6 +151,8 @@ static inline void acpi_noirq_set(void) { }
 static inline void acpi_disable_pci(void) { }
 static inline void disable_acpi(void) { }
 
+static inline void acpi_generic_reduced_hw_init(void) { }
+
 #endif	/* !CONFIG_ACPI */
 
 #define ARCH_HAS_POWER_INIT	1
diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h
new file mode 100644
index 000000000000..3cb002b1d0f9
--- /dev/null
+++ b/arch/x86/include/asm/intel_pconfig.h
@@ -0,0 +1,65 @@
+#ifndef	_ASM_X86_INTEL_PCONFIG_H
+#define	_ASM_X86_INTEL_PCONFIG_H
+
+#include <asm/asm.h>
+#include <asm/processor.h>
+
+enum pconfig_target {
+	INVALID_TARGET	= 0,
+	MKTME_TARGET	= 1,
+	PCONFIG_TARGET_NR
+};
+
+int pconfig_target_supported(enum pconfig_target target);
+
+enum pconfig_leaf {
+	MKTME_KEY_PROGRAM	= 0,
+	PCONFIG_LEAF_INVALID,
+};
+
+#define PCONFIG ".byte 0x0f, 0x01, 0xc5"
+
+/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */
+
+/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */
+#define MKTME_KEYID_SET_KEY_DIRECT	0
+#define MKTME_KEYID_SET_KEY_RANDOM	1
+#define MKTME_KEYID_CLEAR_KEY		2
+#define MKTME_KEYID_NO_ENCRYPT		3
+
+/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */
+#define MKTME_AES_XTS_128	(1 << 8)
+
+/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */
+#define MKTME_PROG_SUCCESS	0
+#define MKTME_INVALID_PROG_CMD	1
+#define MKTME_ENTROPY_ERROR	2
+#define MKTME_INVALID_KEYID	3
+#define MKTME_INVALID_ENC_ALG	4
+#define MKTME_DEVICE_BUSY	5
+
+/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */
+struct mktme_key_program {
+	u16 keyid;
+	u32 keyid_ctrl;
+	u8 __rsvd[58];
+	u8 key_field_1[64];
+	u8 key_field_2[64];
+} __packed __aligned(256);
+
+static inline int mktme_key_program(struct mktme_key_program *key_program)
+{
+	unsigned long rax = MKTME_KEY_PROGRAM;
+
+	if (!pconfig_target_supported(MKTME_TARGET))
+		return -ENXIO;
+
+	asm volatile(PCONFIG
+		: "=a" (rax), "=b" (key_program)
+		: "0" (rax), "1" (key_program)
+		: "memory", "cc");
+
+	return rax;
+}
+
+#endif	/* _ASM_X86_INTEL_PCONFIG_H */
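
(Editor's illustration, not part of the patch: a minimal, hypothetical sketch of how an in-kernel caller could use the mktme_key_program() helper declared above to program a random key for KeyID 1. Only the structure fields, flags and return codes come from the header; the function name and the choice of KeyID are made up, and error handling is reduced to a single return check.)

	/* Hypothetical example; assumes kernel context and <asm/intel_pconfig.h>. */
	static int example_program_random_key(void)
	{
		/* The type is declared __aligned(256), as PCONFIG demands. */
		static struct mktme_key_program prog;
		int ret;

		prog.keyid = 1;
		prog.keyid_ctrl = MKTME_KEYID_SET_KEY_RANDOM | MKTME_AES_XTS_128;

		ret = mktme_key_program(&prog);
		return ret == MKTME_PROG_SUCCESS ? 0 : -EIO;
	}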
diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
index 460991e3b529..db7ba2feb947 100644
--- a/arch/x86/include/asm/kaslr.h
+++ b/arch/x86/include/asm/kaslr.h
@@ -5,10 +5,6 @@
 unsigned long kaslr_get_random_long(const char *purpose);
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
-extern unsigned long page_offset_base;
-extern unsigned long vmalloc_base;
-extern unsigned long vmemmap_base;
-
 void kernel_randomize_memory(void);
 #else
 static inline void kernel_randomize_memory(void) { }
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 22c5f3e6f820..8fe61ad21047 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -22,6 +22,7 @@
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
 extern u64 sme_me_mask;
+extern bool sev_enabled;
 
 void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
 			 unsigned long decrypted_kernel_vaddr,
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 9ca8dae9c716..939b1cff4a7b 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -11,6 +11,10 @@
 extern unsigned long max_pfn;
 extern unsigned long phys_base;
 
+extern unsigned long page_offset_base;
+extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
+
 static inline unsigned long __phys_addr_nodebug(unsigned long x)
 {
 	unsigned long y = x - __START_KERNEL_map;
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index e1407312c412..2c5a966dc222 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -37,26 +37,24 @@
  * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
  * what Xen requires.
  */
-#ifdef CONFIG_X86_5LEVEL
-#define __PAGE_OFFSET_BASE      _AC(0xff10000000000000, UL)
-#else
-#define __PAGE_OFFSET_BASE      _AC(0xffff880000000000, UL)
-#endif
+#define __PAGE_OFFSET_BASE_L5	_AC(0xff10000000000000, UL)
+#define __PAGE_OFFSET_BASE_L4	_AC(0xffff880000000000, UL)
 
-#ifdef CONFIG_RANDOMIZE_MEMORY
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
 #define __PAGE_OFFSET           page_offset_base
 #else
-#define __PAGE_OFFSET           __PAGE_OFFSET_BASE
-#endif /* CONFIG_RANDOMIZE_MEMORY */
+#define __PAGE_OFFSET           __PAGE_OFFSET_BASE_L4
+#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
 
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#ifdef CONFIG_X86_5LEVEL
+
 #define __PHYSICAL_MASK_SHIFT	52
-#define __VIRTUAL_MASK_SHIFT	56
+
+#ifdef CONFIG_X86_5LEVEL
+#define __VIRTUAL_MASK_SHIFT	(pgtable_l5_enabled ? 56 : 47)
 #else
-#define __PHYSICAL_MASK_SHIFT	46
 #define __VIRTUAL_MASK_SHIFT	47
 #endif
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c83a2f418cea..9be2bf13825b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -568,17 +568,22 @@ static inline p4dval_t p4d_val(p4d_t p4d)
 	return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
 }
 
-static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-	pgdval_t val = native_pgd_val(pgd);
-
-	PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
+	PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
 }
 
-static inline void pgd_clear(pgd_t *pgdp)
-{
-	set_pgd(pgdp, __pgd(0));
-}
+#define set_pgd(pgdp, pgdval) do {					\
+	if (pgtable_l5_enabled)						\
+		__set_pgd(pgdp, pgdval);				\
+	else								\
+		set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd });	\
+} while (0)
+
+#define pgd_clear(pgdp) do {						\
+	if (pgtable_l5_enabled)						\
+		set_pgd(pgdp, __pgd(0));				\
+} while (0)
 
 #endif  /* CONFIG_PGTABLE_LEVELS == 5 */
 
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index aff42e1da6ee..263c142a6a6c 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -167,6 +167,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 #if CONFIG_PGTABLE_LEVELS > 4
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
 {
+	if (!pgtable_l5_enabled)
+		return;
 	paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
 	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
 }
@@ -191,7 +193,8 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
 static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
 				  unsigned long address)
 {
-	___p4d_free_tlb(tlb, p4d);
+	if (pgtable_l5_enabled)
+		___p4d_free_tlb(tlb, p4d);
 }
 
 #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 876b4c77d983..6a59a6d0cc50 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -44,5 +44,6 @@ typedef union {
  */
 #define PTRS_PER_PTE	512
 
+#define MAX_POSSIBLE_PHYSMEM_BITS	36
 
 #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b444d83cfc95..89d5c8886c85 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags;
 
 #ifndef __PAGETABLE_P4D_FOLDED
 #define set_pgd(pgdp, pgd)		native_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd)			native_pgd_clear(pgd)
+#define pgd_clear(pgd)			(pgtable_l5_enabled ? native_pgd_clear(pgd) : 0)
 #endif
 
 #ifndef set_p4d
@@ -859,6 +859,8 @@ static inline unsigned long p4d_index(unsigned long address)
 #if CONFIG_PGTABLE_LEVELS > 4
 static inline int pgd_present(pgd_t pgd)
 {
+	if (!pgtable_l5_enabled)
+		return 1;
 	return pgd_flags(pgd) & _PAGE_PRESENT;
 }
 
@@ -876,6 +878,8 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
 /* to find an entry in a page-table-directory. */
 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 {
+	if (!pgtable_l5_enabled)
+		return (p4d_t *)pgd;
 	return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
 }
 
@@ -883,6 +887,9 @@ static inline int pgd_bad(pgd_t pgd)
 {
 	unsigned long ignore_flags = _PAGE_USER;
 
+	if (!pgtable_l5_enabled)
+		return 0;
+
 	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
 		ignore_flags |= _PAGE_NX;
 
@@ -891,6 +898,8 @@ static inline int pgd_bad(pgd_t pgd)
 
 static inline int pgd_none(pgd_t pgd)
 {
+	if (!pgtable_l5_enabled)
+		return 0;
 	/*
 	 * There is no need to do a workaround for the KNL stray
 	 * A/D bit erratum here. PGDs only point to page tables
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index b3ec519e3982..88a056b01db4 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -34,6 +34,8 @@ static inline void check_pgt_cache(void) { }
 void paging_init(void);
 void sync_initial_page_table(void);
 
+static inline int pgd_large(pgd_t pgd) { return 0; }
+
 /*
  * Define this if things work differently on an i386 and an i486:
  * it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 0777e18a1d23..e3225e83db7d 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -15,6 +15,8 @@
 # include <asm/pgtable-2level_types.h>
 #endif
 
+#define pgtable_l5_enabled 0
+
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 1149d2112b2e..877bc27718ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -218,29 +218,26 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
 
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
-#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
-	p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
-#else
-	*p4dp = p4d;
-#endif
+	pgd_t pgd;
+
+	if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
+		*p4dp = p4d;
+		return;
+	}
+
+	pgd = native_make_pgd(native_p4d_val(p4d));
+	pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd);
+	*p4dp = native_make_p4d(native_pgd_val(pgd));
 }
 
 static inline void native_p4d_clear(p4d_t *p4d)
 {
-#ifdef CONFIG_X86_5LEVEL
 	native_set_p4d(p4d, native_make_p4d(0));
-#else
-	native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
-#endif
 }
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	*pgdp = pti_set_user_pgd(pgdp, pgd);
-#else
-	*pgdp = pgd;
-#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6b8f73dcbc2c..d5c21a382475 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,6 +20,18 @@ typedef unsigned long pgprotval_t;
 
 typedef unsigned long	pgdval_t;
 
+#ifdef CONFIG_X86_5LEVEL
+extern unsigned int __pgtable_l5_enabled;
+#ifndef pgtable_l5_enabled
+#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57)
+#endif
+#else
+#define pgtable_l5_enabled 0
+#endif
+
+extern unsigned int pgdir_shift;
+extern unsigned int ptrs_per_p4d;
+
 #endif	/* !__ASSEMBLY__ */
 
 #define SHARED_KERNEL_PMD	0
@@ -29,24 +41,28 @@ typedef struct { pteval_t pte; } pte_t;
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
-#define PGDIR_SHIFT	48
+#define PGDIR_SHIFT	pgdir_shift
 #define PTRS_PER_PGD	512
 
 /*
  * 4th level page in 5-level paging case
  */
 #define P4D_SHIFT		39
-#define PTRS_PER_P4D		512
-#define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
-#define P4D_MASK		(~(P4D_SIZE - 1))
+#define MAX_PTRS_PER_P4D	512
+#define PTRS_PER_P4D		ptrs_per_p4d
+#define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK		(~(P4D_SIZE - 1))
+
+#define MAX_POSSIBLE_PHYSMEM_BITS	52
 
 #else /* CONFIG_X86_5LEVEL */
 
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
 #define PGDIR_SHIFT	39
 #define PTRS_PER_PGD	512
+#define MAX_PTRS_PER_P4D	1
 
 #endif /* CONFIG_X86_5LEVEL */
 
@@ -82,31 +98,33 @@ typedef struct { pteval_t pte; } pte_t;
  * range must not overlap with anything except the KASAN shadow area, which
  * is correct as KASAN disables KASLR.
  */
-#define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM			(1UL << MAX_PHYSMEM_BITS)
 
-#ifdef CONFIG_X86_5LEVEL
-# define VMALLOC_SIZE_TB	_AC(12800, UL)
-# define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
-# define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
-# define LDT_PGD_ENTRY		_AC(-112, UL)
-# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
-#else
-# define VMALLOC_SIZE_TB	_AC(32, UL)
-# define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
-# define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
-# define LDT_PGD_ENTRY		_AC(-3, UL)
-# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
-#endif
+#define LDT_PGD_ENTRY_L4	-3UL
+#define LDT_PGD_ENTRY_L5	-112UL
+#define LDT_PGD_ENTRY		(pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
+#define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
+
+#define __VMALLOC_BASE_L4	0xffffc90000000000
+#define __VMALLOC_BASE_L5	0xffa0000000000000
+
+#define VMALLOC_SIZE_TB_L4	32UL
+#define VMALLOC_SIZE_TB_L5	12800UL
+
+#define __VMEMMAP_BASE_L4	0xffffea0000000000
+#define __VMEMMAP_BASE_L5	0xffd4000000000000
 
-#ifdef CONFIG_RANDOMIZE_MEMORY
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
 # define VMALLOC_START		vmalloc_base
+# define VMALLOC_SIZE_TB	(pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
 # define VMEMMAP_START		vmemmap_base
 #else
-# define VMALLOC_START		__VMALLOC_BASE
-# define VMEMMAP_START		__VMEMMAP_BASE
-#endif /* CONFIG_RANDOMIZE_MEMORY */
+# define VMALLOC_START		__VMALLOC_BASE_L4
+# define VMALLOC_SIZE_TB	VMALLOC_SIZE_TB_L4
+# define VMEMMAP_START		__VMEMMAP_BASE_L4
+#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
 
-#define VMALLOC_END		(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+#define VMALLOC_END		(VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
 
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index fb3a6de7440b..6847d85400a8 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,12 +53,6 @@
 # define NEED_MOVBE	0
 #endif
 
-#ifdef CONFIG_X86_5LEVEL
-# define NEED_LA57	(1<<(X86_FEATURE_LA57 & 31))
-#else
-# define NEED_LA57	0
-#endif
-
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT
 /* Paravirtualized systems may not have PSE or PGE available */
@@ -104,7 +98,7 @@
 #define REQUIRED_MASK13	0
 #define REQUIRED_MASK14	0
 #define REQUIRED_MASK15	0
-#define REQUIRED_MASK16	(NEED_LA57)
+#define REQUIRED_MASK16	0
 #define REQUIRED_MASK17	0
 #define REQUIRED_MASK18	0
 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4fc1e9d3c43e..4617a2bf123c 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,13 +27,8 @@
 # endif
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
-# ifdef CONFIG_X86_5LEVEL
-#  define MAX_PHYSADDR_BITS	52
-#  define MAX_PHYSMEM_BITS	52
-# else
-#  define MAX_PHYSADDR_BITS	44
-#  define MAX_PHYSMEM_BITS	46
-# endif
+# define MAX_PHYSADDR_BITS	(pgtable_l5_enabled ? 52 : 44)
+# define MAX_PHYSMEM_BITS	(pgtable_l5_enabled ? 52 : 46)
 #endif
 
 #endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 88306054bd98..199e15bd3ec5 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -131,6 +131,16 @@ struct x86_hyper_init {
 };
 
 /**
+ * struct x86_init_acpi - x86 ACPI init functions
+ * @get_root_pointer:		get RSDP address
+ * @reduced_hw_early_init:	hardware reduced platform early init
+ */
+struct x86_init_acpi {
+	u64 (*get_root_pointer)(void);
+	void (*reduced_hw_early_init)(void);
+};
+
+/**
  * struct x86_init_ops - functions for platform specific setup
  *
  */
@@ -144,6 +154,7 @@ struct x86_init_ops {
 	struct x86_init_iommu iommu;
 	struct x86_init_pci pci;
 	struct x86_hyper_init hyper;
+	struct x86_init_acpi acpi;
 };
 
 /**
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2aa92094b59d..7a37d9357bc4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1376,17 +1376,21 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
  *
  * We initialize the Hardware-reduced ACPI model here:
  */
+void __init acpi_generic_reduced_hw_init(void)
+{
+	/*
+	 * Override x86_init functions and bypass legacy PIC in
+	 * hardware reduced ACPI mode.
+	 */
+	x86_init.timers.timer_init	= x86_init_noop;
+	x86_init.irqs.pre_vector_init	= x86_init_noop;
+	legacy_pic			= &null_legacy_pic;
+}
+
 static void __init acpi_reduced_hw_init(void)
 {
-	if (acpi_gbl_reduced_hardware) {
-		/*
-		 * Override x86_init functions and bypass legacy pic
-		 * in Hardware-reduced ACPI mode
-		 */
-		x86_init.timers.timer_init	= x86_init_noop;
-		x86_init.irqs.pre_vector_init	= x86_init_noop;
-		legacy_pic			= &null_legacy_pic;
-	}
+	if (acpi_gbl_reduced_hardware)
+		x86_init.acpi.reduced_hw_early_init();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 570e8bb1f386..a66229f51b12 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -28,7 +28,7 @@ obj-y += cpuid-deps.o
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
 
-obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o intel_pconfig.o
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
 obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index c3af167d0a70..b9693b80fc21 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -509,6 +509,90 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 	}
 }
 
+#define MSR_IA32_TME_ACTIVATE		0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x)		(x & 0x1)
+#define TME_ACTIVATE_ENABLED(x)		(x & 0x2)
+
+#define TME_ACTIVATE_POLICY(x)		((x >> 4) & 0xf)	/* Bits 7:4 */
+#define TME_ACTIVATE_POLICY_AES_XTS_128	0
+
+#define TME_ACTIVATE_KEYID_BITS(x)	((x >> 32) & 0xf)	/* Bits 35:32 */
+
+#define TME_ACTIVATE_CRYPTO_ALGS(x)	((x >> 48) & 0xffff)	/* Bits 63:48 */
+#define TME_ACTIVATE_CRYPTO_AES_XTS_128	1
+
+/* Values for mktme_status (SW only construct) */
+#define MKTME_ENABLED			0
+#define MKTME_DISABLED			1
+#define MKTME_UNINITIALIZED		2
+static int mktme_status = MKTME_UNINITIALIZED;
+
+static void detect_tme(struct cpuinfo_x86 *c)
+{
+	u64 tme_activate, tme_policy, tme_crypto_algs;
+	int keyid_bits = 0, nr_keyids = 0;
+	static u64 tme_activate_cpu0 = 0;
+
+	rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+	if (mktme_status != MKTME_UNINITIALIZED) {
+		if (tme_activate != tme_activate_cpu0) {
+			/* Broken BIOS? */
+			pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
+			pr_err_once("x86/tme: MKTME is not usable\n");
+			mktme_status = MKTME_DISABLED;
+
+			/* Proceed. We may need to exclude bits from x86_phys_bits. */
+		}
+	} else {
+		tme_activate_cpu0 = tme_activate;
+	}
+
+	if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+		pr_info_once("x86/tme: not enabled by BIOS\n");
+		mktme_status = MKTME_DISABLED;
+		return;
+	}
+
+	if (mktme_status != MKTME_UNINITIALIZED)
+		goto detect_keyid_bits;
+
+	pr_info("x86/tme: enabled by BIOS\n");
+
+	tme_policy = TME_ACTIVATE_POLICY(tme_activate);
+	if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
+		pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
+
+	tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
+	if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
+		pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
+				tme_crypto_algs);
+		mktme_status = MKTME_DISABLED;
+	}
+detect_keyid_bits:
+	keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+	nr_keyids = (1UL << keyid_bits) - 1;
+	if (nr_keyids) {
+		pr_info_once("x86/mktme: enabled by BIOS\n");
+		pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
+	} else {
+		pr_info_once("x86/mktme: disabled by BIOS\n");
+	}
+
+	if (mktme_status == MKTME_UNINITIALIZED) {
+		/* MKTME is usable */
+		mktme_status = MKTME_ENABLED;
+	}
+
+	/*
+	 * KeyID bits effectively lower the number of physical address
+	 * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
+	 */
+	c->x86_phys_bits -= keyid_bits;
+}
+
 static void init_intel_energy_perf(struct cpuinfo_x86 *c)
 {
 	u64 epb;
@@ -679,6 +763,9 @@ static void init_intel(struct cpuinfo_x86 *c)
679 if (cpu_has(c, X86_FEATURE_VMX)) 763 if (cpu_has(c, X86_FEATURE_VMX))
680 detect_vmx_virtcap(c); 764 detect_vmx_virtcap(c);
681 765
766 if (cpu_has(c, X86_FEATURE_TME))
767 detect_tme(c);
768
682 init_intel_energy_perf(c); 769 init_intel_energy_perf(c);
683 770
684 init_intel_misc_features(c); 771 init_intel_misc_features(c);
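
The TME_ACTIVATE helpers above are plain shift-and-mask field extraction from a 64-bit MSR image. A user-space sketch of the same decoding, fed a made-up sample value (the value and output are purely illustrative):

#include <stdio.h>
#include <stdint.h>

/* Same bit layout as the TME_ACTIVATE helpers in the hunk above. */
#define TME_LOCKED(x)		((x) & 0x1)
#define TME_ENABLED(x)		((x) & 0x2)
#define TME_POLICY(x)		(((x) >> 4) & 0xf)	/* bits 7:4 */
#define TME_KEYID_BITS(x)	(((x) >> 32) & 0xf)	/* bits 35:32 */
#define TME_CRYPTO_ALGS(x)	(((x) >> 48) & 0xffff)	/* bits 63:48 */

int main(void)
{
	/* Hypothetical MSR contents: locked, enabled, AES-XTS-128, 3 KeyID bits. */
	uint64_t tme_activate = (1ULL << 0) | (1ULL << 1) |
				(3ULL << 32) | (1ULL << 48);
	int keyid_bits = (int)TME_KEYID_BITS(tme_activate);
	int nr_keyids = (1 << keyid_bits) - 1;

	printf("locked=%d enabled=%d policy=%llu\n",
	       (int)TME_LOCKED(tme_activate),
	       (int)!!TME_ENABLED(tme_activate),
	       (unsigned long long)TME_POLICY(tme_activate));
	printf("keyid_bits=%d -> %d usable KeyIDs\n", keyid_bits, nr_keyids);
	printf("crypto algs bitmap=%#llx\n",
	       (unsigned long long)TME_CRYPTO_ALGS(tme_activate));
	return 0;
}

With three KeyID bits, 2^3 - 1 = 7 KeyIDs are available, and detect_tme() subtracts those three bits from x86_phys_bits because they are carved out of the physical address.
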
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
new file mode 100644
index 000000000000..0771a905b286
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pconfig.c
@@ -0,0 +1,82 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Intel PCONFIG instruction support.
4 *
5 * Copyright (C) 2017 Intel Corporation
6 *
7 * Author:
8 * Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
9 */
10
11#include <asm/cpufeature.h>
12#include <asm/intel_pconfig.h>
13
14#define PCONFIG_CPUID 0x1b
15
16#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1)
17
18/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
19enum {
20 PCONFIG_CPUID_SUBLEAF_INVALID = 0,
21 PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
22};
23
24/* Bitmask of supported targets */
25static u64 targets_supported __read_mostly;
26
27int pconfig_target_supported(enum pconfig_target target)
28{
29 /*
30 * We would need to re-think the implementation once we get > 64
31 * PCONFIG targets. Spec allows up to 2^32 targets.
32 */
33 BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
34
35 if (WARN_ON_ONCE(target >= 64))
36 return 0;
37 return targets_supported & (1ULL << target);
38}
39
40static int __init intel_pconfig_init(void)
41{
42 int subleaf;
43
44 if (!boot_cpu_has(X86_FEATURE_PCONFIG))
45 return 0;
46
47 /*
48 * Scan subleafs of PCONFIG CPUID leaf.
49 *
50 * Subleafs of the same type need not be consecutive.
51 *
52 * Stop on the first invalid subleaf type. All subleafs after the first
53 * invalid are invalid too.
54 */
55 for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
56 struct cpuid_regs regs;
57
58 cpuid_count(PCONFIG_CPUID, subleaf,
59 &regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
60
61 switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
62 case PCONFIG_CPUID_SUBLEAF_INVALID:
63 /* Stop on the first invalid subleaf */
64 goto out;
65 case PCONFIG_CPUID_SUBLEAF_TARGETID:
66 /* Mark supported PCONFIG targets */
67 if (regs.ebx < 64)
68 targets_supported |= (1ULL << regs.ebx);
69 if (regs.ecx < 64)
70 targets_supported |= (1ULL << regs.ecx);
71 if (regs.edx < 64)
72 targets_supported |= (1ULL << regs.edx);
73 break;
74 default:
75 /* Unknown CPUID.PCONFIG subleaf: ignore */
76 break;
77 }
78 }
79out:
80 return 0;
81}
82arch_initcall(intel_pconfig_init);
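
Keeping targets_supported as a single 64-bit bitmap makes pconfig_target_supported() a constant-time mask test. A hedged stand-alone sketch of the same enumeration, driven by a fake CPUID table instead of the real CPUID.0x1B instruction:

#include <stdio.h>
#include <stdint.h>

/* Fake CPUID.0x1B subleaf data standing in for the real instruction. */
struct fake_subleaf {
	uint32_t eax;		/* subleaf type: 0 = invalid, 1 = target-id list */
	uint32_t ebx, ecx, edx;	/* up to three target IDs */
};

static const struct fake_subleaf table[] = {
	{ .eax = 1, .ebx = 1, .ecx = 0, .edx = 0 },	/* one supported target */
	{ .eax = 0 },					/* first invalid: stop */
};

static uint64_t targets_supported;

static void scan_targets(void)
{
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if ((table[i].eax & 0xfff) == 0)
			break;		/* invalid subleaf terminates the scan */
		if ((table[i].eax & 0xfff) != 1)
			continue;	/* unknown subleaf type: ignore */
		if (table[i].ebx < 64)
			targets_supported |= 1ULL << table[i].ebx;
		if (table[i].ecx < 64)
			targets_supported |= 1ULL << table[i].ecx;
		if (table[i].edx < 64)
			targets_supported |= 1ULL << table[i].edx;
	}
}

int main(void)
{
	scan_targets();
	printf("target 1 supported: %s\n",
	       (targets_supported & (1ULL << 1)) ? "yes" : "no");
	return 0;
}
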
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3c1eec17312b..42cf2880d0ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1095,19 +1095,7 @@ static void mce_unmap_kpfn(unsigned long pfn)
1095 * a legal address. 1095 * a legal address.
1096 */ 1096 */
1097 1097
1098/*
1099 * Build time check to see if we have a spare virtual bit. Don't want
1100 * to leave this until run time because most developers don't have a
1101 * system that can exercise this code path. This will only become a
1102 * problem if/when we move beyond 5-level page tables.
1103 *
1104 * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
1105 */
1106#if PGDIR_SHIFT + 9 < 63
1107 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); 1098 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
1108#else
1109#error "no unused virtual bit available"
1110#endif
1111 1099
1112 if (set_memory_np(decoy_addr, 1)) 1100 if (set_memory_np(decoy_addr, 1))
1113 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); 1101 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
@@ -2357,6 +2345,12 @@ static __init int mcheck_init_device(void)
2357{ 2345{
2358 int err; 2346 int err;
2359 2347
2348 /*
2349 * Check if we have a spare virtual bit. This will only become
2350 * a problem if/when we move beyond 5-level page tables.
2351 */
2352 MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2353
2360 if (!mce_available(&boot_cpu_data)) { 2354 if (!mce_available(&boot_cpu_data)) {
2361 err = -EIO; 2355 err = -EIO;
2362 goto err_out; 2356 goto err_out;
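
The decoy address relies on at least one virtual-address bit above __VIRTUAL_MASK_SHIFT staying unused, so flipping bit 63 of the direct-map base yields a canonical-looking alias that can safely be marked not-present. A sketch of the arithmetic with made-up constants (PAGE_OFFSET and the shift value are illustrative, not tied to any particular configuration):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
/* Illustrative direct-map base, not the value any given kernel uses. */
#define PAGE_OFFSET	0xffff888000000000ULL

int main(void)
{
	uint64_t pfn = 0x12345;
	/* Same expression as mce_unmap_kpfn(): flip bit 63 of the base. */
	uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ (1ULL << 63));
	/* Stand-in for __VIRTUAL_MASK_SHIFT: 56 with 5-level, 47 with 4-level. */
	int virtual_mask_shift = 56;

	printf("direct map: %#llx\n",
	       (unsigned long long)((pfn << PAGE_SHIFT) + PAGE_OFFSET));
	printf("decoy:      %#llx\n", (unsigned long long)decoy);
	printf("spare virtual bit: %s\n",
	       virtual_mask_shift < 63 ? "yes" : "no");
	return 0;
}
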
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 71c11ad5643e..6a2cb1442e05 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -924,6 +924,24 @@ static int __init parse_memmap_one(char *p)
924 } else if (*p == '!') { 924 } else if (*p == '!') {
925 start_at = memparse(p+1, &p); 925 start_at = memparse(p+1, &p);
926 e820__range_add(start_at, mem_size, E820_TYPE_PRAM); 926 e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
927 } else if (*p == '%') {
928 enum e820_type from = 0, to = 0;
929
930 start_at = memparse(p + 1, &p);
931 if (*p == '-')
932 from = simple_strtoull(p + 1, &p, 0);
933 if (*p == '+')
934 to = simple_strtoull(p + 1, &p, 0);
935 if (*p != '\0')
936 return -EINVAL;
937 if (from && to)
938 e820__range_update(start_at, mem_size, from, to);
939 else if (to)
940 e820__range_add(start_at, mem_size, to);
941 else if (from)
942 e820__range_remove(start_at, mem_size, from, 1);
943 else
944 e820__range_remove(start_at, mem_size, 0, 0);
927 } else { 945 } else {
928 e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); 946 e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
929 } 947 }
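
The new '%' branch accepts <size>%<offset>, optionally followed by -<oldtype> and +<newtype>. A stand-alone sketch of that parse, with memparse() replaced by a minimal strtoull-based helper and a made-up example string:

#include <stdio.h>
#include <stdlib.h>

/* Tiny stand-in for the kernel's memparse(): number plus optional K/M/G. */
static unsigned long long memparse(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': v <<= 10; /* fall through */
	case 'M': v <<= 10; /* fall through */
	case 'K': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	/* e.g. convert 512M at 0x100000000 from type 3 to type 12 */
	char arg[] = "512M%0x100000000-3+12";
	char *p = arg;
	unsigned long long size, start;
	int from = 0, to = 0;

	size = memparse(p, &p);
	if (*p++ != '%')
		return 1;
	start = memparse(p, &p);
	if (*p == '-')
		from = (int)strtoull(p + 1, &p, 0);
	if (*p == '+')
		to = (int)strtoull(p + 1, &p, 0);
	if (*p != '\0')
		return 1;

	printf("size=%#llx start=%#llx from=%d to=%d\n", size, start, from, to);
	return 0;
}
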
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7ba5d819ebe3..0c855deee165 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -32,6 +32,11 @@
32#include <asm/microcode.h> 32#include <asm/microcode.h>
33#include <asm/kasan.h> 33#include <asm/kasan.h>
34 34
35#ifdef CONFIG_X86_5LEVEL
36#undef pgtable_l5_enabled
37#define pgtable_l5_enabled __pgtable_l5_enabled
38#endif
39
35/* 40/*
36 * Manage page tables very early on. 41 * Manage page tables very early on.
37 */ 42 */
@@ -39,6 +44,24 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
39static unsigned int __initdata next_early_pgt; 44static unsigned int __initdata next_early_pgt;
40pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); 45pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
41 46
47#ifdef CONFIG_X86_5LEVEL
48unsigned int __pgtable_l5_enabled __ro_after_init;
49EXPORT_SYMBOL(__pgtable_l5_enabled);
50unsigned int pgdir_shift __ro_after_init = 39;
51EXPORT_SYMBOL(pgdir_shift);
52unsigned int ptrs_per_p4d __ro_after_init = 1;
53EXPORT_SYMBOL(ptrs_per_p4d);
54#endif
55
56#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
57unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
58EXPORT_SYMBOL(page_offset_base);
59unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
60EXPORT_SYMBOL(vmalloc_base);
61unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
62EXPORT_SYMBOL(vmemmap_base);
63#endif
64
42#define __head __section(.head.text) 65#define __head __section(.head.text)
43 66
44static void __head *fixup_pointer(void *ptr, unsigned long physaddr) 67static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
@@ -46,6 +69,41 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
46 return ptr - (void *)_text + (void *)physaddr; 69 return ptr - (void *)_text + (void *)physaddr;
47} 70}
48 71
72static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
73{
74 return fixup_pointer(ptr, physaddr);
75}
76
77#ifdef CONFIG_X86_5LEVEL
78static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
79{
80 return fixup_pointer(ptr, physaddr);
81}
82
83static bool __head check_la57_support(unsigned long physaddr)
84{
85 if (native_cpuid_eax(0) < 7)
86 return false;
87
88 if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
89 return false;
90
91 *fixup_int(&pgtable_l5_enabled, physaddr) = 1;
92 *fixup_int(&pgdir_shift, physaddr) = 48;
93 *fixup_int(&ptrs_per_p4d, physaddr) = 512;
94 *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
95 *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
96 *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
97
98 return true;
99}
100#else
101static bool __head check_la57_support(unsigned long physaddr)
102{
103 return false;
104}
105#endif
106
49unsigned long __head __startup_64(unsigned long physaddr, 107unsigned long __head __startup_64(unsigned long physaddr,
50 struct boot_params *bp) 108 struct boot_params *bp)
51{ 109{
@@ -55,9 +113,12 @@ unsigned long __head __startup_64(unsigned long physaddr,
55 p4dval_t *p4d; 113 p4dval_t *p4d;
56 pudval_t *pud; 114 pudval_t *pud;
57 pmdval_t *pmd, pmd_entry; 115 pmdval_t *pmd, pmd_entry;
116 bool la57;
58 int i; 117 int i;
59 unsigned int *next_pgt_ptr; 118 unsigned int *next_pgt_ptr;
60 119
120 la57 = check_la57_support(physaddr);
121
61 /* Is the address too large? */ 122 /* Is the address too large? */
62 if (physaddr >> MAX_PHYSMEM_BITS) 123 if (physaddr >> MAX_PHYSMEM_BITS)
63 for (;;); 124 for (;;);
@@ -81,9 +142,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
81 /* Fixup the physical addresses in the page table */ 142 /* Fixup the physical addresses in the page table */
82 143
83 pgd = fixup_pointer(&early_top_pgt, physaddr); 144 pgd = fixup_pointer(&early_top_pgt, physaddr);
84 pgd[pgd_index(__START_KERNEL_map)] += load_delta; 145 p = pgd + pgd_index(__START_KERNEL_map);
85 146 if (la57)
86 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 147 *p = (unsigned long)level4_kernel_pgt;
148 else
149 *p = (unsigned long)level3_kernel_pgt;
150 *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
151
152 if (la57) {
87 p4d = fixup_pointer(&level4_kernel_pgt, physaddr); 153 p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
88 p4d[511] += load_delta; 154 p4d[511] += load_delta;
89 } 155 }
@@ -108,7 +174,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
108 174
109 pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); 175 pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
110 176
111 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 177 if (la57) {
112 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); 178 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
113 179
114 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 180 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
@@ -154,8 +220,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
154 * Fixup phys_base - remove the memory encryption mask to obtain 220 * Fixup phys_base - remove the memory encryption mask to obtain
155 * the true physical address. 221 * the true physical address.
156 */ 222 */
157 p = fixup_pointer(&phys_base, physaddr); 223 *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
158 *p += load_delta - sme_get_me_mask();
159 224
160 /* Encrypt the kernel and related (if SME is active) */ 225 /* Encrypt the kernel and related (if SME is active) */
161 sme_encrypt_kernel(bp); 226 sme_encrypt_kernel(bp);
@@ -206,7 +271,7 @@ again:
206 * critical -- __PAGE_OFFSET would point us back into the dynamic 271 * critical -- __PAGE_OFFSET would point us back into the dynamic
207 * range and we might end up looping forever... 272 * range and we might end up looping forever...
208 */ 273 */
209 if (!IS_ENABLED(CONFIG_X86_5LEVEL)) 274 if (!pgtable_l5_enabled)
210 p4d_p = pgd_p; 275 p4d_p = pgd_p;
211 else if (pgd) 276 else if (pgd)
212 p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); 277 p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
@@ -322,7 +387,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
322 BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); 387 BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
323 BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); 388 BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
324 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); 389 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
325 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 390 MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
326 (__START_KERNEL & PGDIR_MASK))); 391 (__START_KERNEL & PGDIR_MASK)));
327 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); 392 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
328 393
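
__startup_64() runs from the physical load address before the kernel's virtual mappings are final, so globals such as the new paging-mode variables are only reached through fixup_pointer()/fixup_int()/fixup_long(), which rebase a link-time address onto physaddr. The arithmetic, reduced to a sketch with made-up addresses:

#include <stdio.h>
#include <stdint.h>

/* Link-time (virtual) base of the image vs. the physical load address. */
#define LINK_TEXT_BASE	0xffffffff81000000ULL	/* illustrative _text */
#define LOAD_PHYS_BASE	0x0000000004000000ULL	/* illustrative physaddr */

/* Same idea as fixup_pointer(): rebase a link-time address onto physaddr. */
static uint64_t fixup_addr(uint64_t linktime, uint64_t physaddr)
{
	return linktime - LINK_TEXT_BASE + physaddr;
}

int main(void)
{
	/* Pretend this is the link-time address of a global such as pgdir_shift. */
	uint64_t var = LINK_TEXT_BASE + 0x2340;

	printf("link-time %#llx -> load-time %#llx\n",
	       (unsigned long long)var,
	       (unsigned long long)fixup_addr(var, LOAD_PHYS_BASE));
	return 0;
}

check_la57_support() uses exactly this mechanism to set __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d and the layout base addresses in place before paging is finally switched over.
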
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0f545b3cf926..48385c1074a5 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -39,12 +39,12 @@
39 * 39 *
40 */ 40 */
41 41
42#define l4_index(x) (((x) >> 39) & 511)
42#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) 43#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
43 44
44#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) 45L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
45PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) 46L4_START_KERNEL = l4_index(__START_KERNEL_map)
46PGD_START_KERNEL = pgd_index(__START_KERNEL_map) 47
47#endif
48L3_START_KERNEL = pud_index(__START_KERNEL_map) 48L3_START_KERNEL = pud_index(__START_KERNEL_map)
49 49
50 .text 50 .text
@@ -125,7 +125,10 @@ ENTRY(secondary_startup_64)
125 /* Enable PAE mode, PGE and LA57 */ 125 /* Enable PAE mode, PGE and LA57 */
126 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx 126 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
127#ifdef CONFIG_X86_5LEVEL 127#ifdef CONFIG_X86_5LEVEL
128 testl $1, __pgtable_l5_enabled(%rip)
129 jz 1f
128 orl $X86_CR4_LA57, %ecx 130 orl $X86_CR4_LA57, %ecx
1311:
129#endif 132#endif
130 movq %rcx, %cr4 133 movq %rcx, %cr4
131 134
@@ -374,12 +377,7 @@ GLOBAL(name)
374 377
375 __INITDATA 378 __INITDATA
376NEXT_PGD_PAGE(early_top_pgt) 379NEXT_PGD_PAGE(early_top_pgt)
377 .fill 511,8,0 380 .fill 512,8,0
378#ifdef CONFIG_X86_5LEVEL
379 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
380#else
381 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
382#endif
383 .fill PTI_USER_PGD_FILL,8,0 381 .fill PTI_USER_PGD_FILL,8,0
384 382
385NEXT_PAGE(early_dynamic_pgts) 383NEXT_PAGE(early_dynamic_pgts)
@@ -390,9 +388,9 @@ NEXT_PAGE(early_dynamic_pgts)
390#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) 388#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
391NEXT_PGD_PAGE(init_top_pgt) 389NEXT_PGD_PAGE(init_top_pgt)
392 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 390 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
393 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 391 .org init_top_pgt + L4_PAGE_OFFSET*8, 0
394 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 392 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
395 .org init_top_pgt + PGD_START_KERNEL*8, 0 393 .org init_top_pgt + L4_START_KERNEL*8, 0
396 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 394 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
397 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 395 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
398 .fill PTI_USER_PGD_FILL,8,0 396 .fill PTI_USER_PGD_FILL,8,0
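
l4_index() hard-codes the 4-level split (bits 47:39) because these statically built entries are consumed before any switch to 5-level paging. The constants can be checked with ordinary arithmetic (the page-offset base below is illustrative):

#include <stdio.h>

#define L4_INDEX(x)	(((x) >> 39) & 511)

int main(void)
{
	unsigned long long page_offset_l4 = 0xffff888000000000ULL;	/* illustrative */
	unsigned long long start_kernel_map = 0xffffffff80000000ULL;

	/* (2^48 - 2G) >> 39, masked to 9 bits, gives 511, as the comment says. */
	printf("L4_PAGE_OFFSET = %llu\n", L4_INDEX(page_offset_l4));
	printf("L4_START_KERNEL = %llu\n", L4_INDEX(start_kernel_map));
	return 0;
}
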
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index faeea0b5cbd0..93bd4fb603d1 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -350,6 +350,7 @@ void arch_crash_save_vmcoreinfo(void)
350{ 350{
351 VMCOREINFO_NUMBER(phys_base); 351 VMCOREINFO_NUMBER(phys_base);
352 VMCOREINFO_SYMBOL(init_top_pgt); 352 VMCOREINFO_SYMBOL(init_top_pgt);
353 VMCOREINFO_NUMBER(pgtable_l5_enabled);
353 354
354#ifdef CONFIG_NUMA 355#ifdef CONFIG_NUMA
355 VMCOREINFO_SYMBOL(node_data); 356 VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4c616be28506..6285697b6e56 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -189,9 +189,7 @@ struct ist_info ist_info;
189#endif 189#endif
190 190
191#else 191#else
192struct cpuinfo_x86 boot_cpu_data __read_mostly = { 192struct cpuinfo_x86 boot_cpu_data __read_mostly;
193 .x86_phys_bits = MAX_PHYSMEM_BITS,
194};
195EXPORT_SYMBOL(boot_cpu_data); 193EXPORT_SYMBOL(boot_cpu_data);
196#endif 194#endif
197 195
@@ -851,6 +849,7 @@ void __init setup_arch(char **cmdline_p)
851 __flush_tlb_all(); 849 __flush_tlb_all();
852#else 850#else
853 printk(KERN_INFO "Command line: %s\n", boot_command_line); 851 printk(KERN_INFO "Command line: %s\n", boot_command_line);
852 boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
854#endif 853#endif
855 854
856 /* 855 /*
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 2bccd03bd654..ebda84a91510 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -8,6 +8,7 @@
8#include <linux/export.h> 8#include <linux/export.h>
9#include <linux/pci.h> 9#include <linux/pci.h>
10 10
11#include <asm/acpi.h>
11#include <asm/bios_ebda.h> 12#include <asm/bios_ebda.h>
12#include <asm/paravirt.h> 13#include <asm/paravirt.h>
13#include <asm/pci_x86.h> 14#include <asm/pci_x86.h>
@@ -26,10 +27,11 @@
26 27
27void x86_init_noop(void) { } 28void x86_init_noop(void) { }
28void __init x86_init_uint_noop(unsigned int unused) { } 29void __init x86_init_uint_noop(unsigned int unused) { }
29int __init iommu_init_noop(void) { return 0; } 30static int __init iommu_init_noop(void) { return 0; }
30void iommu_shutdown_noop(void) { } 31static void iommu_shutdown_noop(void) { }
31bool __init bool_x86_init_noop(void) { return false; } 32static bool __init bool_x86_init_noop(void) { return false; }
32void x86_op_int_noop(int cpu) { } 33static void x86_op_int_noop(int cpu) { }
34static u64 u64_x86_init_noop(void) { return 0; }
33 35
34/* 36/*
35 * The platform setup functions are preset with the default functions 37 * The platform setup functions are preset with the default functions
@@ -91,6 +93,11 @@ struct x86_init_ops x86_init __initdata = {
91 .x2apic_available = bool_x86_init_noop, 93 .x2apic_available = bool_x86_init_noop,
92 .init_mem_mapping = x86_init_noop, 94 .init_mem_mapping = x86_init_noop,
93 }, 95 },
96
97 .acpi = {
98 .get_root_pointer = u64_x86_init_noop,
99 .reduced_hw_early_init = acpi_generic_reduced_hw_init,
100 },
94}; 101};
95 102
96struct x86_cpuinit_ops x86_cpuinit = { 103struct x86_cpuinit_ops x86_cpuinit = {
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 27e9e90a8d35..4b101dd6e52f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,12 +1,15 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c 2# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
3KCOV_INSTRUMENT_tlb.o := n 3KCOV_INSTRUMENT_tlb.o := n
4KCOV_INSTRUMENT_mem_encrypt.o := n 4KCOV_INSTRUMENT_mem_encrypt.o := n
5KCOV_INSTRUMENT_mem_encrypt_identity.o := n
5 6
6KASAN_SANITIZE_mem_encrypt.o := n 7KASAN_SANITIZE_mem_encrypt.o := n
8KASAN_SANITIZE_mem_encrypt_identity.o := n
7 9
8ifdef CONFIG_FUNCTION_TRACER 10ifdef CONFIG_FUNCTION_TRACER
9CFLAGS_REMOVE_mem_encrypt.o = -pg 11CFLAGS_REMOVE_mem_encrypt.o = -pg
12CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
10endif 13endif
11 14
12obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 15obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
@@ -16,6 +19,7 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
16nostackp := $(call cc-option, -fno-stack-protector) 19nostackp := $(call cc-option, -fno-stack-protector)
17CFLAGS_physaddr.o := $(nostackp) 20CFLAGS_physaddr.o := $(nostackp)
18CFLAGS_setup_nx.o := $(nostackp) 21CFLAGS_setup_nx.o := $(nostackp)
22CFLAGS_mem_encrypt_identity.o := $(nostackp)
19 23
20CFLAGS_fault.o := -I$(src)/../include/asm/trace 24CFLAGS_fault.o := -I$(src)/../include/asm/trace
21 25
@@ -47,4 +51,5 @@ obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
47obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o 51obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
48 52
49obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o 53obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
54obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
50obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o 55obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 421f2664ffa0..51a6f92da2bf 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -72,6 +72,31 @@ static const struct file_operations ptdump_curusr_fops = {
72}; 72};
73#endif 73#endif
74 74
75#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
76extern pgd_t *efi_pgd;
77static struct dentry *pe_efi;
78
79static int ptdump_show_efi(struct seq_file *m, void *v)
80{
81 if (efi_pgd)
82 ptdump_walk_pgd_level_debugfs(m, efi_pgd, false);
83 return 0;
84}
85
86static int ptdump_open_efi(struct inode *inode, struct file *filp)
87{
88 return single_open(filp, ptdump_show_efi, NULL);
89}
90
91static const struct file_operations ptdump_efi_fops = {
92 .owner = THIS_MODULE,
93 .open = ptdump_open_efi,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98#endif
99
75static struct dentry *dir, *pe_knl, *pe_curknl; 100static struct dentry *dir, *pe_knl, *pe_curknl;
76 101
77static int __init pt_dump_debug_init(void) 102static int __init pt_dump_debug_init(void)
@@ -96,6 +121,13 @@ static int __init pt_dump_debug_init(void)
96 if (!pe_curusr) 121 if (!pe_curusr)
97 goto err; 122 goto err;
98#endif 123#endif
124
125#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
126 pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops);
127 if (!pe_efi)
128 goto err;
129#endif
130
99 return 0; 131 return 0;
100err: 132err:
101 debugfs_remove_recursive(dir); 133 debugfs_remove_recursive(dir);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2a4849e92831..62a7e9f65dec 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -29,6 +29,7 @@
29struct pg_state { 29struct pg_state {
30 int level; 30 int level;
31 pgprot_t current_prot; 31 pgprot_t current_prot;
32 pgprotval_t effective_prot;
32 unsigned long start_address; 33 unsigned long start_address;
33 unsigned long current_address; 34 unsigned long current_address;
34 const struct addr_marker *marker; 35 const struct addr_marker *marker;
@@ -85,11 +86,15 @@ static struct addr_marker address_markers[] = {
85 [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, 86 [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
86 [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, 87 [VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
87#ifdef CONFIG_KASAN 88#ifdef CONFIG_KASAN
88 [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, 89 /*
89 [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, 90 * These fields get initialized with the (dynamic)
91 * KASAN_SHADOW_{START,END} values in pt_dump_init().
92 */
93 [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" },
94 [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" },
90#endif 95#endif
91#ifdef CONFIG_MODIFY_LDT_SYSCALL 96#ifdef CONFIG_MODIFY_LDT_SYSCALL
92 [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, 97 [LDT_NR] = { 0UL, "LDT remap" },
93#endif 98#endif
94 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, 99 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
95#ifdef CONFIG_X86_ESPFIX64 100#ifdef CONFIG_X86_ESPFIX64
@@ -231,9 +236,9 @@ static unsigned long normalize_addr(unsigned long u)
231 * print what we collected so far. 236 * print what we collected so far.
232 */ 237 */
233static void note_page(struct seq_file *m, struct pg_state *st, 238static void note_page(struct seq_file *m, struct pg_state *st,
234 pgprot_t new_prot, int level) 239 pgprot_t new_prot, pgprotval_t new_eff, int level)
235{ 240{
236 pgprotval_t prot, cur; 241 pgprotval_t prot, cur, eff;
237 static const char units[] = "BKMGTPE"; 242 static const char units[] = "BKMGTPE";
238 243
239 /* 244 /*
@@ -243,23 +248,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,
243 */ 248 */
244 prot = pgprot_val(new_prot); 249 prot = pgprot_val(new_prot);
245 cur = pgprot_val(st->current_prot); 250 cur = pgprot_val(st->current_prot);
251 eff = st->effective_prot;
246 252
247 if (!st->level) { 253 if (!st->level) {
248 /* First entry */ 254 /* First entry */
249 st->current_prot = new_prot; 255 st->current_prot = new_prot;
256 st->effective_prot = new_eff;
250 st->level = level; 257 st->level = level;
251 st->marker = address_markers; 258 st->marker = address_markers;
252 st->lines = 0; 259 st->lines = 0;
253 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 260 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
254 st->marker->name); 261 st->marker->name);
255 } else if (prot != cur || level != st->level || 262 } else if (prot != cur || new_eff != eff || level != st->level ||
256 st->current_address >= st->marker[1].start_address) { 263 st->current_address >= st->marker[1].start_address) {
257 const char *unit = units; 264 const char *unit = units;
258 unsigned long delta; 265 unsigned long delta;
259 int width = sizeof(unsigned long) * 2; 266 int width = sizeof(unsigned long) * 2;
260 pgprotval_t pr = pgprot_val(st->current_prot);
261 267
262 if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { 268 if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
263 WARN_ONCE(1, 269 WARN_ONCE(1,
264 "x86/mm: Found insecure W+X mapping at address %p/%pS\n", 270 "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
265 (void *)st->start_address, 271 (void *)st->start_address,
@@ -313,21 +319,30 @@ static void note_page(struct seq_file *m, struct pg_state *st,
313 319
314 st->start_address = st->current_address; 320 st->start_address = st->current_address;
315 st->current_prot = new_prot; 321 st->current_prot = new_prot;
322 st->effective_prot = new_eff;
316 st->level = level; 323 st->level = level;
317 } 324 }
318} 325}
319 326
320static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) 327static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
328{
329 return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
330 ((prot1 | prot2) & _PAGE_NX);
331}
332
333static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
334 pgprotval_t eff_in, unsigned long P)
321{ 335{
322 int i; 336 int i;
323 pte_t *start; 337 pte_t *start;
324 pgprotval_t prot; 338 pgprotval_t prot, eff;
325 339
326 start = (pte_t *)pmd_page_vaddr(addr); 340 start = (pte_t *)pmd_page_vaddr(addr);
327 for (i = 0; i < PTRS_PER_PTE; i++) { 341 for (i = 0; i < PTRS_PER_PTE; i++) {
328 prot = pte_flags(*start); 342 prot = pte_flags(*start);
343 eff = effective_prot(eff_in, prot);
329 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 344 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
330 note_page(m, st, __pgprot(prot), 5); 345 note_page(m, st, __pgprot(prot), eff, 5);
331 start++; 346 start++;
332 } 347 }
333} 348}
@@ -344,12 +359,10 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
344 void *pt) 359 void *pt)
345{ 360{
346 if (__pa(pt) == __pa(kasan_zero_pmd) || 361 if (__pa(pt) == __pa(kasan_zero_pmd) ||
347#ifdef CONFIG_X86_5LEVEL 362 (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) ||
348 __pa(pt) == __pa(kasan_zero_p4d) ||
349#endif
350 __pa(pt) == __pa(kasan_zero_pud)) { 363 __pa(pt) == __pa(kasan_zero_pud)) {
351 pgprotval_t prot = pte_flags(kasan_zero_pte[0]); 364 pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
352 note_page(m, st, __pgprot(prot), 5); 365 note_page(m, st, __pgprot(prot), 0, 5);
353 return true; 366 return true;
354 } 367 }
355 return false; 368 return false;
@@ -364,42 +377,45 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
364 377
365#if PTRS_PER_PMD > 1 378#if PTRS_PER_PMD > 1
366 379
367static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) 380static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
381 pgprotval_t eff_in, unsigned long P)
368{ 382{
369 int i; 383 int i;
370 pmd_t *start, *pmd_start; 384 pmd_t *start, *pmd_start;
371 pgprotval_t prot; 385 pgprotval_t prot, eff;
372 386
373 pmd_start = start = (pmd_t *)pud_page_vaddr(addr); 387 pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
374 for (i = 0; i < PTRS_PER_PMD; i++) { 388 for (i = 0; i < PTRS_PER_PMD; i++) {
375 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 389 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
376 if (!pmd_none(*start)) { 390 if (!pmd_none(*start)) {
391 prot = pmd_flags(*start);
392 eff = effective_prot(eff_in, prot);
377 if (pmd_large(*start) || !pmd_present(*start)) { 393 if (pmd_large(*start) || !pmd_present(*start)) {
378 prot = pmd_flags(*start); 394 note_page(m, st, __pgprot(prot), eff, 4);
379 note_page(m, st, __pgprot(prot), 4);
380 } else if (!kasan_page_table(m, st, pmd_start)) { 395 } else if (!kasan_page_table(m, st, pmd_start)) {
381 walk_pte_level(m, st, *start, 396 walk_pte_level(m, st, *start, eff,
382 P + i * PMD_LEVEL_MULT); 397 P + i * PMD_LEVEL_MULT);
383 } 398 }
384 } else 399 } else
385 note_page(m, st, __pgprot(0), 4); 400 note_page(m, st, __pgprot(0), 0, 4);
386 start++; 401 start++;
387 } 402 }
388} 403}
389 404
390#else 405#else
391#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) 406#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
392#define pud_large(a) pmd_large(__pmd(pud_val(a))) 407#define pud_large(a) pmd_large(__pmd(pud_val(a)))
393#define pud_none(a) pmd_none(__pmd(pud_val(a))) 408#define pud_none(a) pmd_none(__pmd(pud_val(a)))
394#endif 409#endif
395 410
396#if PTRS_PER_PUD > 1 411#if PTRS_PER_PUD > 1
397 412
398static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) 413static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
414 pgprotval_t eff_in, unsigned long P)
399{ 415{
400 int i; 416 int i;
401 pud_t *start, *pud_start; 417 pud_t *start, *pud_start;
402 pgprotval_t prot; 418 pgprotval_t prot, eff;
403 pud_t *prev_pud = NULL; 419 pud_t *prev_pud = NULL;
404 420
405 pud_start = start = (pud_t *)p4d_page_vaddr(addr); 421 pud_start = start = (pud_t *)p4d_page_vaddr(addr);
@@ -407,15 +423,16 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
407 for (i = 0; i < PTRS_PER_PUD; i++) { 423 for (i = 0; i < PTRS_PER_PUD; i++) {
408 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 424 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
409 if (!pud_none(*start)) { 425 if (!pud_none(*start)) {
426 prot = pud_flags(*start);
427 eff = effective_prot(eff_in, prot);
410 if (pud_large(*start) || !pud_present(*start)) { 428 if (pud_large(*start) || !pud_present(*start)) {
411 prot = pud_flags(*start); 429 note_page(m, st, __pgprot(prot), eff, 3);
412 note_page(m, st, __pgprot(prot), 3);
413 } else if (!kasan_page_table(m, st, pud_start)) { 430 } else if (!kasan_page_table(m, st, pud_start)) {
414 walk_pmd_level(m, st, *start, 431 walk_pmd_level(m, st, *start, eff,
415 P + i * PUD_LEVEL_MULT); 432 P + i * PUD_LEVEL_MULT);
416 } 433 }
417 } else 434 } else
418 note_page(m, st, __pgprot(0), 3); 435 note_page(m, st, __pgprot(0), 0, 3);
419 436
420 prev_pud = start; 437 prev_pud = start;
421 start++; 438 start++;
@@ -423,43 +440,43 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
423} 440}
424 441
425#else 442#else
426#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) 443#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
427#define p4d_large(a) pud_large(__pud(p4d_val(a))) 444#define p4d_large(a) pud_large(__pud(p4d_val(a)))
428#define p4d_none(a) pud_none(__pud(p4d_val(a))) 445#define p4d_none(a) pud_none(__pud(p4d_val(a)))
429#endif 446#endif
430 447
431#if PTRS_PER_P4D > 1 448static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
432 449 pgprotval_t eff_in, unsigned long P)
433static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
434{ 450{
435 int i; 451 int i;
436 p4d_t *start, *p4d_start; 452 p4d_t *start, *p4d_start;
437 pgprotval_t prot; 453 pgprotval_t prot, eff;
454
455 if (PTRS_PER_P4D == 1)
456 return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);
438 457
439 p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); 458 p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
440 459
441 for (i = 0; i < PTRS_PER_P4D; i++) { 460 for (i = 0; i < PTRS_PER_P4D; i++) {
442 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); 461 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
443 if (!p4d_none(*start)) { 462 if (!p4d_none(*start)) {
463 prot = p4d_flags(*start);
464 eff = effective_prot(eff_in, prot);
444 if (p4d_large(*start) || !p4d_present(*start)) { 465 if (p4d_large(*start) || !p4d_present(*start)) {
445 prot = p4d_flags(*start); 466 note_page(m, st, __pgprot(prot), eff, 2);
446 note_page(m, st, __pgprot(prot), 2);
447 } else if (!kasan_page_table(m, st, p4d_start)) { 467 } else if (!kasan_page_table(m, st, p4d_start)) {
448 walk_pud_level(m, st, *start, 468 walk_pud_level(m, st, *start, eff,
449 P + i * P4D_LEVEL_MULT); 469 P + i * P4D_LEVEL_MULT);
450 } 470 }
451 } else 471 } else
452 note_page(m, st, __pgprot(0), 2); 472 note_page(m, st, __pgprot(0), 0, 2);
453 473
454 start++; 474 start++;
455 } 475 }
456} 476}
457 477
458#else 478#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
459#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) 479#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
460#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
461#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
462#endif
463 480
464static inline bool is_hypervisor_range(int idx) 481static inline bool is_hypervisor_range(int idx)
465{ 482{
@@ -483,7 +500,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
483#else 500#else
484 pgd_t *start = swapper_pg_dir; 501 pgd_t *start = swapper_pg_dir;
485#endif 502#endif
486 pgprotval_t prot; 503 pgprotval_t prot, eff;
487 int i; 504 int i;
488 struct pg_state st = {}; 505 struct pg_state st = {};
489 506
@@ -499,15 +516,20 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
499 for (i = 0; i < PTRS_PER_PGD; i++) { 516 for (i = 0; i < PTRS_PER_PGD; i++) {
500 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 517 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
501 if (!pgd_none(*start) && !is_hypervisor_range(i)) { 518 if (!pgd_none(*start) && !is_hypervisor_range(i)) {
519 prot = pgd_flags(*start);
520#ifdef CONFIG_X86_PAE
521 eff = _PAGE_USER | _PAGE_RW;
522#else
523 eff = prot;
524#endif
502 if (pgd_large(*start) || !pgd_present(*start)) { 525 if (pgd_large(*start) || !pgd_present(*start)) {
503 prot = pgd_flags(*start); 526 note_page(m, &st, __pgprot(prot), eff, 1);
504 note_page(m, &st, __pgprot(prot), 1);
505 } else { 527 } else {
506 walk_p4d_level(m, &st, *start, 528 walk_p4d_level(m, &st, *start, eff,
507 i * PGD_LEVEL_MULT); 529 i * PGD_LEVEL_MULT);
508 } 530 }
509 } else 531 } else
510 note_page(m, &st, __pgprot(0), 1); 532 note_page(m, &st, __pgprot(0), 0, 1);
511 533
512 cond_resched(); 534 cond_resched();
513 start++; 535 start++;
@@ -515,7 +537,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
515 537
516 /* Flush out the last page */ 538 /* Flush out the last page */
517 st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); 539 st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
518 note_page(m, &st, __pgprot(0), 0); 540 note_page(m, &st, __pgprot(0), 0, 0);
519 if (!checkwx) 541 if (!checkwx)
520 return; 542 return;
521 if (st.wx_pages) 543 if (st.wx_pages)
@@ -570,6 +592,13 @@ static int __init pt_dump_init(void)
570 address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; 592 address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
571 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; 593 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
572 address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; 594 address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
595#ifdef CONFIG_MODIFY_LDT_SYSCALL
596 address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
597#endif
598#ifdef CONFIG_KASAN
599 address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
600 address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
601#endif
573#endif 602#endif
574#ifdef CONFIG_X86_32 603#ifdef CONFIG_X86_32
575 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; 604 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
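
effective_prot() encodes the x86 combination rule: USER and RW must be granted at every level of the walk, while NX at any level makes the whole translation non-executable; the W+X check is then applied to the combined value. A small stand-alone sketch using the standard bit positions:

#include <stdio.h>
#include <stdint.h>

#define _PAGE_RW	(1ULL << 1)
#define _PAGE_USER	(1ULL << 2)
#define _PAGE_NX	(1ULL << 63)

/* Same combination rule as the dumper: AND for USER/RW, OR for NX. */
static uint64_t effective_prot(uint64_t prot1, uint64_t prot2)
{
	return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
	       ((prot1 | prot2) & _PAGE_NX);
}

int main(void)
{
	uint64_t pgd = _PAGE_USER | _PAGE_RW;	/* permissive upper level */
	uint64_t pte = _PAGE_RW | _PAGE_NX;	/* kernel-only, no-exec leaf */
	uint64_t eff = effective_prot(pgd, pte);

	printf("USER=%d RW=%d NX=%d\n",
	       !!(eff & _PAGE_USER), !!(eff & _PAGE_RW), !!(eff & _PAGE_NX));
	/* A W+X check like the dumper's: writable and executable at once? */
	printf("W+X: %s\n",
	       ((eff & _PAGE_RW) && !(eff & _PAGE_NX)) ? "yes" : "no");
	return 0;
}
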
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f75ea0748b9f..73bd8c95ac71 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -417,11 +417,11 @@ void vmalloc_sync_all(void)
417 */ 417 */
418static noinline int vmalloc_fault(unsigned long address) 418static noinline int vmalloc_fault(unsigned long address)
419{ 419{
420 pgd_t *pgd, *pgd_ref; 420 pgd_t *pgd, *pgd_k;
421 p4d_t *p4d, *p4d_ref; 421 p4d_t *p4d, *p4d_k;
422 pud_t *pud, *pud_ref; 422 pud_t *pud;
423 pmd_t *pmd, *pmd_ref; 423 pmd_t *pmd;
424 pte_t *pte, *pte_ref; 424 pte_t *pte;
425 425
426 /* Make sure we are in vmalloc area: */ 426 /* Make sure we are in vmalloc area: */
427 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 427 if (!(address >= VMALLOC_START && address < VMALLOC_END))
@@ -435,73 +435,51 @@ static noinline int vmalloc_fault(unsigned long address)
435 * case just flush: 435 * case just flush:
436 */ 436 */
437 pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); 437 pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
438 pgd_ref = pgd_offset_k(address); 438 pgd_k = pgd_offset_k(address);
439 if (pgd_none(*pgd_ref)) 439 if (pgd_none(*pgd_k))
440 return -1; 440 return -1;
441 441
442 if (CONFIG_PGTABLE_LEVELS > 4) { 442 if (pgtable_l5_enabled) {
443 if (pgd_none(*pgd)) { 443 if (pgd_none(*pgd)) {
444 set_pgd(pgd, *pgd_ref); 444 set_pgd(pgd, *pgd_k);
445 arch_flush_lazy_mmu_mode(); 445 arch_flush_lazy_mmu_mode();
446 } else { 446 } else {
447 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 447 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
448 } 448 }
449 } 449 }
450 450
451 /* With 4-level paging, copying happens on the p4d level. */ 451 /* With 4-level paging, copying happens on the p4d level. */
452 p4d = p4d_offset(pgd, address); 452 p4d = p4d_offset(pgd, address);
453 p4d_ref = p4d_offset(pgd_ref, address); 453 p4d_k = p4d_offset(pgd_k, address);
454 if (p4d_none(*p4d_ref)) 454 if (p4d_none(*p4d_k))
455 return -1; 455 return -1;
456 456
457 if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) { 457 if (p4d_none(*p4d) && !pgtable_l5_enabled) {
458 set_p4d(p4d, *p4d_ref); 458 set_p4d(p4d, *p4d_k);
459 arch_flush_lazy_mmu_mode(); 459 arch_flush_lazy_mmu_mode();
460 } else { 460 } else {
461 BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); 461 BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
462 } 462 }
463 463
464 /*
465 * Below here mismatches are bugs because these lower tables
466 * are shared:
467 */
468 BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); 464 BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
469 465
470 pud = pud_offset(p4d, address); 466 pud = pud_offset(p4d, address);
471 pud_ref = pud_offset(p4d_ref, address); 467 if (pud_none(*pud))
472 if (pud_none(*pud_ref))
473 return -1; 468 return -1;
474 469
475 if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
476 BUG();
477
478 if (pud_large(*pud)) 470 if (pud_large(*pud))
479 return 0; 471 return 0;
480 472
481 pmd = pmd_offset(pud, address); 473 pmd = pmd_offset(pud, address);
482 pmd_ref = pmd_offset(pud_ref, address); 474 if (pmd_none(*pmd))
483 if (pmd_none(*pmd_ref))
484 return -1; 475 return -1;
485 476
486 if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
487 BUG();
488
489 if (pmd_large(*pmd)) 477 if (pmd_large(*pmd))
490 return 0; 478 return 0;
491 479
492 pte_ref = pte_offset_kernel(pmd_ref, address);
493 if (!pte_present(*pte_ref))
494 return -1;
495
496 pte = pte_offset_kernel(pmd, address); 480 pte = pte_offset_kernel(pmd, address);
497 481 if (!pte_present(*pte))
498 /* 482 return -1;
499 * Don't use pte_page here, because the mappings can point
500 * outside mem_map, and the NUMA hash lookup cannot handle
501 * that:
502 */
503 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
504 BUG();
505 483
506 return 0; 484 return 0;
507} 485}
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ab33a32df2a8..9aa22be8331e 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -120,7 +120,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
120 result = ident_p4d_init(info, p4d, addr, next); 120 result = ident_p4d_init(info, p4d, addr, next);
121 if (result) 121 if (result)
122 return result; 122 return result;
123 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 123 if (pgtable_l5_enabled) {
124 set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); 124 set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
125 } else { 125 } else {
126 /* 126 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index af11a2890235..45241de66785 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,12 +88,7 @@ static int __init nonx32_setup(char *str)
88} 88}
89__setup("noexec32=", nonx32_setup); 89__setup("noexec32=", nonx32_setup);
90 90
91/* 91static void sync_global_pgds_l5(unsigned long start, unsigned long end)
92 * When memory was added make sure all the processes MM have
93 * suitable PGD entries in the local PGD level page.
94 */
95#ifdef CONFIG_X86_5LEVEL
96void sync_global_pgds(unsigned long start, unsigned long end)
97{ 92{
98 unsigned long addr; 93 unsigned long addr;
99 94
@@ -129,8 +124,8 @@ void sync_global_pgds(unsigned long start, unsigned long end)
129 spin_unlock(&pgd_lock); 124 spin_unlock(&pgd_lock);
130 } 125 }
131} 126}
132#else 127
133void sync_global_pgds(unsigned long start, unsigned long end) 128static void sync_global_pgds_l4(unsigned long start, unsigned long end)
134{ 129{
135 unsigned long addr; 130 unsigned long addr;
136 131
@@ -143,7 +138,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
143 * With folded p4d, pgd_none() is always false, we need to 138 * With folded p4d, pgd_none() is always false, we need to
144 * handle synchronization on p4d level. 139 * handle synchronization on p4d level.
145 */ 140 */
146 BUILD_BUG_ON(pgd_none(*pgd_ref)); 141 MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
147 p4d_ref = p4d_offset(pgd_ref, addr); 142 p4d_ref = p4d_offset(pgd_ref, addr);
148 143
149 if (p4d_none(*p4d_ref)) 144 if (p4d_none(*p4d_ref))
@@ -173,7 +168,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
173 spin_unlock(&pgd_lock); 168 spin_unlock(&pgd_lock);
174 } 169 }
175} 170}
176#endif 171
172/*
173 * When memory is added, make sure all the process MMs have
174 * suitable PGD entries in the local PGD level page.
175 */
176void sync_global_pgds(unsigned long start, unsigned long end)
177{
178 if (pgtable_l5_enabled)
179 sync_global_pgds_l5(start, end);
180 else
181 sync_global_pgds_l4(start, end);
182}
177 183
178/* 184/*
179 * NOTE: This function is marked __ref because it calls __init function 185 * NOTE: This function is marked __ref because it calls __init function
@@ -632,7 +638,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
632 unsigned long vaddr = (unsigned long)__va(paddr); 638 unsigned long vaddr = (unsigned long)__va(paddr);
633 int i = p4d_index(vaddr); 639 int i = p4d_index(vaddr);
634 640
635 if (!IS_ENABLED(CONFIG_X86_5LEVEL)) 641 if (!pgtable_l5_enabled)
636 return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); 642 return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
637 643
638 for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { 644 for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
@@ -712,7 +718,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
712 page_size_mask); 718 page_size_mask);
713 719
714 spin_lock(&init_mm.page_table_lock); 720 spin_lock(&init_mm.page_table_lock);
715 if (IS_ENABLED(CONFIG_X86_5LEVEL)) 721 if (pgtable_l5_enabled)
716 pgd_populate(&init_mm, pgd, p4d); 722 pgd_populate(&init_mm, pgd, p4d);
717 else 723 else
718 p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); 724 p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
@@ -1089,7 +1095,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
1089 * 5-level case we should free them. This code will have to change 1095 * 5-level case we should free them. This code will have to change
1090 * to adapt for boot-time switching between 4 and 5 level page tables. 1096 * to adapt for boot-time switching between 4 and 5 level page tables.
1091 */ 1097 */
1092 if (CONFIG_PGTABLE_LEVELS == 5) 1098 if (pgtable_l5_enabled)
1093 free_pud_table(pud_base, p4d); 1099 free_pud_table(pud_base, p4d);
1094 } 1100 }
1095 1101
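
With boot-time switching the 4- versus 5-level choice becomes an ordinary runtime flag, so helpers like sync_global_pgds() turn into thin dispatchers instead of #ifdef'ed bodies. The shape of that change, as a sketch (pgtable_l5_enabled below is a stand-in for the kernel's variable, not the real definition):

#include <stdio.h>

/* Stand-in for the kernel's boot-time flag (set from CPUID.7:ECX LA57). */
static int pgtable_l5_enabled;

static void sync_l5(void) { printf("sync at pgd level (5-level)\n"); }
static void sync_l4(void) { printf("sync at p4d level (4-level)\n"); }

static void sync_global_pgds(void)
{
	if (pgtable_l5_enabled)
		sync_l5();
	else
		sync_l4();
}

int main(void)
{
	sync_global_pgds();		/* 4-level path */
	pgtable_l5_enabled = 1;
	sync_global_pgds();		/* 5-level path */
	return 0;
}

The same flag also turns PGDIR_SHIFT and PTRS_PER_P4D into variables (39/1 with four levels, 48/512 with five), which is what the compile-time BUILD_BUG_ON checks relaxed to MAYBE_BUILD_BUG_ON account for.
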
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index af6f2f9c6a26..d8ff013ea9d0 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,6 +1,12 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#define DISABLE_BRANCH_PROFILING 2#define DISABLE_BRANCH_PROFILING
3#define pr_fmt(fmt) "kasan: " fmt 3#define pr_fmt(fmt) "kasan: " fmt
4
5#ifdef CONFIG_X86_5LEVEL
6/* Too early to use cpu_feature_enabled() */
7#define pgtable_l5_enabled __pgtable_l5_enabled
8#endif
9
4#include <linux/bootmem.h> 10#include <linux/bootmem.h>
5#include <linux/kasan.h> 11#include <linux/kasan.h>
6#include <linux/kdebug.h> 12#include <linux/kdebug.h>
@@ -19,7 +25,7 @@
19 25
20extern struct range pfn_mapped[E820_MAX_ENTRIES]; 26extern struct range pfn_mapped[E820_MAX_ENTRIES];
21 27
22static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); 28static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
23 29
24static __init void *early_alloc(size_t size, int nid, bool panic) 30static __init void *early_alloc(size_t size, int nid, bool panic)
25{ 31{
@@ -176,10 +182,10 @@ static void __init clear_pgds(unsigned long start,
176 * With folded p4d, pgd_clear() is nop, use p4d_clear() 182 * With folded p4d, pgd_clear() is nop, use p4d_clear()
177 * instead. 183 * instead.
178 */ 184 */
179 if (CONFIG_PGTABLE_LEVELS < 5) 185 if (pgtable_l5_enabled)
180 p4d_clear(p4d_offset(pgd, start));
181 else
182 pgd_clear(pgd); 186 pgd_clear(pgd);
187 else
188 p4d_clear(p4d_offset(pgd, start));
183 } 189 }
184 190
185 pgd = pgd_offset_k(start); 191 pgd = pgd_offset_k(start);
@@ -191,7 +197,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
191{ 197{
192 unsigned long p4d; 198 unsigned long p4d;
193 199
194 if (!IS_ENABLED(CONFIG_X86_5LEVEL)) 200 if (!pgtable_l5_enabled)
195 return (p4d_t *)pgd; 201 return (p4d_t *)pgd;
196 202
197 p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; 203 p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
@@ -272,7 +278,7 @@ void __init kasan_early_init(void)
272 for (i = 0; i < PTRS_PER_PUD; i++) 278 for (i = 0; i < PTRS_PER_PUD; i++)
273 kasan_zero_pud[i] = __pud(pud_val); 279 kasan_zero_pud[i] = __pud(pud_val);
274 280
275 for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) 281 for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++)
276 kasan_zero_p4d[i] = __p4d(p4d_val); 282 kasan_zero_p4d[i] = __p4d(p4d_val);
277 283
278 kasan_map_early_shadow(early_top_pgt); 284 kasan_map_early_shadow(early_top_pgt);
@@ -303,7 +309,7 @@ void __init kasan_init(void)
303 * bunch of things like kernel code, modules, EFI mapping, etc. 309 * bunch of things like kernel code, modules, EFI mapping, etc.
304 * We need to take extra steps to not overwrite them. 310 * We need to take extra steps to not overwrite them.
305 */ 311 */
306 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 312 if (pgtable_l5_enabled) {
307 void *ptr; 313 void *ptr;
308 314
309 ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); 315 ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aedebd2ebf1e..615cc03ced84 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -34,23 +34,12 @@
34#define TB_SHIFT 40 34#define TB_SHIFT 40
35 35
36/* 36/*
37 * Virtual address start and end range for randomization.
38 *
39 * The end address could depend on more configuration options to make the 37 * The end address could depend on more configuration options to make the
40 * highest amount of space for randomization available, but that's too hard 38 * highest amount of space for randomization available, but that's too hard
41 * to keep straight and caused issues already. 39 * to keep straight and caused issues already.
42 */ 40 */
43static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
44static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; 41static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
45 42
46/* Default values */
47unsigned long page_offset_base = __PAGE_OFFSET_BASE;
48EXPORT_SYMBOL(page_offset_base);
49unsigned long vmalloc_base = __VMALLOC_BASE;
50EXPORT_SYMBOL(vmalloc_base);
51unsigned long vmemmap_base = __VMEMMAP_BASE;
52EXPORT_SYMBOL(vmemmap_base);
53
54/* 43/*
55 * Memory regions randomized by KASLR (except modules that use a separate logic 44 * Memory regions randomized by KASLR (except modules that use a separate logic
56 * earlier during boot). The list is ordered based on virtual addresses. This 45 * earlier during boot). The list is ordered based on virtual addresses. This
@@ -60,8 +49,8 @@ static __initdata struct kaslr_memory_region {
60 unsigned long *base; 49 unsigned long *base;
61 unsigned long size_tb; 50 unsigned long size_tb;
62} kaslr_regions[] = { 51} kaslr_regions[] = {
63 { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ }, 52 { &page_offset_base, 0 },
64 { &vmalloc_base, VMALLOC_SIZE_TB }, 53 { &vmalloc_base, 0 },
65 { &vmemmap_base, 1 }, 54 { &vmemmap_base, 1 },
66}; 55};
67 56
@@ -84,11 +73,14 @@ static inline bool kaslr_memory_enabled(void)
84void __init kernel_randomize_memory(void) 73void __init kernel_randomize_memory(void)
85{ 74{
86 size_t i; 75 size_t i;
87 unsigned long vaddr = vaddr_start; 76 unsigned long vaddr_start, vaddr;
88 unsigned long rand, memory_tb; 77 unsigned long rand, memory_tb;
89 struct rnd_state rand_state; 78 struct rnd_state rand_state;
90 unsigned long remain_entropy; 79 unsigned long remain_entropy;
91 80
81 vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
82 vaddr = vaddr_start;
83
92 /* 84 /*
93 * These BUILD_BUG_ON checks ensure the memory layout is consistent 85 * These BUILD_BUG_ON checks ensure the memory layout is consistent
94 * with the vaddr_start/vaddr_end variables. These checks are very 86 * with the vaddr_start/vaddr_end variables. These checks are very
@@ -101,6 +93,9 @@ void __init kernel_randomize_memory(void)
101 if (!kaslr_memory_enabled()) 93 if (!kaslr_memory_enabled())
102 return; 94 return;
103 95
96 kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
97 kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
98
104 /* 99 /*
105 * Update Physical memory mapping to available and 100 * Update Physical memory mapping to available and
106 * add padding if needed (especially for memory hotplug support). 101 * add padding if needed (especially for memory hotplug support).
@@ -129,7 +124,7 @@ void __init kernel_randomize_memory(void)
129 */ 124 */
130 entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); 125 entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
131 prandom_bytes_state(&rand_state, &rand, sizeof(rand)); 126 prandom_bytes_state(&rand_state, &rand, sizeof(rand));
132 if (IS_ENABLED(CONFIG_X86_5LEVEL)) 127 if (pgtable_l5_enabled)
133 entropy = (rand % (entropy + 1)) & P4D_MASK; 128 entropy = (rand % (entropy + 1)) & P4D_MASK;
134 else 129 else
135 entropy = (rand % (entropy + 1)) & PUD_MASK; 130 entropy = (rand % (entropy + 1)) & PUD_MASK;
@@ -141,7 +136,7 @@ void __init kernel_randomize_memory(void)
141 * randomization alignment. 136 * randomization alignment.
142 */ 137 */
143 vaddr += get_padding(&kaslr_regions[i]); 138 vaddr += get_padding(&kaslr_regions[i]);
144 if (IS_ENABLED(CONFIG_X86_5LEVEL)) 139 if (pgtable_l5_enabled)
145 vaddr = round_up(vaddr + 1, P4D_SIZE); 140 vaddr = round_up(vaddr + 1, P4D_SIZE);
146 else 141 else
147 vaddr = round_up(vaddr + 1, PUD_SIZE); 142 vaddr = round_up(vaddr + 1, PUD_SIZE);
@@ -217,7 +212,7 @@ void __meminit init_trampoline(void)
217 return; 212 return;
218 } 213 }
219 214
220 if (IS_ENABLED(CONFIG_X86_5LEVEL)) 215 if (pgtable_l5_enabled)
221 init_trampoline_p4d(); 216 init_trampoline_p4d();
222 else 217 else
223 init_trampoline_pud(); 218 init_trampoline_pud();
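
kernel_randomize_memory() divides the leftover virtual space evenly among the regions still to be placed and rounds each randomized offset to the unit the active paging mode can shift (P4D with 5-level, PUD with 4-level). A sketch of just that loop, with made-up region sizes and a fixed stand-in for the random bytes:

#include <stdio.h>
#include <stdint.h>

#define P4D_SHIFT	39
#define PUD_SHIFT	30
#define TB		(1ULL << 40)

struct region { const char *name; uint64_t size_tb; };

int main(void)
{
	/* Illustrative sizes; the real ones depend on RAM and paging mode. */
	struct region regions[] = {
		{ "physical mapping", 10 },
		{ "vmalloc",          32 },
		{ "vmemmap",           1 },
	};
	int l5 = 0;				/* pretend 4-level paging */
	uint64_t align = 1ULL << (l5 ? P4D_SHIFT : PUD_SHIFT);
	uint64_t vaddr = 0xffff880000000000ULL;	/* illustrative start */
	uint64_t vaddr_end = 0xfffffe0000000000ULL;
	uint64_t remain = vaddr_end - vaddr;
	uint64_t rand = 0x123456789abcdefULL;	/* stand-in for prandom bytes */

	for (size_t i = 0; i < 3; i++)
		remain -= regions[i].size_tb * TB;

	for (size_t i = 0; i < 3; i++) {
		/* Spread what is left evenly over the regions not yet placed. */
		uint64_t entropy = remain / (3 - i);

		entropy = (rand % (entropy + 1)) & ~(align - 1);
		vaddr += entropy;
		printf("%-16s base %#llx\n", regions[i].name,
		       (unsigned long long)vaddr);

		vaddr += regions[i].size_tb * TB;
		vaddr = (vaddr + align) & ~(align - 1);	/* round_up(vaddr + 1, align) */
		remain -= entropy;
	}
	return 0;
}
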
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 1a53071e2e17..3a1b5fe4c2ca 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -25,17 +25,12 @@
25#include <asm/bootparam.h> 25#include <asm/bootparam.h>
26#include <asm/set_memory.h> 26#include <asm/set_memory.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/sections.h>
29#include <asm/processor-flags.h> 28#include <asm/processor-flags.h>
30#include <asm/msr.h> 29#include <asm/msr.h>
31#include <asm/cmdline.h> 30#include <asm/cmdline.h>
32 31
33#include "mm_internal.h" 32#include "mm_internal.h"
34 33
35static char sme_cmdline_arg[] __initdata = "mem_encrypt";
36static char sme_cmdline_on[] __initdata = "on";
37static char sme_cmdline_off[] __initdata = "off";
38
39/* 34/*
40 * Since SME related variables are set early in the boot process they must 35 * Since SME related variables are set early in the boot process they must
41 * reside in the .data section so as not to be zeroed out when the .bss 36 * reside in the .data section so as not to be zeroed out when the .bss
@@ -46,7 +41,7 @@ EXPORT_SYMBOL(sme_me_mask);
46DEFINE_STATIC_KEY_FALSE(sev_enable_key); 41DEFINE_STATIC_KEY_FALSE(sev_enable_key);
47EXPORT_SYMBOL_GPL(sev_enable_key); 42EXPORT_SYMBOL_GPL(sev_enable_key);
48 43
49static bool sev_enabled __section(.data); 44bool sev_enabled __section(.data);
50 45
51/* Buffer used for early in-place encryption by BSP, no locking needed */ 46/* Buffer used for early in-place encryption by BSP, no locking needed */
52static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); 47static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
@@ -463,574 +458,3 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
463 /* Make the SWIOTLB buffer area decrypted */ 458 /* Make the SWIOTLB buffer area decrypted */
464 set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); 459 set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
465} 460}
466
467struct sme_populate_pgd_data {
468 void *pgtable_area;
469 pgd_t *pgd;
470
471 pmdval_t pmd_flags;
472 pteval_t pte_flags;
473 unsigned long paddr;
474
475 unsigned long vaddr;
476 unsigned long vaddr_end;
477};
478
479static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
480{
481 unsigned long pgd_start, pgd_end, pgd_size;
482 pgd_t *pgd_p;
483
484 pgd_start = ppd->vaddr & PGDIR_MASK;
485 pgd_end = ppd->vaddr_end & PGDIR_MASK;
486
487 pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
488
489 pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
490
491 memset(pgd_p, 0, pgd_size);
492}
493
494#define PGD_FLAGS _KERNPG_TABLE_NOENC
495#define P4D_FLAGS _KERNPG_TABLE_NOENC
496#define PUD_FLAGS _KERNPG_TABLE_NOENC
497#define PMD_FLAGS _KERNPG_TABLE_NOENC
498
499#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
500
501#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
502#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
503 (_PAGE_PAT | _PAGE_PWT))
504
505#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
506
507#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
508
509#define PTE_FLAGS_DEC PTE_FLAGS
510#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
511 (_PAGE_PAT | _PAGE_PWT))
512
513#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
514
515static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
516{
517 pgd_t *pgd_p;
518 p4d_t *p4d_p;
519 pud_t *pud_p;
520 pmd_t *pmd_p;
521
522 pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
523 if (native_pgd_val(*pgd_p)) {
524 if (IS_ENABLED(CONFIG_X86_5LEVEL))
525 p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
526 else
527 pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
528 } else {
529 pgd_t pgd;
530
531 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
532 p4d_p = ppd->pgtable_area;
533 memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
534 ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
535
536 pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
537 } else {
538 pud_p = ppd->pgtable_area;
539 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
540 ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
541
542 pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
543 }
544 native_set_pgd(pgd_p, pgd);
545 }
546
547 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
548 p4d_p += p4d_index(ppd->vaddr);
549 if (native_p4d_val(*p4d_p)) {
550 pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
551 } else {
552 p4d_t p4d;
553
554 pud_p = ppd->pgtable_area;
555 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
556 ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
557
558 p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
559 native_set_p4d(p4d_p, p4d);
560 }
561 }
562
563 pud_p += pud_index(ppd->vaddr);
564 if (native_pud_val(*pud_p)) {
565 if (native_pud_val(*pud_p) & _PAGE_PSE)
566 return NULL;
567
568 pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
569 } else {
570 pud_t pud;
571
572 pmd_p = ppd->pgtable_area;
573 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
574 ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
575
576 pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
577 native_set_pud(pud_p, pud);
578 }
579
580 return pmd_p;
581}
582
583static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
584{
585 pmd_t *pmd_p;
586
587 pmd_p = sme_prepare_pgd(ppd);
588 if (!pmd_p)
589 return;
590
591 pmd_p += pmd_index(ppd->vaddr);
592 if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
593 native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags));
594}
595
596static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
597{
598 pmd_t *pmd_p;
599 pte_t *pte_p;
600
601 pmd_p = sme_prepare_pgd(ppd);
602 if (!pmd_p)
603 return;
604
605 pmd_p += pmd_index(ppd->vaddr);
606 if (native_pmd_val(*pmd_p)) {
607 if (native_pmd_val(*pmd_p) & _PAGE_PSE)
608 return;
609
610 pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK);
611 } else {
612 pmd_t pmd;
613
614 pte_p = ppd->pgtable_area;
615 memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE);
616 ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE;
617
618 pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS);
619 native_set_pmd(pmd_p, pmd);
620 }
621
622 pte_p += pte_index(ppd->vaddr);
623 if (!native_pte_val(*pte_p))
624 native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags));
625}
626
627static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
628{
629 while (ppd->vaddr < ppd->vaddr_end) {
630 sme_populate_pgd_large(ppd);
631
632 ppd->vaddr += PMD_PAGE_SIZE;
633 ppd->paddr += PMD_PAGE_SIZE;
634 }
635}
636
637static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
638{
639 while (ppd->vaddr < ppd->vaddr_end) {
640 sme_populate_pgd(ppd);
641
642 ppd->vaddr += PAGE_SIZE;
643 ppd->paddr += PAGE_SIZE;
644 }
645}
646
647static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
648 pmdval_t pmd_flags, pteval_t pte_flags)
649{
650 unsigned long vaddr_end;
651
652 ppd->pmd_flags = pmd_flags;
653 ppd->pte_flags = pte_flags;
654
655 /* Save original end value since we modify the struct value */
656 vaddr_end = ppd->vaddr_end;
657
658 /* If start is not 2MB aligned, create PTE entries */
659 ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
660 __sme_map_range_pte(ppd);
661
662 /* Create PMD entries */
663 ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
664 __sme_map_range_pmd(ppd);
665
666 /* If end is not 2MB aligned, create PTE entries */
667 ppd->vaddr_end = vaddr_end;
668 __sme_map_range_pte(ppd);
669}
670
671static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
672{
673 __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
674}
675
676static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
677{
678 __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
679}
680
681static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
682{
683 __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
684}
685
686static unsigned long __init sme_pgtable_calc(unsigned long len)
687{
688 unsigned long p4d_size, pud_size, pmd_size, pte_size;
689 unsigned long total;
690
691 /*
692 * Perform a relatively simplistic calculation of the pagetable
693 * entries that are needed. Those mappings will be covered mostly
694 * by 2MB PMD entries so we can conservatively calculate the required
695 * number of P4D, PUD and PMD structures needed to perform the
696 * mappings. For mappings that are not 2MB aligned, PTE mappings
697 * would be needed for the start and end portion of the address range
698 * that fall outside of the 2MB alignment. This results in, at most,
699 * two extra pages to hold PTE entries for each range that is mapped.
700 * Incrementing the count for each covers the case where the addresses
701 * cross entries.
702 */
703 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
704 p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
705 p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
706 pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
707 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
708 } else {
709 p4d_size = 0;
710 pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
711 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
712 }
713 pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
714 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
715 pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE;
716
717 total = p4d_size + pud_size + pmd_size + pte_size;
718
719 /*
720 * Now calculate the added pagetable structures needed to populate
721 * the new pagetables.
722 */
723 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
724 p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
725 p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
726 pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
727 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
728 } else {
729 p4d_size = 0;
730 pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
731 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
732 }
733 pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
734 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
735
736 total += p4d_size + pud_size + pmd_size;
737
738 return total;
739}
740
741void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp)
742{
743 unsigned long workarea_start, workarea_end, workarea_len;
744 unsigned long execute_start, execute_end, execute_len;
745 unsigned long kernel_start, kernel_end, kernel_len;
746 unsigned long initrd_start, initrd_end, initrd_len;
747 struct sme_populate_pgd_data ppd;
748 unsigned long pgtable_area_len;
749 unsigned long decrypted_base;
750
751 if (!sme_active())
752 return;
753
754 /*
755 * Prepare for encrypting the kernel and initrd by building new
756 * pagetables with the necessary attributes needed to encrypt the
757 * kernel in place.
758 *
759 * One range of virtual addresses will map the memory occupied
760 * by the kernel and initrd as encrypted.
761 *
762 * Another range of virtual addresses will map the memory occupied
763 * by the kernel and initrd as decrypted and write-protected.
764 *
765 * The use of write-protect attribute will prevent any of the
766 * memory from being cached.
767 */
768
769 /* Physical addresses gives us the identity mapped virtual addresses */
770 kernel_start = __pa_symbol(_text);
771 kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
772 kernel_len = kernel_end - kernel_start;
773
774 initrd_start = 0;
775 initrd_end = 0;
776 initrd_len = 0;
777#ifdef CONFIG_BLK_DEV_INITRD
778 initrd_len = (unsigned long)bp->hdr.ramdisk_size |
779 ((unsigned long)bp->ext_ramdisk_size << 32);
780 if (initrd_len) {
781 initrd_start = (unsigned long)bp->hdr.ramdisk_image |
782 ((unsigned long)bp->ext_ramdisk_image << 32);
783 initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
784 initrd_len = initrd_end - initrd_start;
785 }
786#endif
787
788 /* Set the encryption workarea to be immediately after the kernel */
789 workarea_start = kernel_end;
790
791 /*
792 * Calculate required number of workarea bytes needed:
793 * executable encryption area size:
794 * stack page (PAGE_SIZE)
795 * encryption routine page (PAGE_SIZE)
796 * intermediate copy buffer (PMD_PAGE_SIZE)
797 * pagetable structures for the encryption of the kernel
798 * pagetable structures for workarea (in case not currently mapped)
799 */
800 execute_start = workarea_start;
801 execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
802 execute_len = execute_end - execute_start;
803
804 /*
805 * One PGD for both encrypted and decrypted mappings and a set of
806 * PUDs and PMDs for each of the encrypted and decrypted mappings.
807 */
808 pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
809 pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
810 if (initrd_len)
811 pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
812
813 /* PUDs and PMDs needed in the current pagetables for the workarea */
814 pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
815
816 /*
817 * The total workarea includes the executable encryption area and
818 * the pagetable area. The start of the workarea is already 2MB
819 * aligned, align the end of the workarea on a 2MB boundary so that
820 * we don't try to create/allocate PTE entries from the workarea
821 * before it is mapped.
822 */
823 workarea_len = execute_len + pgtable_area_len;
824 workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
825
826 /*
827 * Set the address to the start of where newly created pagetable
828 * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
829 * structures are created when the workarea is added to the current
830 * pagetables and when the new encrypted and decrypted kernel
831 * mappings are populated.
832 */
833 ppd.pgtable_area = (void *)execute_end;
834
835 /*
836 * Make sure the current pagetable structure has entries for
837 * addressing the workarea.
838 */
839 ppd.pgd = (pgd_t *)native_read_cr3_pa();
840 ppd.paddr = workarea_start;
841 ppd.vaddr = workarea_start;
842 ppd.vaddr_end = workarea_end;
843 sme_map_range_decrypted(&ppd);
844
845 /* Flush the TLB - no globals so cr3 is enough */
846 native_write_cr3(__native_read_cr3());
847
848 /*
849 * A new pagetable structure is being built to allow for the kernel
850 * and initrd to be encrypted. It starts with an empty PGD that will
851 * then be populated with new PUDs and PMDs as the encrypted and
852 * decrypted kernel mappings are created.
853 */
854 ppd.pgd = ppd.pgtable_area;
855 memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
856 ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
857
858 /*
859 * A different PGD index/entry must be used to get different
860 * pagetable entries for the decrypted mapping. Choose the next
861 * PGD index and convert it to a virtual address to be used as
862 * the base of the mapping.
863 */
864 decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
865 if (initrd_len) {
866 unsigned long check_base;
867
868 check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
869 decrypted_base = max(decrypted_base, check_base);
870 }
871 decrypted_base <<= PGDIR_SHIFT;
872
873 /* Add encrypted kernel (identity) mappings */
874 ppd.paddr = kernel_start;
875 ppd.vaddr = kernel_start;
876 ppd.vaddr_end = kernel_end;
877 sme_map_range_encrypted(&ppd);
878
879 /* Add decrypted, write-protected kernel (non-identity) mappings */
880 ppd.paddr = kernel_start;
881 ppd.vaddr = kernel_start + decrypted_base;
882 ppd.vaddr_end = kernel_end + decrypted_base;
883 sme_map_range_decrypted_wp(&ppd);
884
885 if (initrd_len) {
886 /* Add encrypted initrd (identity) mappings */
887 ppd.paddr = initrd_start;
888 ppd.vaddr = initrd_start;
889 ppd.vaddr_end = initrd_end;
890 sme_map_range_encrypted(&ppd);
891 /*
892 * Add decrypted, write-protected initrd (non-identity) mappings
893 */
894 ppd.paddr = initrd_start;
895 ppd.vaddr = initrd_start + decrypted_base;
896 ppd.vaddr_end = initrd_end + decrypted_base;
897 sme_map_range_decrypted_wp(&ppd);
898 }
899
900 /* Add decrypted workarea mappings to both kernel mappings */
901 ppd.paddr = workarea_start;
902 ppd.vaddr = workarea_start;
903 ppd.vaddr_end = workarea_end;
904 sme_map_range_decrypted(&ppd);
905
906 ppd.paddr = workarea_start;
907 ppd.vaddr = workarea_start + decrypted_base;
908 ppd.vaddr_end = workarea_end + decrypted_base;
909 sme_map_range_decrypted(&ppd);
910
911 /* Perform the encryption */
912 sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
913 kernel_len, workarea_start, (unsigned long)ppd.pgd);
914
915 if (initrd_len)
916 sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
917 initrd_len, workarea_start,
918 (unsigned long)ppd.pgd);
919
920 /*
921 * At this point we are running encrypted. Remove the mappings for
922 * the decrypted areas - all that is needed for this is to remove
923 * the PGD entry/entries.
924 */
925 ppd.vaddr = kernel_start + decrypted_base;
926 ppd.vaddr_end = kernel_end + decrypted_base;
927 sme_clear_pgd(&ppd);
928
929 if (initrd_len) {
930 ppd.vaddr = initrd_start + decrypted_base;
931 ppd.vaddr_end = initrd_end + decrypted_base;
932 sme_clear_pgd(&ppd);
933 }
934
935 ppd.vaddr = workarea_start + decrypted_base;
936 ppd.vaddr_end = workarea_end + decrypted_base;
937 sme_clear_pgd(&ppd);
938
939 /* Flush the TLB - no globals so cr3 is enough */
940 native_write_cr3(__native_read_cr3());
941}
942
943void __init __nostackprotector sme_enable(struct boot_params *bp)
944{
945 const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
946 unsigned int eax, ebx, ecx, edx;
947 unsigned long feature_mask;
948 bool active_by_default;
949 unsigned long me_mask;
950 char buffer[16];
951 u64 msr;
952
953 /* Check for the SME/SEV support leaf */
954 eax = 0x80000000;
955 ecx = 0;
956 native_cpuid(&eax, &ebx, &ecx, &edx);
957 if (eax < 0x8000001f)
958 return;
959
960#define AMD_SME_BIT BIT(0)
961#define AMD_SEV_BIT BIT(1)
962 /*
963 * Set the feature mask (SME or SEV) based on whether we are
964 * running under a hypervisor.
965 */
966 eax = 1;
967 ecx = 0;
968 native_cpuid(&eax, &ebx, &ecx, &edx);
969 feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
970
971 /*
972 * Check for the SME/SEV feature:
973 * CPUID Fn8000_001F[EAX]
974 * - Bit 0 - Secure Memory Encryption support
975 * - Bit 1 - Secure Encrypted Virtualization support
976 * CPUID Fn8000_001F[EBX]
977 * - Bits 5:0 - Pagetable bit position used to indicate encryption
978 */
979 eax = 0x8000001f;
980 ecx = 0;
981 native_cpuid(&eax, &ebx, &ecx, &edx);
982 if (!(eax & feature_mask))
983 return;
984
985 me_mask = 1UL << (ebx & 0x3f);
986
987 /* Check if memory encryption is enabled */
988 if (feature_mask == AMD_SME_BIT) {
989 /* For SME, check the SYSCFG MSR */
990 msr = __rdmsr(MSR_K8_SYSCFG);
991 if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
992 return;
993 } else {
994 /* For SEV, check the SEV MSR */
995 msr = __rdmsr(MSR_AMD64_SEV);
996 if (!(msr & MSR_AMD64_SEV_ENABLED))
997 return;
998
999 /* SEV state cannot be controlled by a command line option */
1000 sme_me_mask = me_mask;
1001 sev_enabled = true;
1002 return;
1003 }
1004
1005 /*
1006 * Fixups have not been applied to phys_base yet and we're running
1007 * identity mapped, so we must obtain the address to the SME command
1008 * line argument data using rip-relative addressing.
1009 */
1010 asm ("lea sme_cmdline_arg(%%rip), %0"
1011 : "=r" (cmdline_arg)
1012 : "p" (sme_cmdline_arg));
1013 asm ("lea sme_cmdline_on(%%rip), %0"
1014 : "=r" (cmdline_on)
1015 : "p" (sme_cmdline_on));
1016 asm ("lea sme_cmdline_off(%%rip), %0"
1017 : "=r" (cmdline_off)
1018 : "p" (sme_cmdline_off));
1019
1020 if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
1021 active_by_default = true;
1022 else
1023 active_by_default = false;
1024
1025 cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
1026 ((u64)bp->ext_cmd_line_ptr << 32));
1027
1028 cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
1029
1030 if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
1031 sme_me_mask = me_mask;
1032 else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
1033 sme_me_mask = 0;
1034 else
1035 sme_me_mask = active_by_default ? me_mask : 0;
1036}
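The block removed from mem_encrypt.c above, the sme_populate_pgd helpers, sme_pgtable_calc(), sme_encrypt_kernel() and sme_enable(), reappears in the new arch/x86/mm/mem_encrypt_identity.c further down, where it can be built as a separate unit that runs from the early identity mapping. The actual byte rewriting those routines prepare for is done by sme_encrypt_execute() (assembly, not part of this diff), which walks the kernel in 2MB chunks, reading each chunk through the decrypted alias and writing it back through the encrypted mapping via an intermediate buffer. An illustrative, standalone C rendering of that idea, assuming len is a multiple of 2MB:

/* Sketch only: plain pointers stand in for the two mappings of the same
 * physical memory that sme_encrypt_kernel() builds.
 */
#include <stddef.h>
#include <string.h>

#define CHUNK (2UL * 1024 * 1024)	/* stands in for PMD_PAGE_SIZE */

static void encrypt_in_place(unsigned char *encrypted_map,
			     const unsigned char *decrypted_map,
			     unsigned char *bounce, size_t len)
{
	size_t off;

	for (off = 0; off < len; off += CHUNK) {
		/* read the still-clear bytes through the decrypted alias... */
		memcpy(bounce, decrypted_map + off, CHUNK);
		/* ...and store them back through the encrypted mapping */
		memcpy(encrypted_map + off, bounce, CHUNK);
	}
}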
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
new file mode 100644
index 000000000000..1b2197d13832
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -0,0 +1,564 @@
1/*
2 * AMD Memory Encryption Support
3 *
4 * Copyright (C) 2016 Advanced Micro Devices, Inc.
5 *
6 * Author: Tom Lendacky <thomas.lendacky@amd.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#define DISABLE_BRANCH_PROFILING
14
15/*
16 * Since we're dealing with identity mappings, physical and virtual
17 * addresses are the same, so override these defines which are ultimately
18 * used by the headers in misc.h.
19 */
20#define __pa(x) ((unsigned long)(x))
21#define __va(x) ((void *)((unsigned long)(x)))
22
23/*
24 * Special hack: we have to be careful, because no indirections are
25 * allowed here, and paravirt_ops is a kind of one. As it will only run in
26 * baremetal anyway, we just keep it from happening. (This list needs to
27 * be extended when new paravirt and debugging variants are added.)
28 */
29#undef CONFIG_PARAVIRT
30#undef CONFIG_PARAVIRT_SPINLOCKS
31
32#include <linux/kernel.h>
33#include <linux/mm.h>
34#include <linux/mem_encrypt.h>
35
36#include <asm/setup.h>
37#include <asm/sections.h>
38#include <asm/cmdline.h>
39
40#include "mm_internal.h"
41
42#define PGD_FLAGS _KERNPG_TABLE_NOENC
43#define P4D_FLAGS _KERNPG_TABLE_NOENC
44#define PUD_FLAGS _KERNPG_TABLE_NOENC
45#define PMD_FLAGS _KERNPG_TABLE_NOENC
46
47#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
48
49#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
50#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
51 (_PAGE_PAT | _PAGE_PWT))
52
53#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
54
55#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
56
57#define PTE_FLAGS_DEC PTE_FLAGS
58#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
59 (_PAGE_PAT | _PAGE_PWT))
60
61#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
62
63struct sme_populate_pgd_data {
64 void *pgtable_area;
65 pgd_t *pgd;
66
67 pmdval_t pmd_flags;
68 pteval_t pte_flags;
69 unsigned long paddr;
70
71 unsigned long vaddr;
72 unsigned long vaddr_end;
73};
74
75static char sme_cmdline_arg[] __initdata = "mem_encrypt";
76static char sme_cmdline_on[] __initdata = "on";
77static char sme_cmdline_off[] __initdata = "off";
78
79static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
80{
81 unsigned long pgd_start, pgd_end, pgd_size;
82 pgd_t *pgd_p;
83
84 pgd_start = ppd->vaddr & PGDIR_MASK;
85 pgd_end = ppd->vaddr_end & PGDIR_MASK;
86
87 pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
88
89 pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
90
91 memset(pgd_p, 0, pgd_size);
92}
93
94static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
95{
96 pgd_t *pgd;
97 p4d_t *p4d;
98 pud_t *pud;
99 pmd_t *pmd;
100
101 pgd = ppd->pgd + pgd_index(ppd->vaddr);
102 if (pgd_none(*pgd)) {
103 p4d = ppd->pgtable_area;
104 memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
105 ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
106 set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
107 }
108
109 p4d = p4d_offset(pgd, ppd->vaddr);
110 if (p4d_none(*p4d)) {
111 pud = ppd->pgtable_area;
112 memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
113 ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
114 set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
115 }
116
117 pud = pud_offset(p4d, ppd->vaddr);
118 if (pud_none(*pud)) {
119 pmd = ppd->pgtable_area;
120 memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
121 ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
122 set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
123 }
124
125 if (pud_large(*pud))
126 return NULL;
127
128 return pud;
129}
130
131static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
132{
133 pud_t *pud;
134 pmd_t *pmd;
135
136 pud = sme_prepare_pgd(ppd);
137 if (!pud)
138 return;
139
140 pmd = pmd_offset(pud, ppd->vaddr);
141 if (pmd_large(*pmd))
142 return;
143
144 set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
145}
146
147static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
148{
149 pud_t *pud;
150 pmd_t *pmd;
151 pte_t *pte;
152
153 pud = sme_prepare_pgd(ppd);
154 if (!pud)
155 return;
156
157 pmd = pmd_offset(pud, ppd->vaddr);
158 if (pmd_none(*pmd)) {
159 pte = ppd->pgtable_area;
 160 memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE);
 161 ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE;
162 set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
163 }
164
165 if (pmd_large(*pmd))
166 return;
167
168 pte = pte_offset_map(pmd, ppd->vaddr);
169 if (pte_none(*pte))
170 set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
171}
172
173static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
174{
175 while (ppd->vaddr < ppd->vaddr_end) {
176 sme_populate_pgd_large(ppd);
177
178 ppd->vaddr += PMD_PAGE_SIZE;
179 ppd->paddr += PMD_PAGE_SIZE;
180 }
181}
182
183static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
184{
185 while (ppd->vaddr < ppd->vaddr_end) {
186 sme_populate_pgd(ppd);
187
188 ppd->vaddr += PAGE_SIZE;
189 ppd->paddr += PAGE_SIZE;
190 }
191}
192
193static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
194 pmdval_t pmd_flags, pteval_t pte_flags)
195{
196 unsigned long vaddr_end;
197
198 ppd->pmd_flags = pmd_flags;
199 ppd->pte_flags = pte_flags;
200
201 /* Save original end value since we modify the struct value */
202 vaddr_end = ppd->vaddr_end;
203
204 /* If start is not 2MB aligned, create PTE entries */
205 ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
206 __sme_map_range_pte(ppd);
207
208 /* Create PMD entries */
209 ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
210 __sme_map_range_pmd(ppd);
211
212 /* If end is not 2MB aligned, create PTE entries */
213 ppd->vaddr_end = vaddr_end;
214 __sme_map_range_pte(ppd);
215}
216
217static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
218{
219 __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
220}
221
222static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
223{
224 __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
225}
226
227static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
228{
229 __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
230}
231
232static unsigned long __init sme_pgtable_calc(unsigned long len)
233{
234 unsigned long entries = 0, tables = 0;
235
236 /*
237 * Perform a relatively simplistic calculation of the pagetable
238 * entries that are needed. Those mappings will be covered mostly
239 * by 2MB PMD entries so we can conservatively calculate the required
240 * number of P4D, PUD and PMD structures needed to perform the
241 * mappings. For mappings that are not 2MB aligned, PTE mappings
242 * would be needed for the start and end portion of the address range
243 * that fall outside of the 2MB alignment. This results in, at most,
244 * two extra pages to hold PTE entries for each range that is mapped.
245 * Incrementing the count for each covers the case where the addresses
246 * cross entries.
247 */
248
 249 /* PGDIR_SIZE is equal to P4D_SIZE on a 4-level machine. */
250 if (PTRS_PER_P4D > 1)
251 entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D;
252 entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD;
253 entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD;
254 entries += 2 * sizeof(pte_t) * PTRS_PER_PTE;
255
256 /*
257 * Now calculate the added pagetable structures needed to populate
258 * the new pagetables.
259 */
260
261 if (PTRS_PER_P4D > 1)
262 tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D;
263 tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD;
264 tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD;
265
266 return entries + tables;
267}
268
269void __init sme_encrypt_kernel(struct boot_params *bp)
270{
271 unsigned long workarea_start, workarea_end, workarea_len;
272 unsigned long execute_start, execute_end, execute_len;
273 unsigned long kernel_start, kernel_end, kernel_len;
274 unsigned long initrd_start, initrd_end, initrd_len;
275 struct sme_populate_pgd_data ppd;
276 unsigned long pgtable_area_len;
277 unsigned long decrypted_base;
278
279 if (!sme_active())
280 return;
281
282 /*
283 * Prepare for encrypting the kernel and initrd by building new
284 * pagetables with the necessary attributes needed to encrypt the
285 * kernel in place.
286 *
287 * One range of virtual addresses will map the memory occupied
288 * by the kernel and initrd as encrypted.
289 *
290 * Another range of virtual addresses will map the memory occupied
291 * by the kernel and initrd as decrypted and write-protected.
292 *
293 * The use of write-protect attribute will prevent any of the
294 * memory from being cached.
295 */
296
 297 /* Physical addresses give us the identity mapped virtual addresses */
298 kernel_start = __pa_symbol(_text);
299 kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
300 kernel_len = kernel_end - kernel_start;
301
302 initrd_start = 0;
303 initrd_end = 0;
304 initrd_len = 0;
305#ifdef CONFIG_BLK_DEV_INITRD
306 initrd_len = (unsigned long)bp->hdr.ramdisk_size |
307 ((unsigned long)bp->ext_ramdisk_size << 32);
308 if (initrd_len) {
309 initrd_start = (unsigned long)bp->hdr.ramdisk_image |
310 ((unsigned long)bp->ext_ramdisk_image << 32);
311 initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
312 initrd_len = initrd_end - initrd_start;
313 }
314#endif
315
316 /* Set the encryption workarea to be immediately after the kernel */
317 workarea_start = kernel_end;
318
319 /*
320 * Calculate required number of workarea bytes needed:
321 * executable encryption area size:
322 * stack page (PAGE_SIZE)
323 * encryption routine page (PAGE_SIZE)
324 * intermediate copy buffer (PMD_PAGE_SIZE)
325 * pagetable structures for the encryption of the kernel
326 * pagetable structures for workarea (in case not currently mapped)
327 */
328 execute_start = workarea_start;
329 execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
330 execute_len = execute_end - execute_start;
331
332 /*
333 * One PGD for both encrypted and decrypted mappings and a set of
334 * PUDs and PMDs for each of the encrypted and decrypted mappings.
335 */
336 pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
337 pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
338 if (initrd_len)
339 pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
340
341 /* PUDs and PMDs needed in the current pagetables for the workarea */
342 pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
343
344 /*
345 * The total workarea includes the executable encryption area and
346 * the pagetable area. The start of the workarea is already 2MB
347 * aligned, align the end of the workarea on a 2MB boundary so that
348 * we don't try to create/allocate PTE entries from the workarea
349 * before it is mapped.
350 */
351 workarea_len = execute_len + pgtable_area_len;
352 workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
353
354 /*
355 * Set the address to the start of where newly created pagetable
356 * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
357 * structures are created when the workarea is added to the current
358 * pagetables and when the new encrypted and decrypted kernel
359 * mappings are populated.
360 */
361 ppd.pgtable_area = (void *)execute_end;
362
363 /*
364 * Make sure the current pagetable structure has entries for
365 * addressing the workarea.
366 */
367 ppd.pgd = (pgd_t *)native_read_cr3_pa();
368 ppd.paddr = workarea_start;
369 ppd.vaddr = workarea_start;
370 ppd.vaddr_end = workarea_end;
371 sme_map_range_decrypted(&ppd);
372
373 /* Flush the TLB - no globals so cr3 is enough */
374 native_write_cr3(__native_read_cr3());
375
376 /*
377 * A new pagetable structure is being built to allow for the kernel
378 * and initrd to be encrypted. It starts with an empty PGD that will
379 * then be populated with new PUDs and PMDs as the encrypted and
380 * decrypted kernel mappings are created.
381 */
382 ppd.pgd = ppd.pgtable_area;
383 memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
384 ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
385
386 /*
387 * A different PGD index/entry must be used to get different
388 * pagetable entries for the decrypted mapping. Choose the next
389 * PGD index and convert it to a virtual address to be used as
390 * the base of the mapping.
391 */
392 decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
393 if (initrd_len) {
394 unsigned long check_base;
395
396 check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
397 decrypted_base = max(decrypted_base, check_base);
398 }
399 decrypted_base <<= PGDIR_SHIFT;
400
401 /* Add encrypted kernel (identity) mappings */
402 ppd.paddr = kernel_start;
403 ppd.vaddr = kernel_start;
404 ppd.vaddr_end = kernel_end;
405 sme_map_range_encrypted(&ppd);
406
407 /* Add decrypted, write-protected kernel (non-identity) mappings */
408 ppd.paddr = kernel_start;
409 ppd.vaddr = kernel_start + decrypted_base;
410 ppd.vaddr_end = kernel_end + decrypted_base;
411 sme_map_range_decrypted_wp(&ppd);
412
413 if (initrd_len) {
414 /* Add encrypted initrd (identity) mappings */
415 ppd.paddr = initrd_start;
416 ppd.vaddr = initrd_start;
417 ppd.vaddr_end = initrd_end;
418 sme_map_range_encrypted(&ppd);
419 /*
420 * Add decrypted, write-protected initrd (non-identity) mappings
421 */
422 ppd.paddr = initrd_start;
423 ppd.vaddr = initrd_start + decrypted_base;
424 ppd.vaddr_end = initrd_end + decrypted_base;
425 sme_map_range_decrypted_wp(&ppd);
426 }
427
428 /* Add decrypted workarea mappings to both kernel mappings */
429 ppd.paddr = workarea_start;
430 ppd.vaddr = workarea_start;
431 ppd.vaddr_end = workarea_end;
432 sme_map_range_decrypted(&ppd);
433
434 ppd.paddr = workarea_start;
435 ppd.vaddr = workarea_start + decrypted_base;
436 ppd.vaddr_end = workarea_end + decrypted_base;
437 sme_map_range_decrypted(&ppd);
438
439 /* Perform the encryption */
440 sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
441 kernel_len, workarea_start, (unsigned long)ppd.pgd);
442
443 if (initrd_len)
444 sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
445 initrd_len, workarea_start,
446 (unsigned long)ppd.pgd);
447
448 /*
449 * At this point we are running encrypted. Remove the mappings for
450 * the decrypted areas - all that is needed for this is to remove
451 * the PGD entry/entries.
452 */
453 ppd.vaddr = kernel_start + decrypted_base;
454 ppd.vaddr_end = kernel_end + decrypted_base;
455 sme_clear_pgd(&ppd);
456
457 if (initrd_len) {
458 ppd.vaddr = initrd_start + decrypted_base;
459 ppd.vaddr_end = initrd_end + decrypted_base;
460 sme_clear_pgd(&ppd);
461 }
462
463 ppd.vaddr = workarea_start + decrypted_base;
464 ppd.vaddr_end = workarea_end + decrypted_base;
465 sme_clear_pgd(&ppd);
466
467 /* Flush the TLB - no globals so cr3 is enough */
468 native_write_cr3(__native_read_cr3());
469}
470
471void __init sme_enable(struct boot_params *bp)
472{
473 const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
474 unsigned int eax, ebx, ecx, edx;
475 unsigned long feature_mask;
476 bool active_by_default;
477 unsigned long me_mask;
478 char buffer[16];
479 u64 msr;
480
481 /* Check for the SME/SEV support leaf */
482 eax = 0x80000000;
483 ecx = 0;
484 native_cpuid(&eax, &ebx, &ecx, &edx);
485 if (eax < 0x8000001f)
486 return;
487
488#define AMD_SME_BIT BIT(0)
489#define AMD_SEV_BIT BIT(1)
490 /*
491 * Set the feature mask (SME or SEV) based on whether we are
492 * running under a hypervisor.
493 */
494 eax = 1;
495 ecx = 0;
496 native_cpuid(&eax, &ebx, &ecx, &edx);
497 feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
498
499 /*
500 * Check for the SME/SEV feature:
501 * CPUID Fn8000_001F[EAX]
502 * - Bit 0 - Secure Memory Encryption support
503 * - Bit 1 - Secure Encrypted Virtualization support
504 * CPUID Fn8000_001F[EBX]
505 * - Bits 5:0 - Pagetable bit position used to indicate encryption
506 */
507 eax = 0x8000001f;
508 ecx = 0;
509 native_cpuid(&eax, &ebx, &ecx, &edx);
510 if (!(eax & feature_mask))
511 return;
512
513 me_mask = 1UL << (ebx & 0x3f);
514
515 /* Check if memory encryption is enabled */
516 if (feature_mask == AMD_SME_BIT) {
517 /* For SME, check the SYSCFG MSR */
518 msr = __rdmsr(MSR_K8_SYSCFG);
519 if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
520 return;
521 } else {
522 /* For SEV, check the SEV MSR */
523 msr = __rdmsr(MSR_AMD64_SEV);
524 if (!(msr & MSR_AMD64_SEV_ENABLED))
525 return;
526
527 /* SEV state cannot be controlled by a command line option */
528 sme_me_mask = me_mask;
529 sev_enabled = true;
530 return;
531 }
532
533 /*
534 * Fixups have not been applied to phys_base yet and we're running
535 * identity mapped, so we must obtain the address to the SME command
536 * line argument data using rip-relative addressing.
537 */
538 asm ("lea sme_cmdline_arg(%%rip), %0"
539 : "=r" (cmdline_arg)
540 : "p" (sme_cmdline_arg));
541 asm ("lea sme_cmdline_on(%%rip), %0"
542 : "=r" (cmdline_on)
543 : "p" (sme_cmdline_on));
544 asm ("lea sme_cmdline_off(%%rip), %0"
545 : "=r" (cmdline_off)
546 : "p" (sme_cmdline_off));
547
548 if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
549 active_by_default = true;
550 else
551 active_by_default = false;
552
553 cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
554 ((u64)bp->ext_cmd_line_ptr << 32));
555
556 cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
557
558 if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
559 sme_me_mask = me_mask;
560 else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
561 sme_me_mask = 0;
562 else
563 sme_me_mask = active_by_default ? me_mask : 0;
564}
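sme_enable() above, now the sole user of the three __initdata command-line strings that moved into this file, probes the hardware with CPUID leaf 0x8000001F and then an MSR read. A hedged userspace sketch of the CPUID half only; the MSR checks (SYSCFG for SME, the SEV status MSR for SEV) need ring 0 and are merely noted in a comment:

/* Sketch only: report SME/SEV support and the encryption-bit position. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf not implemented */

	printf("SME supported: %u\n", eax & 1);
	printf("SEV supported: %u\n", (eax >> 1) & 1);
	printf("C-bit position: %u\n", ebx & 0x3f);
	/* sme_enable() would now read MSR_K8_SYSCFG or MSR_AMD64_SEV */
	return 0;
}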
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index aca6295350f3..e8a4a09e20f1 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -60,17 +60,6 @@ void memory_present(int nid, unsigned long start, unsigned long end)
60 } 60 }
61 printk(KERN_CONT "\n"); 61 printk(KERN_CONT "\n");
62} 62}
63
64unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
65 unsigned long end_pfn)
66{
67 unsigned long nr_pages = end_pfn - start_pfn;
68
69 if (!nr_pages)
70 return 0;
71
72 return (nr_pages + 1) * sizeof(struct page);
73}
74#endif 63#endif
75 64
76extern unsigned long highend_pfn, highstart_pfn; 65extern unsigned long highend_pfn, highstart_pfn;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7f1a51399674..e055d1a06699 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
157 unsigned long sp = current_stack_pointer; 157 unsigned long sp = current_stack_pointer;
158 pgd_t *pgd = pgd_offset(mm, sp); 158 pgd_t *pgd = pgd_offset(mm, sp);
159 159
160 if (CONFIG_PGTABLE_LEVELS > 4) { 160 if (pgtable_l5_enabled) {
161 if (unlikely(pgd_none(*pgd))) { 161 if (unlikely(pgd_none(*pgd))) {
162 pgd_t *pgd_ref = pgd_offset_k(sp); 162 pgd_t *pgd_ref = pgd_offset_k(sp);
163 163
@@ -613,7 +613,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
613{ 613{
614 int cpu; 614 int cpu;
615 615
616 struct flush_tlb_info info = { 616 struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
617 .mm = mm, 617 .mm = mm,
618 }; 618 };
619 619
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index f9cfbc0d1f33..7f443bd1411d 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -27,6 +27,7 @@
27#include <linux/ioport.h> 27#include <linux/ioport.h>
28#include <linux/mc146818rtc.h> 28#include <linux/mc146818rtc.h>
29#include <linux/efi.h> 29#include <linux/efi.h>
30#include <linux/export.h>
30#include <linux/uaccess.h> 31#include <linux/uaccess.h>
31#include <linux/io.h> 32#include <linux/io.h>
32#include <linux/reboot.h> 33#include <linux/reboot.h>
@@ -190,7 +191,8 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
190 early_code_mapping_set_exec(0); 191 early_code_mapping_set_exec(0);
191} 192}
192 193
193static pgd_t *efi_pgd; 194pgd_t *efi_pgd;
195EXPORT_SYMBOL_GPL(efi_pgd);
194 196
195/* 197/*
196 * We need our own copy of the higher levels of the page tables 198 * We need our own copy of the higher levels of the page tables
@@ -225,7 +227,7 @@ int __init efi_alloc_page_tables(void)
225 227
226 pud = pud_alloc(&init_mm, p4d, EFI_VA_END); 228 pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
227 if (!pud) { 229 if (!pud) {
228 if (CONFIG_PGTABLE_LEVELS > 4) 230 if (pgtable_l5_enabled)
229 free_page((unsigned long) pgd_page_vaddr(*pgd)); 231 free_page((unsigned long) pgd_page_vaddr(*pgd));
230 free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); 232 free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
231 return -ENOMEM; 233 return -ENOMEM;
@@ -255,8 +257,8 @@ void efi_sync_low_kernel_mappings(void)
255 * only span a single PGD entry and that the entry also maps 257 * only span a single PGD entry and that the entry also maps
256 * other important kernel regions. 258 * other important kernel regions.
257 */ 259 */
258 BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); 260 MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
259 BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != 261 MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
260 (EFI_VA_END & PGDIR_MASK)); 262 (EFI_VA_END & PGDIR_MASK));
261 263
262 pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); 264 pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
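The efi_64.c assertions change because pgd_index() now expands through a PGDIR_SHIFT that depends on pgtable_l5_enabled, so the expressions no longer fold to compile-time constants. MAYBE_BUILD_BUG_ON() keeps the static check whenever the compiler can still evaluate the condition and otherwise degrades to a runtime BUG_ON(). Roughly, from memory rather than a verbatim copy of the kernel header:

#define MAYBE_BUILD_BUG_ON(cond)			\
	do {						\
		if (__builtin_constant_p((cond)))	\
			BUILD_BUG_ON(cond);		\
		else					\
			BUG_ON(cond);			\
	} while (0)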
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index fb1df9488e98..2ebdf31d9996 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -199,6 +199,12 @@ void __init x86_intel_mid_early_setup(void)
199 199
200 legacy_pic = &null_legacy_pic; 200 legacy_pic = &null_legacy_pic;
201 201
202 /*
 203 * Do nothing for now, as everything needed is done in
204 * x86_intel_mid_early_setup() below.
205 */
206 x86_init.acpi.reduced_hw_early_init = x86_init_noop;
207
202 pm_power_off = intel_mid_power_off; 208 pm_power_off = intel_mid_power_off;
203 machine_ops.emergency_restart = intel_mid_reboot; 209 machine_ops.emergency_restart = intel_mid_reboot;
204 210
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 0ef5e5204968..74a532989308 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -50,7 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
50{ 50{
51 pmd_t *pmd; 51 pmd_t *pmd;
52 pud_t *pud; 52 pud_t *pud;
53 p4d_t *p4d; 53 p4d_t *p4d = NULL;
54 54
55 /* 55 /*
56 * The new mapping only has to cover the page containing the image 56 * The new mapping only has to cover the page containing the image
@@ -66,7 +66,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
66 * tables used by the image kernel. 66 * tables used by the image kernel.
67 */ 67 */
68 68
69 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 69 if (pgtable_l5_enabled) {
70 p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); 70 p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
71 if (!p4d) 71 if (!p4d)
72 return -ENOMEM; 72 return -ENOMEM;
@@ -84,7 +84,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
84 __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); 84 __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
85 set_pud(pud + pud_index(restore_jump_address), 85 set_pud(pud + pud_index(restore_jump_address),
86 __pud(__pa(pmd) | _KERNPG_TABLE)); 86 __pud(__pa(pmd) | _KERNPG_TABLE));
87 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 87 if (p4d) {
88 set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); 88 set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE));
89 set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); 89 set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE));
90 } else { 90 } else {
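The hibernate hunks use a small but common early-boot pattern: allocate the extra page-table level only when it exists, then key the later wiring off the pointer instead of re-testing the feature. A toy, self-contained illustration of just that control flow (p4d_t, get_safe_page() and the real linking are kernel-side and only imitated here):

#include <stdbool.h>
#include <stdlib.h>

struct entry { struct entry *next; };	/* stand-in for a page-table level */

static bool l5_enabled;			/* stand-in for pgtable_l5_enabled */

static int link_top_level(struct entry *pgd, struct entry *pud)
{
	struct entry *p4d = NULL;

	if (l5_enabled) {		/* allocate the level only if it exists */
		p4d = calloc(1, sizeof(*p4d));
		if (!p4d)
			return -1;
	}

	if (p4d) {			/* five levels: pgd -> p4d -> pud */
		p4d->next = pud;
		pgd->next = p4d;
	} else {			/* four levels: pgd -> pud */
		pgd->next = pud;
	}
	return 0;
}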
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index f605825a04ab..c1f98f32c45f 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -18,9 +18,6 @@ config XEN_PV
18 bool "Xen PV guest support" 18 bool "Xen PV guest support"
19 default y 19 default y
20 depends on XEN 20 depends on XEN
21 # XEN_PV is not ready to work with 5-level paging.
22 # Changes to hypervisor are also required.
23 depends on !X86_5LEVEL
24 select XEN_HAVE_PVMMU 21 select XEN_HAVE_PVMMU
25 select XEN_HAVE_VPMU 22 select XEN_HAVE_VPMU
26 help 23 help
@@ -79,6 +76,4 @@ config XEN_DEBUG_FS
79config XEN_PVH 76config XEN_PVH
80 bool "Support for running as a PVH guest" 77 bool "Support for running as a PVH guest"
81 depends on XEN && XEN_PVHVM && ACPI 78 depends on XEN && XEN_PVHVM && ACPI
82 # Pre-built page tables are not ready to handle 5-level paging.
83 depends on !X86_5LEVEL
84 def_bool n 79 def_bool n
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 436c4f003e17..aa1c6a6831a9 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -6,6 +6,7 @@
6#include <asm/io_apic.h> 6#include <asm/io_apic.h>
7#include <asm/hypervisor.h> 7#include <asm/hypervisor.h>
8#include <asm/e820/api.h> 8#include <asm/e820/api.h>
9#include <asm/x86_init.h>
9 10
10#include <asm/xen/interface.h> 11#include <asm/xen/interface.h>
11#include <asm/xen/hypercall.h> 12#include <asm/xen/hypercall.h>
@@ -16,15 +17,20 @@
16/* 17/*
17 * PVH variables. 18 * PVH variables.
18 * 19 *
 19 * xen_pvh and pvh_bootparams need to live in data segment since they 20 * xen_pvh, pvh_bootparams and pvh_start_info need to live in data segment
20 * are used after startup_{32|64}, which clear .bss, are invoked. 21 * since they are used after startup_{32|64}, which clear .bss, are invoked.
21 */ 22 */
22bool xen_pvh __attribute__((section(".data"))) = 0; 23bool xen_pvh __attribute__((section(".data"))) = 0;
23struct boot_params pvh_bootparams __attribute__((section(".data"))); 24struct boot_params pvh_bootparams __attribute__((section(".data")));
25struct hvm_start_info pvh_start_info __attribute__((section(".data")));
24 26
25struct hvm_start_info pvh_start_info;
26unsigned int pvh_start_info_sz = sizeof(pvh_start_info); 27unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
27 28
29static u64 pvh_get_root_pointer(void)
30{
31 return pvh_start_info.rsdp_paddr;
32}
33
28static void __init init_pvh_bootparams(void) 34static void __init init_pvh_bootparams(void)
29{ 35{
30 struct xen_memory_map memmap; 36 struct xen_memory_map memmap;
@@ -71,6 +77,8 @@ static void __init init_pvh_bootparams(void)
71 */ 77 */
72 pvh_bootparams.hdr.version = 0x212; 78 pvh_bootparams.hdr.version = 0x212;
73 pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ 79 pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
80
81 x86_init.acpi.get_root_pointer = pvh_get_root_pointer;
74} 82}
75 83
76/* 84/*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index aae88fec9941..d20763472920 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -538,6 +538,22 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
538 538
539 xen_mc_issue(PARAVIRT_LAZY_MMU); 539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540} 540}
541
542#if CONFIG_PGTABLE_LEVELS >= 5
543__visible p4dval_t xen_p4d_val(p4d_t p4d)
544{
545 return pte_mfn_to_pfn(p4d.p4d);
546}
547PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);
548
549__visible p4d_t xen_make_p4d(p4dval_t p4d)
550{
551 p4d = pte_pfn_to_mfn(p4d);
552
553 return native_make_p4d(p4d);
554}
555PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
556#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
541#endif /* CONFIG_X86_64 */ 557#endif /* CONFIG_X86_64 */
542 558
543static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, 559static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
@@ -2411,6 +2427,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2411 2427
2412 .alloc_pud = xen_alloc_pmd_init, 2428 .alloc_pud = xen_alloc_pmd_init,
2413 .release_pud = xen_release_pmd_init, 2429 .release_pud = xen_release_pmd_init,
2430
2431#if CONFIG_PGTABLE_LEVELS >= 5
2432 .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
2433 .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
2434#endif
2414#endif /* CONFIG_X86_64 */ 2435#endif /* CONFIG_X86_64 */
2415 2436
2416 .activate_mm = xen_activate_mm, 2437 .activate_mm = xen_activate_mm,
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 3bb46cb24a99..7ca41bf023c9 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -189,12 +189,15 @@ early_param("acpi_rsdp", setup_acpi_rsdp);
189 189
190acpi_physical_address __init acpi_os_get_root_pointer(void) 190acpi_physical_address __init acpi_os_get_root_pointer(void)
191{ 191{
192 acpi_physical_address pa = 0; 192 acpi_physical_address pa;
193 193
194#ifdef CONFIG_KEXEC 194#ifdef CONFIG_KEXEC
195 if (acpi_rsdp) 195 if (acpi_rsdp)
196 return acpi_rsdp; 196 return acpi_rsdp;
197#endif 197#endif
198 pa = acpi_arch_get_root_pointer();
199 if (pa)
200 return pa;
198 201
199 if (efi_enabled(EFI_CONFIG_TABLES)) { 202 if (efi_enabled(EFI_CONFIG_TABLES)) {
200 if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) 203 if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index dfbd9d990637..9c2e0708eb82 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -8,6 +8,7 @@
8#define P4D_SHIFT PGDIR_SHIFT 8#define P4D_SHIFT PGDIR_SHIFT
9#define P4D_SIZE PGDIR_SIZE 9#define P4D_SIZE PGDIR_SIZE
10#define P4D_MASK PGDIR_MASK 10#define P4D_MASK PGDIR_MASK
11#define MAX_PTRS_PER_P4D 1
11#define PTRS_PER_P4D 1 12#define PTRS_PER_P4D 1
12 13
13#define p4d_t pgd_t 14#define p4d_t pgd_t
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index 8f22f55de17a..1a29b2a0282b 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -8,10 +8,11 @@
8 8
9typedef struct { pgd_t pgd; } p4d_t; 9typedef struct { pgd_t pgd; } p4d_t;
10 10
11#define P4D_SHIFT PGDIR_SHIFT 11#define P4D_SHIFT PGDIR_SHIFT
12#define PTRS_PER_P4D 1 12#define MAX_PTRS_PER_P4D 1
13#define P4D_SIZE (1UL << P4D_SHIFT) 13#define PTRS_PER_P4D 1
14#define P4D_MASK (~(P4D_SIZE-1)) 14#define P4D_SIZE (1UL << P4D_SHIFT)
15#define P4D_MASK (~(P4D_SIZE-1))
15 16
16/* 17/*
17 * The "pgd_xxx()" functions here are trivial for a folded two-level 18 * The "pgd_xxx()" functions here are trivial for a folded two-level
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 968173ec2726..15bfb15c2fa5 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -623,6 +623,13 @@ bool acpi_gtdt_c3stop(int type);
623int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); 623int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
624#endif 624#endif
625 625
626#ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER
627static inline u64 acpi_arch_get_root_pointer(void)
628{
629 return 0;
630}
631#endif
632
626#else /* !CONFIG_ACPI */ 633#else /* !CONFIG_ACPI */
627 634
628#define acpi_disabled 1 635#define acpi_disabled 1
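Taken together with the enlighten_pvh.c and osl.c hunks, this default gives ACPI a layered lookup: acpi_os_get_root_pointer() first asks acpi_arch_get_root_pointer(), whose generic fallback above returns 0; on x86 the arch version is expected to route through the new x86_init.acpi.get_root_pointer hook, which Xen PVH points at pvh_get_root_pointer() so the RSDP address comes from the hvm_start_info handed over by Xen. A standalone model of that override chain; the names are illustrative and the real glue lives in the arch headers:

#include <stdint.h>
#include <stdio.h>

static uint64_t default_root_pointer(void) { return 0; }	/* "nothing to offer" */
static uint64_t pvh_root_pointer(void)     { return 0xf1234; }	/* would be hvm_start_info.rsdp_paddr */

static uint64_t (*arch_get_root_pointer)(void) = default_root_pointer;

static uint64_t os_get_root_pointer(void)
{
	uint64_t pa = arch_get_root_pointer();

	if (pa)
		return pa;
	return 0;	/* the real function falls back to the EFI/BIOS scan here */
}

int main(void)
{
	arch_get_root_pointer = pvh_root_pointer;	/* what the PVH setup effectively does */
	printf("RSDP at %#lx\n", (unsigned long)os_get_root_pointer());
	return 0;
}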
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index adc13474a53b..d6459bd1376d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -18,7 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE];
18extern pte_t kasan_zero_pte[PTRS_PER_PTE]; 18extern pte_t kasan_zero_pte[PTRS_PER_PTE];
19extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; 19extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
20extern pud_t kasan_zero_pud[PTRS_PER_PUD]; 20extern pud_t kasan_zero_pud[PTRS_PER_PUD];
21extern p4d_t kasan_zero_p4d[PTRS_PER_P4D]; 21extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D];
22 22
23void kasan_populate_zero_shadow(const void *shadow_start, 23void kasan_populate_zero_shadow(const void *shadow_start,
24 const void *shadow_end); 24 const void *shadow_end);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7522a6987595..a2db4576e499 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -816,10 +816,6 @@ int local_memory_node(int node_id);
816static inline int local_memory_node(int node_id) { return node_id; }; 816static inline int local_memory_node(int node_id) { return node_id; };
817#endif 817#endif
818 818
819#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
820unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
821#endif
822
823/* 819/*
824 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 820 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
825 */ 821 */
@@ -1289,7 +1285,6 @@ struct mminit_pfnnid_cache {
1289#endif 1285#endif
1290 1286
1291void memory_present(int nid, unsigned long start, unsigned long end); 1287void memory_present(int nid, unsigned long start, unsigned long end);
1292unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
1293 1288
1294/* 1289/*
1295 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we 1290 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index 554e4c0f23a2..f436246ccc79 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -31,7 +31,7 @@
31unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; 31unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
32 32
33#if CONFIG_PGTABLE_LEVELS > 4 33#if CONFIG_PGTABLE_LEVELS > 4
34p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss; 34p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
35#endif 35#endif
36#if CONFIG_PGTABLE_LEVELS > 3 36#if CONFIG_PGTABLE_LEVELS > 3
37pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; 37pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
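kasan_zero_p4d switches to MAX_PTRS_PER_P4D because, with boot-time level switching, PTRS_PER_P4D becomes a runtime value (1 with 4-level paging, 512 with 5-level), and a statically sized object needs a compile-time upper bound. A two-line illustration of the constraint, using an assumed variable name:

extern unsigned int ptrs_per_p4d;	/* runtime value, illustrative name */

/* unsigned long table[ptrs_per_p4d];   <- error: variably modified at file scope */
unsigned long table[512];		/* sized by the fixed MAX_PTRS_PER_P4D-style bound */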
diff --git a/mm/sparse.c b/mm/sparse.c
index 7af5e7a92528..79b26f98d793 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -236,28 +236,6 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
236} 236}
237 237
238/* 238/*
239 * Only used by the i386 NUMA architecures, but relatively
240 * generic code.
241 */
242unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
243 unsigned long end_pfn)
244{
245 unsigned long pfn;
246 unsigned long nr_pages = 0;
247
248 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
249 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
250 if (nid != early_pfn_to_nid(pfn))
251 continue;
252
253 if (pfn_present(pfn))
254 nr_pages += PAGES_PER_SECTION;
255 }
256
257 return nr_pages * sizeof(struct page);
258}
259
260/*
261 * Subtle, we encode the real pfn into the mem_map such that 239 * Subtle, we encode the real pfn into the mem_map such that
262 * the identity pfn - section_mem_map will return the actual 240 * the identity pfn - section_mem_map will return the actual
263 * physical page frame number. 241 * physical page frame number.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c3013505c305..b7f61cd1c709 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -84,18 +84,19 @@
84 * This is made more complicated by various memory models and PAE. 84 * This is made more complicated by various memory models and PAE.
85 */ 85 */
86 86
87#ifndef MAX_PHYSMEM_BITS 87#ifndef MAX_POSSIBLE_PHYSMEM_BITS
88#ifdef CONFIG_HIGHMEM64G 88#ifdef MAX_PHYSMEM_BITS
89#define MAX_PHYSMEM_BITS 36 89#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
90#else /* !CONFIG_HIGHMEM64G */ 90#else
91/* 91/*
92 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just 92 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
93 * be PAGE_SHIFT 93 * be PAGE_SHIFT
94 */ 94 */
95#define MAX_PHYSMEM_BITS BITS_PER_LONG 95#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
96#endif 96#endif
97#endif 97#endif
98#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 98
99#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
99 100
100/* 101/*
101 * Memory for allocating for handle keeps object position by 102 * Memory for allocating for handle keeps object position by
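The zsmalloc change exists because the handle layout, _PFN_BITS plus the object-index bits packed into one unsigned long, must be settled at compile time, while MAX_PHYSMEM_BITS on x86-64 now depends on whether 5-level paging is active (46 bits versus 52). MAX_POSSIBLE_PHYSMEM_BITS supplies the worst-case bound. A small worked example of the bit budget, assuming 52 physical address bits and 4 KiB pages; the exact index/tag split is zsmalloc-internal and only summarized:

#include <stdio.h>

int main(void)
{
	int max_possible_physmem_bits = 52;	/* compile-time upper bound */
	int page_shift = 12;			/* 4 KiB pages */
	int bits_per_long = 64;

	int pfn_bits = max_possible_physmem_bits - page_shift;	/* 40 */
	int obj_bits = bits_per_long - pfn_bits;		/* 24 left for index + tag */

	printf("_PFN_BITS=%d, bits left for the object index and tag=%d\n",
	       pfn_bits, obj_bits);
	return 0;
}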